; RUN: llc %s -o - -enable-shrink-wrap=true -pass-remarks-output=%t | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
; RUN: cat %t | FileCheck %s --check-prefix=REMARKS
; RUN: llc %s -o - -enable-shrink-wrap=false | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
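;
; The first RUN line also writes the serialized optimization remarks produced
; during codegen to %t (via -pass-remarks-output); the second RUN line checks
; that file against the REMARKS prefix.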
;
; Note: Lots of tests use inline asm instead of regular calls.
; This gives better control over what the register allocator will do.
; Otherwise, we may have spills right in the entry block, defeating
; shrink-wrapping. Moreover, some of the inline asm statements (nop)
; are here to ensure that the related paths do not end up as critical
; edges.
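;
; For reference, the clobber idiom used throughout is, as a minimal sketch:
;   %val = tail call i32 asm sideeffect "movl $$1, $0", "=r,~{ebx}"()
; The "~{ebx}" clobber turns rbx into a used callee-saved register, so any
; path that executes the asm needs a prologue that saves it.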
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "x86_64-apple-macosx"

; Initial motivating example: Simple diamond with a call just on one side.
; CHECK-LABEL: foo:
;
; Compare the arguments and jump to exit.
; No prologue needed.
; ENABLE: movl %edi, [[ARG0CPY:%e[a-z]+]]
; ENABLE-NEXT: cmpl %esi, %edi
; ENABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
;
; Prologue code.
; (What we push does not matter. It should be some random scratch register.)
; CHECK: pushq
;
; Compare the arguments and jump to exit.
; After the prologue is set.
; DISABLE: movl %edi, [[ARG0CPY:%e[a-z]+]]
; DISABLE-NEXT: cmpl %esi, %edi
; DISABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
;
; Store %a in the alloca.
; CHECK: movl [[ARG0CPY]], 4(%rsp)
; Set the alloca address in the second argument.
; CHECK-NEXT: leaq 4(%rsp), %rsi
; Set the first argument to zero.
; CHECK-NEXT: xorl %edi, %edi
; CHECK-NEXT: callq _doSomething
;
; With shrink-wrapping, the epilogue is just after the call.
; ENABLE-NEXT: addq $8, %rsp
;
; CHECK: [[EXIT_LABEL]]:
;
; Without shrink-wrapping, the epilogue is in the exit block.
; Epilogue code. (What we pop does not matter.)
; DISABLE-NEXT: popq
;
; CHECK-NEXT: retq
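;
; In rough outline (a sketch, not literal output) the two layouts compared
; above are
;   with shrink-wrapping     cmpl/jge, pushq, callq, addq $8 to rsp, retq
;   without shrink-wrapping  pushq, cmpl/jge, callq, popq, retq
; i.e. the prologue/epilogue only runs on the path that actually calls.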
define i32 @foo(i32 %a, i32 %b) {
  %tmp = alloca i32, align 4
  %tmp2 = icmp slt i32 %a, %b
  br i1 %tmp2, label %true, label %false

true:
  store i32 %a, i32* %tmp, align 4
  %tmp4 = call i32 @doSomething(i32 0, i32* %tmp)
  br label %false

false:
  %tmp.0 = phi i32 [ %tmp4, %true ], [ %a, %0 ]
  ret i32 %tmp.0
}

; Function Attrs: optsize
declare i32 @doSomething(i32, i32*)

; Check that we do not perform the restore inside the loop while the save
; is outside.
; CHECK-LABEL: freqSaveAndRestoreOutsideLoop:
;
; Shrink-wrapping allows us to skip the prologue in the else case.
; ENABLE: testl %edi, %edi
; ENABLE: je [[ELSE_LABEL:LBB[0-9_]+]]
;
; Prologue code.
; Make sure we save the CSR used in the inline asm: rbx.
; CHECK: pushq %rbx
;
; DISABLE: testl %edi, %edi
; DISABLE: je [[ELSE_LABEL:LBB[0-9_]+]]
;
; CHECK: xorl [[SUM:%eax]], [[SUM]]
; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
;
; Next BB.
; CHECK: [[LOOP:LBB[0-9_]+]]: ## %for.body
; CHECK: movl $1, [[TMP:%e[a-z]+]]
; CHECK: addl [[TMP]], [[SUM]]
; CHECK-NEXT: decl [[IV]]
; CHECK-NEXT: jne [[LOOP]]
;
; Next BB.
; SUM << 3.
; CHECK: shll $3, [[SUM]]
;
; DISABLE: popq
; DISABLE: retq
;
; DISABLE: [[ELSE_LABEL]]: ## %if.else
; Shift second argument by one in returned register.
; DISABLE: movl %esi, %eax
; DISABLE: addl %esi, %eax
;
; Epilogue code.
; CHECK-DAG: popq %rbx
; CHECK: retq
;
; ENABLE: [[ELSE_LABEL]]: ## %if.else
; Shift second argument by one and store into returned register.
; ENABLE: movl %esi, %eax
; ENABLE: addl %esi, %eax
; ENABLE-NEXT: retq
define i32 @freqSaveAndRestoreOutsideLoop(i32 %cond, i32 %N) {
entry:
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.else, label %for.preheader

for.preheader:
  tail call void asm "nop", ""()
  br label %for.body

for.body:                                         ; preds = %for.preheader, %for.body
  %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.preheader ]
  %sum.04 = phi i32 [ %add, %for.body ], [ 0, %for.preheader ]
  %call = tail call i32 asm sideeffect "movl $$1, $0", "=r,~{ebx}"()
  %add = add nsw i32 %call, %sum.04
  %inc = add nuw nsw i32 %i.05, 1
  %exitcond = icmp eq i32 %inc, 10
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  %shl = shl i32 %add, 3
  br label %if.end

if.else:                                          ; preds = %entry
  %mul = shl nsw i32 %N, 1
  br label %if.end

if.end:                                           ; preds = %if.else, %for.end
  %sum.1 = phi i32 [ %shl, %for.end ], [ %mul, %if.else ]
  ret i32 %sum.1
}

declare i32 @something(...)

; Check that we do not perform the shrink-wrapping inside the loop even
; though that would be legal. The cost model must prevent that.
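; (Shrink-wrapping the save/restore into the loop would execute the
; push/pop of rbx on every iteration; keeping them outside pays that
; cost only once.)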
; CHECK-LABEL: freqSaveAndRestoreOutsideLoop2:
; Prologue code.
; Make sure we save the CSR used in the inline asm: rbx.
; CHECK: pushq %rbx
; CHECK: nop
; CHECK: xorl [[SUM:%e[a-z]+]], [[SUM]]
; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
; Next BB.
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ## %for.body
; CHECK: movl $1, [[TMP:%e[a-z]+]]
; CHECK: addl [[TMP]], [[SUM]]
; CHECK-NEXT: decl [[IV]]
; CHECK-NEXT: jne [[LOOP_LABEL]]
; Next BB.
; CHECK: ## %for.exit
; CHECK: nop
; CHECK: popq %rbx
; CHECK-NEXT: retq
define i32 @freqSaveAndRestoreOutsideLoop2(i32 %cond) {
entry:
  br label %for.preheader

for.preheader:
  tail call void asm "nop", ""()
  br label %for.body

for.body:                                         ; preds = %for.preheader, %for.body
  %i.04 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]
  %sum.03 = phi i32 [ 0, %for.preheader ], [ %add, %for.body ]
  %call = tail call i32 asm sideeffect "movl $$1, $0", "=r,~{ebx}"()
  %add = add nsw i32 %call, %sum.03
  %inc = add nuw nsw i32 %i.04, 1
  %exitcond = icmp eq i32 %inc, 10
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:
  tail call void asm "nop", ""()
  br label %for.end

for.end:                                          ; preds = %for.exit
  ret i32 %add
}

; Check, with a more complex case, that we do not have the save within the
; loop while the restore is outside.
; CHECK-LABEL: loopInfoSaveOutsideLoop:
;
; ENABLE: testl %edi, %edi
; ENABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
;
; Prologue code.
; Make sure we save the CSR used in the inline asm: rbx.
; CHECK: pushq %rbx
;
; DISABLE: testl %edi, %edi
; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
;
; CHECK: nop
; CHECK: xorl [[SUM:%eax]], [[SUM]]
; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ## %for.body
; CHECK: movl $1, [[TMP:%e[a-z]+]]
; CHECK: addl [[TMP]], [[SUM]]
; CHECK-NEXT: decl [[IV]]
; CHECK-NEXT: jne [[LOOP_LABEL]]
; Next BB.
; CHECK: nop
; CHECK: shll $3, [[SUM]]
;
; DISABLE: popq
; DISABLE: retq
;
; DISABLE: [[ELSE_LABEL]]: ## %if.else
; Shift second argument by one in returned register.
; DISABLE: movl %esi, %eax
; DISABLE: addl %esi, %eax
;
; Epilogue code.
; CHECK-DAG: popq %rbx
; CHECK: retq
;
; ENABLE: [[ELSE_LABEL]]: ## %if.else
; Shift second argument by one and store into returned register.
; ENABLE: movl %esi, %eax
; ENABLE: addl %esi, %eax
; ENABLE-NEXT: retq
define i32 @loopInfoSaveOutsideLoop(i32 %cond, i32 %N) {
entry:
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.else, label %for.preheader

for.preheader:
  tail call void asm "nop", ""()
  br label %for.body

for.body:                                         ; preds = %for.preheader, %for.body
  %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.preheader ]
  %sum.04 = phi i32 [ %add, %for.body ], [ 0, %for.preheader ]
  %call = tail call i32 asm sideeffect "movl $$1, $0", "=r,~{ebx}"()
  %add = add nsw i32 %call, %sum.04
  %inc = add nuw nsw i32 %i.05, 1
  %exitcond = icmp eq i32 %inc, 10
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  tail call void asm "nop", "~{ebx}"()
  %shl = shl i32 %add, 3
  br label %if.end

if.else:                                          ; preds = %entry
  %mul = shl nsw i32 %N, 1
  br label %if.end

if.end:                                           ; preds = %if.else, %for.end
  %sum.1 = phi i32 [ %shl, %for.end ], [ %mul, %if.else ]
  ret i32 %sum.1
}

; Check, with a more complex case, that we do not have the restore within the
; loop while the save is outside.
; CHECK-LABEL: loopInfoRestoreOutsideLoop:
;
; ENABLE: testl %edi, %edi
; ENABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
;
; Prologue code.
; Make sure we save the CSR used in the inline asm: rbx.
; CHECK: pushq %rbx
;
; DISABLE: testl %edi, %edi
; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
;
; CHECK: nop
; CHECK: xorl [[SUM:%eax]], [[SUM]]
; CHECK-NEXT: movl $10, [[IV:%e[a-z]+]]
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ## %for.body
; CHECK: movl $1, [[TMP:%e[a-z]+]]
; CHECK: addl [[TMP]], [[SUM]]
; CHECK-NEXT: decl [[IV]]
; CHECK-NEXT: jne [[LOOP_LABEL]]
; Next BB.
; CHECK: shll $3, [[SUM]]
;
; DISABLE: popq
; DISABLE: retq
;
; DISABLE: [[ELSE_LABEL]]: ## %if.else
; Shift second argument by one in returned register.
; DISABLE: movl %esi, %eax
; DISABLE: addl %esi, %eax
;
; Epilogue code.
; CHECK-DAG: popq %rbx
; CHECK: retq
;
; ENABLE: [[ELSE_LABEL]]: ## %if.else
; Shift second argument by one and store into returned register.
; ENABLE: movl %esi, %eax
; ENABLE: addl %esi, %eax
; ENABLE-NEXT: retq
define i32 @loopInfoRestoreOutsideLoop(i32 %cond, i32 %N) nounwind {
entry:
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.else, label %if.then

if.then:                                          ; preds = %entry
  tail call void asm "nop", "~{ebx}"()
  br label %for.body

for.body:                                         ; preds = %for.body, %if.then
  %i.05 = phi i32 [ 0, %if.then ], [ %inc, %for.body ]
  %sum.04 = phi i32 [ 0, %if.then ], [ %add, %for.body ]
  %call = tail call i32 asm sideeffect "movl $$1, $0", "=r,~{ebx}"()
  %add = add nsw i32 %call, %sum.04
  %inc = add nuw nsw i32 %i.05, 1
  %exitcond = icmp eq i32 %inc, 10
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  %shl = shl i32 %add, 3
  br label %if.end

if.else:                                          ; preds = %entry
  %mul = shl nsw i32 %N, 1
  br label %if.end

if.end:                                           ; preds = %if.else, %for.end
  %sum.1 = phi i32 [ %shl, %for.end ], [ %mul, %if.else ]
  ret i32 %sum.1
}

; Check that we handle a function with no frame information correctly.
; CHECK-LABEL: emptyFrame:
; CHECK: ## %entry
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: retq
define i32 @emptyFrame() {
entry:
  ret i32 0
}

; Check that we handle inline asm correctly.
; CHECK-LABEL: inlineAsm:
;
; ENABLE: testl %edi, %edi
; ENABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
;
; Prologue code.
; Make sure we save the CSR used in the inline asm: rbx.
; CHECK: pushq %rbx
;
; DISABLE: testl %edi, %edi
; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
;
; CHECK: nop
; CHECK: movl $10, [[IV:%e[a-z]+]]
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ## %for.body
; Inline asm statement.
; CHECK: addl $1, %ebx
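; (The asm statement both uses and clobbers %ebx, so rbx, a callee-saved
; register, must be preserved on every path that reaches the loop; that is
; what forces the pushq %rbx checked earlier.)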
; CHECK: decl [[IV]]
; CHECK-NEXT: jne [[LOOP_LABEL]]
; Next BB.
; CHECK: nop
; CHECK: xorl %eax, %eax
;
; DISABLE: popq
; DISABLE: retq
;
; DISABLE: [[ELSE_LABEL]]: ## %if.else
; Shift second argument by one in returned register.
; DISABLE: movl %esi, %eax
; DISABLE: addl %esi, %eax
;
; Epilogue code.
; CHECK-DAG: popq %rbx
; CHECK: retq
;
; ENABLE: [[ELSE_LABEL]]: ## %if.else
; Shift second argument by one and store into returned register.
; ENABLE: movl %esi, %eax
; ENABLE: addl %esi, %eax
; ENABLE-NEXT: retq
define i32 @inlineAsm(i32 %cond, i32 %N) {
entry:
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.else, label %for.preheader

for.preheader:
  tail call void asm "nop", ""()
  br label %for.body

for.body:                                         ; preds = %for.preheader, %for.body
  %i.03 = phi i32 [ %inc, %for.body ], [ 0, %for.preheader ]
  tail call void asm "addl $$1, %ebx", "~{ebx}"()
  %inc = add nuw nsw i32 %i.03, 1
  %exitcond = icmp eq i32 %inc, 10
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:
  tail call void asm "nop", ""()
  br label %if.end

if.else:                                          ; preds = %entry
  %mul = shl nsw i32 %N, 1
  br label %if.end

if.end:                                           ; preds = %for.exit, %if.else
  %sum.0 = phi i32 [ %mul, %if.else ], [ 0, %for.exit ]
  ret i32 %sum.0
}

; Check that we handle calls to variadic functions correctly.
; CHECK-LABEL: callVariadicFunc:
;
; ENABLE: movl %esi, %eax
; ENABLE: testl %edi, %edi
; ENABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
;
; Prologue code.
; CHECK: pushq
;
; DISABLE: movl %esi, %eax
; DISABLE: testl %edi, %edi
; DISABLE-NEXT: je [[ELSE_LABEL:LBB[0-9_]+]]
;
; Setup of the varargs.
; CHECK: movl %eax, (%rsp)
; CHECK-NEXT: movl %eax, %edi
; CHECK-NEXT: movl %eax, %esi
; CHECK-NEXT: movl %eax, %edx
; CHECK-NEXT: movl %eax, %ecx
; CHECK-NEXT: movl %eax, %r8d
; CHECK-NEXT: movl %eax, %r9d
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: callq _someVariadicFunc
; CHECK-NEXT: shll $3, %eax
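; (Stated as an assumption about the SysV x86-64 calling convention: for a
; variadic call, %al carries the number of vector registers used, and the
; xorl above zeroes it because no floating-point arguments are passed.)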
;
; ENABLE-NEXT: addq $8, %rsp
; ENABLE-NEXT: retq
;
; CHECK: [[ELSE_LABEL]]: ## %if.else
; Shift second argument by one and store into returned register.
; CHECK: addl %eax, %eax
;
; Epilogue code.
; DISABLE-NEXT: popq
; CHECK-NEXT: retq
define i32 @callVariadicFunc(i32 %cond, i32 %N) {
entry:
  %tobool = icmp eq i32 %cond, 0
  br i1 %tobool, label %if.else, label %if.then

if.then:                                          ; preds = %entry
  %call = tail call i32 (i32, ...) @someVariadicFunc(i32 %N, i32 %N, i32 %N, i32 %N, i32 %N, i32 %N, i32 %N)
  %shl = shl i32 %call, 3
  br label %if.end

if.else:                                          ; preds = %entry
  %mul = shl nsw i32 %N, 1
  br label %if.end

if.end:                                           ; preds = %if.else, %if.then
  %sum.0 = phi i32 [ %shl, %if.then ], [ %mul, %if.else ]
  ret i32 %sum.0
}

declare i32 @someVariadicFunc(i32, ...)

; Check that we use LEA so as not to clobber EFLAGS.
%struct.temp_slot = type { %struct.temp_slot*, %struct.rtx_def*, %struct.rtx_def*, i32, i64, %union.tree_node*, %union.tree_node*, i8, i8, i32, i32, i64, i64 }
%union.tree_node = type { %struct.tree_decl }
%struct.tree_decl = type { %struct.tree_common, i8*, i32, i32, %union.tree_node*, i48, %union.anon, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %union.tree_node*, %struct.rtx_def*, %struct.rtx_def*, %union.anon.1, %union.tree_node*, %union.tree_node*, %union.tree_node*, i64, %struct.lang_decl* }
%struct.tree_common = type { %union.tree_node*, %union.tree_node*, i32 }
%union.anon = type { i64 }
%union.anon.1 = type { %struct.function* }
%struct.function = type { %struct.eh_status*, %struct.stmt_status*, %struct.expr_status*, %struct.emit_status*, %struct.varasm_status*, i8*, %union.tree_node*, %struct.function*, i32, i32, i32, i32, %struct.rtx_def*, %struct.ix86_args, %struct.rtx_def*, %struct.rtx_def*, i8*, %struct.initial_value_struct*, i32, %union.tree_node*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %union.tree_node*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, i64, %union.tree_node*, %union.tree_node*, %struct.rtx_def*, %struct.rtx_def*, i32, %struct.rtx_def**, %struct.temp_slot*, i32, i32, i32, %struct.var_refs_queue*, i32, i32, i8*, %union.tree_node*, %struct.rtx_def*, i32, i32, %struct.machine_function*, i32, i32, %struct.language_function*, %struct.rtx_def*, i24 }
%struct.eh_status = type opaque
%struct.stmt_status = type opaque
%struct.expr_status = type { i32, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
%struct.emit_status = type { i32, i32, %struct.rtx_def*, %struct.rtx_def*, %union.tree_node*, %struct.sequence_stack*, i32, i32, i8*, i32, i8*, %union.tree_node**, %struct.rtx_def** }
%struct.sequence_stack = type { %struct.rtx_def*, %struct.rtx_def*, %union.tree_node*, %struct.sequence_stack* }
%struct.varasm_status = type opaque
%struct.ix86_args = type { i32, i32, i32, i32, i32, i32, i32 }
%struct.initial_value_struct = type opaque
%struct.var_refs_queue = type { %struct.rtx_def*, i32, i32, %struct.var_refs_queue* }
%struct.machine_function = type opaque
%struct.language_function = type opaque
%struct.lang_decl = type opaque
%struct.rtx_def = type { i32, [1 x %union.rtunion_def] }
%union.rtunion_def = type { i64 }

declare hidden fastcc %struct.temp_slot* @find_temp_slot_from_address(%struct.rtx_def* readonly)

; CHECK-LABEL: useLEA:
|
[X86] Promote i8 CMOV's (PR40965)
Summary:
@mclow.lists brought up this issue up in IRC, it came up during
implementation of libc++ `std::midpoint()` implementation (D59099)
https://godbolt.org/z/oLrHBP
Currently LLVM X86 backend only promotes i8 CMOV if it came from 2x`trunc`.
This differential proposes to always promote i8 CMOV.
There are several concerns here:
* Is this actually more performant, or is it just the ASM that looks cuter?
* Does this result in partial register stalls?
* What about branch predictor?
# Indeed, performance should be the main point here.
Let's look at a simple microbenchmark: {F8412076}
```
#include "benchmark/benchmark.h"
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <limits>
#include <random>
#include <type_traits>
#include <utility>
#include <vector>
// Future preliminary libc++ code, from Marshall Clow.
namespace std {
template <class _Tp>
__inline _Tp midpoint(_Tp __a, _Tp __b) noexcept {
using _Up = typename std::make_unsigned<typename remove_cv<_Tp>::type>::type;
int __sign = 1;
_Up __m = __a;
_Up __M = __b;
if (__a > __b) {
__sign = -1;
__m = __b;
__M = __a;
}
return __a + __sign * _Tp(_Up(__M - __m) >> 1);
}
} // namespace std
template <typename T>
std::vector<T> getVectorOfRandomNumbers(size_t count) {
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<T> dis(std::numeric_limits<T>::min(),
std::numeric_limits<T>::max());
std::vector<T> v;
v.reserve(count);
std::generate_n(std::back_inserter(v), count,
[&dis, &gen]() { return dis(gen); });
assert(v.size() == count);
return v;
}
struct RandRand {
template <typename T>
static std::pair<std::vector<T>, std::vector<T>> Gen(size_t count) {
return std::make_pair(getVectorOfRandomNumbers<T>(count),
getVectorOfRandomNumbers<T>(count));
}
};
struct ZeroRand {
template <typename T>
static std::pair<std::vector<T>, std::vector<T>> Gen(size_t count) {
return std::make_pair(std::vector<T>(count, T(0)),
getVectorOfRandomNumbers<T>(count));
}
};
template <class T, class Gen>
void BM_StdMidpoint(benchmark::State& state) {
const size_t Length = state.range(0);
const std::pair<std::vector<T>, std::vector<T>> Data =
Gen::template Gen<T>(Length);
const std::vector<T>& a = Data.first;
const std::vector<T>& b = Data.second;
assert(a.size() == Length && b.size() == a.size());
benchmark::ClobberMemory();
benchmark::DoNotOptimize(a);
benchmark::DoNotOptimize(a.data());
benchmark::DoNotOptimize(b);
benchmark::DoNotOptimize(b.data());
for (auto _ : state) {
for (size_t i = 0; i < Length; i++) {
const auto calculated = std::midpoint(a[i], b[i]);
benchmark::DoNotOptimize(calculated);
}
}
state.SetComplexityN(Length);
state.counters["midpoints"] =
benchmark::Counter(Length, benchmark::Counter::kIsIterationInvariant);
state.counters["midpoints/sec"] =
benchmark::Counter(Length, benchmark::Counter::kIsIterationInvariantRate);
const size_t BytesRead = 2 * sizeof(T) * Length;
state.counters["bytes_read/iteration"] =
benchmark::Counter(BytesRead, benchmark::Counter::kDefaults,
benchmark::Counter::OneK::kIs1024);
state.counters["bytes_read/sec"] = benchmark::Counter(
BytesRead, benchmark::Counter::kIsIterationInvariantRate,
benchmark::Counter::OneK::kIs1024);
}
template <typename T>
static void CustomArguments(benchmark::internal::Benchmark* b) {
const size_t L2SizeBytes = 2 * 1024 * 1024;
// What is the largest range we can check to always fit within given L2 cache?
const size_t MaxLen = L2SizeBytes / /*total bufs*/ 2 /
/*maximal elt size*/ sizeof(T) / /*safety margin*/ 2;
b->RangeMultiplier(2)->Range(1, MaxLen)->Complexity(benchmark::oN);
}
// Both of the values are random.
// The comparison is unpredictable.
BENCHMARK_TEMPLATE(BM_StdMidpoint, int32_t, RandRand)
->Apply(CustomArguments<int32_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint32_t, RandRand)
->Apply(CustomArguments<uint32_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, int64_t, RandRand)
->Apply(CustomArguments<int64_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint64_t, RandRand)
->Apply(CustomArguments<uint64_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, int16_t, RandRand)
->Apply(CustomArguments<int16_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint16_t, RandRand)
->Apply(CustomArguments<uint16_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, int8_t, RandRand)
->Apply(CustomArguments<int8_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint8_t, RandRand)
->Apply(CustomArguments<uint8_t>);
// One value is always zero, and another is bigger or equal than zero.
// The comparison is predictable.
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint32_t, ZeroRand)
->Apply(CustomArguments<uint32_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint64_t, ZeroRand)
->Apply(CustomArguments<uint64_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint16_t, ZeroRand)
->Apply(CustomArguments<uint16_t>);
BENCHMARK_TEMPLATE(BM_StdMidpoint, uint8_t, ZeroRand)
->Apply(CustomArguments<uint8_t>);
```
```
$ ~/src/googlebenchmark/tools/compare.py --no-utest benchmarks ./llvm-cmov-bench-OLD ./llvm-cmov-bench-NEW
RUNNING: ./llvm-cmov-bench-OLD --benchmark_out=/tmp/tmp5a5qjm
2019-03-06 21:53:31
Running ./llvm-cmov-bench-OLD
Run on (8 X 4000 MHz CPU s)
CPU Caches:
L1 Data 16K (x8)
L1 Instruction 64K (x4)
L2 Unified 2048K (x4)
L3 Unified 8192K (x1)
Load Average: 1.78, 1.81, 1.36
----------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters<...>
----------------------------------------------------------------------------------------------------
<...>
BM_StdMidpoint<int32_t, RandRand>/131072 300398 ns 300404 ns 2330 bytes_read/iteration=1024k bytes_read/sec=3.25083G/s midpoints=305.398M midpoints/sec=436.319M/s
BM_StdMidpoint<int32_t, RandRand>_BigO 2.29 N 2.29 N
BM_StdMidpoint<int32_t, RandRand>_RMS 2 % 2 %
<...>
BM_StdMidpoint<uint32_t, RandRand>/131072 300433 ns 300433 ns 2330 bytes_read/iteration=1024k bytes_read/sec=3.25052G/s midpoints=305.398M midpoints/sec=436.278M/s
BM_StdMidpoint<uint32_t, RandRand>_BigO 2.29 N 2.29 N
BM_StdMidpoint<uint32_t, RandRand>_RMS 2 % 2 %
<...>
BM_StdMidpoint<int64_t, RandRand>/65536 169857 ns 169858 ns 4121 bytes_read/iteration=1024k bytes_read/sec=5.74929G/s midpoints=270.074M midpoints/sec=385.828M/s
BM_StdMidpoint<int64_t, RandRand>_BigO 2.59 N 2.59 N
BM_StdMidpoint<int64_t, RandRand>_RMS 3 % 3 %
<...>
BM_StdMidpoint<uint64_t, RandRand>/65536 169770 ns 169771 ns 4125 bytes_read/iteration=1024k bytes_read/sec=5.75223G/s midpoints=270.336M midpoints/sec=386.026M/s
BM_StdMidpoint<uint64_t, RandRand>_BigO 2.59 N 2.59 N
BM_StdMidpoint<uint64_t, RandRand>_RMS 3 % 3 %
<...>
BM_StdMidpoint<int16_t, RandRand>/262144 591169 ns 591179 ns 1182 bytes_read/iteration=1024k bytes_read/sec=1.65189G/s midpoints=309.854M midpoints/sec=443.426M/s
BM_StdMidpoint<int16_t, RandRand>_BigO 2.25 N 2.25 N
BM_StdMidpoint<int16_t, RandRand>_RMS 1 % 1 %
<...>
BM_StdMidpoint<uint16_t, RandRand>/262144 591264 ns 591274 ns 1184 bytes_read/iteration=1024k bytes_read/sec=1.65162G/s midpoints=310.378M midpoints/sec=443.354M/s
BM_StdMidpoint<uint16_t, RandRand>_BigO 2.25 N 2.25 N
BM_StdMidpoint<uint16_t, RandRand>_RMS 1 % 1 %
<...>
BM_StdMidpoint<int8_t, RandRand>/524288 2983669 ns 2983689 ns 235 bytes_read/iteration=1024k bytes_read/sec=335.156M/s midpoints=123.208M midpoints/sec=175.718M/s
BM_StdMidpoint<int8_t, RandRand>_BigO 5.69 N 5.69 N
BM_StdMidpoint<int8_t, RandRand>_RMS 0 % 0 %
<...>
BM_StdMidpoint<uint8_t, RandRand>/524288 2668398 ns 2668419 ns 262 bytes_read/iteration=1024k bytes_read/sec=374.754M/s midpoints=137.363M midpoints/sec=196.479M/s
BM_StdMidpoint<uint8_t, RandRand>_BigO 5.09 N 5.09 N
BM_StdMidpoint<uint8_t, RandRand>_RMS 0 % 0 %
<...>
BM_StdMidpoint<uint32_t, ZeroRand>/131072 300887 ns 300887 ns 2331 bytes_read/iteration=1024k bytes_read/sec=3.24561G/s midpoints=305.529M midpoints/sec=435.619M/s
BM_StdMidpoint<uint32_t, ZeroRand>_BigO 2.29 N 2.29 N
BM_StdMidpoint<uint32_t, ZeroRand>_RMS 2 % 2 %
<...>
BM_StdMidpoint<uint64_t, ZeroRand>/65536 169634 ns 169634 ns 4102 bytes_read/iteration=1024k bytes_read/sec=5.75688G/s midpoints=268.829M midpoints/sec=386.338M/s
BM_StdMidpoint<uint64_t, ZeroRand>_BigO 2.59 N 2.59 N
BM_StdMidpoint<uint64_t, ZeroRand>_RMS 3 % 3 %
<...>
BM_StdMidpoint<uint16_t, ZeroRand>/262144 592252 ns 592255 ns 1182 bytes_read/iteration=1024k bytes_read/sec=1.64889G/s midpoints=309.854M midpoints/sec=442.62M/s
BM_StdMidpoint<uint16_t, ZeroRand>_BigO 2.26 N 2.26 N
BM_StdMidpoint<uint16_t, ZeroRand>_RMS 1 % 1 %
<...>
BM_StdMidpoint<uint8_t, ZeroRand>/524288 987295 ns 987309 ns 711 bytes_read/iteration=1024k bytes_read/sec=1012.85M/s midpoints=372.769M midpoints/sec=531.028M/s
BM_StdMidpoint<uint8_t, ZeroRand>_BigO 1.88 N 1.88 N
BM_StdMidpoint<uint8_t, ZeroRand>_RMS 1 % 1 %
RUNNING: ./llvm-cmov-bench-NEW --benchmark_out=/tmp/tmpPvwpfW
2019-03-06 21:56:58
Running ./llvm-cmov-bench-NEW
Run on (8 X 4000 MHz CPU s)
CPU Caches:
L1 Data 16K (x8)
L1 Instruction 64K (x4)
L2 Unified 2048K (x4)
L3 Unified 8192K (x1)
Load Average: 1.17, 1.46, 1.30
----------------------------------------------------------------------------------------------------
Benchmark Time CPU Iterations UserCounters<...>
----------------------------------------------------------------------------------------------------
<...>
BM_StdMidpoint<int32_t, RandRand>/131072 300878 ns 300880 ns 2324 bytes_read/iteration=1024k bytes_read/sec=3.24569G/s midpoints=304.611M midpoints/sec=435.629M/s
BM_StdMidpoint<int32_t, RandRand>_BigO 2.29 N 2.29 N
BM_StdMidpoint<int32_t, RandRand>_RMS 2 % 2 %
<...>
BM_StdMidpoint<uint32_t, RandRand>/131072 300231 ns 300226 ns 2330 bytes_read/iteration=1024k bytes_read/sec=3.25276G/s midpoints=305.398M midpoints/sec=436.578M/s
BM_StdMidpoint<uint32_t, RandRand>_BigO 2.29 N 2.29 N
BM_StdMidpoint<uint32_t, RandRand>_RMS 2 % 2 %
<...>
BM_StdMidpoint<int64_t, RandRand>/65536 170819 ns 170777 ns 4115 bytes_read/iteration=1024k bytes_read/sec=5.71835G/s midpoints=269.681M midpoints/sec=383.752M/s
BM_StdMidpoint<int64_t, RandRand>_BigO 2.60 N 2.60 N
BM_StdMidpoint<int64_t, RandRand>_RMS 3 % 3 %
<...>
BM_StdMidpoint<uint64_t, RandRand>/65536 171705 ns 171708 ns 4106 bytes_read/iteration=1024k bytes_read/sec=5.68733G/s midpoints=269.091M midpoints/sec=381.671M/s
BM_StdMidpoint<uint64_t, RandRand>_BigO 2.62 N 2.62 N
BM_StdMidpoint<uint64_t, RandRand>_RMS 3 % 3 %
<...>
BM_StdMidpoint<int16_t, RandRand>/262144 592510 ns 592516 ns 1182 bytes_read/iteration=1024k bytes_read/sec=1.64816G/s midpoints=309.854M midpoints/sec=442.425M/s
BM_StdMidpoint<int16_t, RandRand>_BigO 2.26 N 2.26 N
BM_StdMidpoint<int16_t, RandRand>_RMS 1 % 1 %
<...>
BM_StdMidpoint<uint16_t, RandRand>/262144 614823 ns 614823 ns 1180 bytes_read/iteration=1024k bytes_read/sec=1.58836G/s midpoints=309.33M midpoints/sec=426.373M/s
BM_StdMidpoint<uint16_t, RandRand>_BigO 2.33 N 2.33 N
BM_StdMidpoint<uint16_t, RandRand>_RMS 4 % 4 %
<...>
BM_StdMidpoint<int8_t, RandRand>/524288 1073181 ns 1073201 ns 650 bytes_read/iteration=1024k bytes_read/sec=931.791M/s midpoints=340.787M midpoints/sec=488.527M/s
BM_StdMidpoint<int8_t, RandRand>_BigO 2.05 N 2.05 N
BM_StdMidpoint<int8_t, RandRand>_RMS 1 % 1 %
BM_StdMidpoint<uint8_t, RandRand>/524288 1071010 ns 1071020 ns 653 bytes_read/iteration=1024k bytes_read/sec=933.689M/s midpoints=342.36M midpoints/sec=489.522M/s
BM_StdMidpoint<uint8_t, RandRand>_BigO 2.05 N 2.05 N
BM_StdMidpoint<uint8_t, RandRand>_RMS 1 % 1 %
<...>
BM_StdMidpoint<uint32_t, ZeroRand>/131072 300413 ns 300416 ns 2330 bytes_read/iteration=1024k bytes_read/sec=3.2507G/s midpoints=305.398M midpoints/sec=436.302M/s
BM_StdMidpoint<uint32_t, ZeroRand>_BigO 2.29 N 2.29 N
BM_StdMidpoint<uint32_t, ZeroRand>_RMS 2 % 2 %
<...>
BM_StdMidpoint<uint64_t, ZeroRand>/65536 169667 ns 169669 ns 4123 bytes_read/iteration=1024k bytes_read/sec=5.75568G/s midpoints=270.205M midpoints/sec=386.257M/s
BM_StdMidpoint<uint64_t, ZeroRand>_BigO 2.59 N 2.59 N
BM_StdMidpoint<uint64_t, ZeroRand>_RMS 3 % 3 %
<...>
BM_StdMidpoint<uint16_t, ZeroRand>/262144 591396 ns 591404 ns 1184 bytes_read/iteration=1024k bytes_read/sec=1.65126G/s midpoints=310.378M midpoints/sec=443.257M/s
BM_StdMidpoint<uint16_t, ZeroRand>_BigO 2.26 N 2.26 N
BM_StdMidpoint<uint16_t, ZeroRand>_RMS 1 % 1 %
<...>
BM_StdMidpoint<uint8_t, ZeroRand>/524288 1069421 ns 1069413 ns 655 bytes_read/iteration=1024k bytes_read/sec=935.092M/s midpoints=343.409M midpoints/sec=490.258M/s
BM_StdMidpoint<uint8_t, ZeroRand>_BigO 2.04 N 2.04 N
BM_StdMidpoint<uint8_t, ZeroRand>_RMS 0 % 0 %
Comparing ./llvm-cmov-bench-OLD to ./llvm-cmov-bench-NEW
Benchmark Time CPU Time Old Time New CPU Old CPU New
----------------------------------------------------------------------------------------------------------------------------------------
<...>
BM_StdMidpoint<int32_t, RandRand>/131072 +0.0016 +0.0016 300398 300878 300404 300880
<...>
BM_StdMidpoint<uint32_t, RandRand>/131072 -0.0007 -0.0007 300433 300231 300433 300226
<...>
BM_StdMidpoint<int64_t, RandRand>/65536 +0.0057 +0.0054 169857 170819 169858 170777
<...>
BM_StdMidpoint<uint64_t, RandRand>/65536 +0.0114 +0.0114 169770 171705 169771 171708
<...>
BM_StdMidpoint<int16_t, RandRand>/262144 +0.0023 +0.0023 591169 592510 591179 592516
<...>
BM_StdMidpoint<uint16_t, RandRand>/262144 +0.0398 +0.0398 591264 614823 591274 614823
<...>
BM_StdMidpoint<int8_t, RandRand>/524288 -0.6403 -0.6403 2983669 1073181 2983689 1073201
<...>
BM_StdMidpoint<uint8_t, RandRand>/524288 -0.5986 -0.5986 2668398 1071010 2668419 1071020
<...>
BM_StdMidpoint<uint32_t, ZeroRand>/131072 -0.0016 -0.0016 300887 300413 300887 300416
<...>
BM_StdMidpoint<uint64_t, ZeroRand>/65536 +0.0002 +0.0002 169634 169667 169634 169669
<...>
BM_StdMidpoint<uint16_t, ZeroRand>/262144 -0.0014 -0.0014 592252 591396 592255 591404
<...>
BM_StdMidpoint<uint8_t, ZeroRand>/524288 +0.0832 +0.0832 987295 1069421 987309 1069413
```
What can we tell from the benchmark?
* `BM_StdMidpoint<[u]int8_t, RandRand>` indeed has the worst performance.
* All `BM_StdMidpoint<uint{8,16,32}_t, ZeroRand>` are all performant, even the 8-bit case.
That is because there we are computing mid point between zero and some random number,
thus if the branch predictor is in use, it is in optimal situation.
* Promoting 8-bit CMOV did improve performance of `BM_StdMidpoint<[u]int8_t, RandRand>`, by -59%..-64%.
# What about the branch predictor?
* `BM_StdMidpoint<uint8_t, ZeroRand>` was faster than `BM_StdMidpoint<uint{16,32,64}_t, ZeroRand>`,
which may mean that a well-predicted branch is better than `cmov`.
* Promoting 8-bit CMOV degraded performance of `BM_StdMidpoint<uint8_t, ZeroRand>`;
`cmov` is up to +10% worse than a well-predicted branch.
* However, I do not believe this is a concern. If the branch is well predicted, then PGO
will also say that it is well predicted, and LLVM will happily expand the cmov back into a branch:
https://godbolt.org/z/P5ufig
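To make the shape of the code concrete, here is a minimal sketch (my own illustration, not code from the patch or the benchmark above) of the kind of i8 select that now lowers to a promoted CMOV:
```
// Minimal sketch (illustrative; assumes an optimizing x86-64 build).
#include <cstdint>

uint8_t select_smaller(uint8_t a, uint8_t b) {
  // With an unpredictable condition this select is lowered to a cmov;
  // with PGO evidence that the condition is biased, the backend can
  // expand it back into a branch.
  return a < b ? a : b;
}
```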
# What about partial register stalls?
I'm not really able to answer that.
What I can say is that if the branch is unpredictable (if it is predictable, then use PGO and you'll get a branch),
in ~50% of cases you will have to pay the branch misprediction penalty.
```
$ grep -i MispredictPenalty X86Sched*.td
X86SchedBroadwell.td: let MispredictPenalty = 16;
X86SchedHaswell.td: let MispredictPenalty = 16;
X86SchedSandyBridge.td: let MispredictPenalty = 16;
X86SchedSkylakeClient.td: let MispredictPenalty = 14;
X86SchedSkylakeServer.td: let MispredictPenalty = 14;
X86ScheduleBdVer2.td: let MispredictPenalty = 20; // Minimum branch misdirection penalty.
X86ScheduleBtVer2.td: let MispredictPenalty = 14; // Minimum branch misdirection penalty
X86ScheduleSLM.td: let MispredictPenalty = 10;
X86ScheduleZnver1.td: let MispredictPenalty = 17;
```
... which can be as small as 10 cycles and as large as 20 cycles.
Partial register stalls do not seem to be an issue for AMD CPUs.
For Intel CPUs, they should be around ~5 cycles?
Is that actually an issue here? I'm not sure.
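As a back-of-the-envelope check (my own arithmetic; only the penalty values come from the scheduler tables above, and the ~1-2 cycle cmov latency is an assumption):
```
// Sketch: expected cost of an unpredictable branch vs. a cmov.
#include <cstdio>
#include <initializer_list>

int main() {
  const double mispredict_rate = 0.5;       // unpredictable branch
  for (int penalty : {10, 14, 16, 17, 20})  // MispredictPenalty values above
    std::printf("MispredictPenalty=%2d -> ~%.0f expected cycles/branch\n",
                penalty, mispredict_rate * penalty);
  return 0; // i.e. ~5..10 cycles, vs. ~1-2 cycles of cmov latency
}
```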
In short, I'd say this is an improvement, at least on this microbenchmark.
Fixes [[ https://bugs.llvm.org/show_bug.cgi?id=40965 | PR40965 ]].
Reviewers: craig.topper, RKSimon, spatel, andreadb, nikic
Reviewed By: craig.topper, andreadb
Subscribers: jfb, jdoerfert, llvm-commits, mclow.lists
Tags: #llvm, #libc
Differential Revision: https://reviews.llvm.org/D59035
llvm-svn: 356300
2019-03-16 05:17:53 +08:00
|
|
|
; DISABLE: pushq
|
2015-05-27 14:28:41 +08:00
|
|
|
;
|
|
|
|
; CHECK: testq %rdi, %rdi
|
|
|
|
; CHECK-NEXT: je [[CLEANUP:LBB[0-9_]+]]
|
|
|
|
;
|
2018-10-06 02:13:36 +08:00
|
|
|
; CHECK: cmpw $66, (%rdi)
|
2015-05-27 14:28:41 +08:00
|
|
|
; CHECK-NEXT: jne [[CLEANUP]]
|
|
|
|
;
|
|
|
|
; CHECK: movq 8(%rdi), %rdi
|
|
|
|
; CHECK-NEXT: movzwl (%rdi), %e[[BF_LOAD2:[a-z]+]]
|
|
|
|
; CHECK-NEXT: leal -54(%r[[BF_LOAD2]]), [[TMP:%e[a-z]+]]
|
|
|
|
; CHECK-NEXT: cmpl $14, [[TMP]]
|
|
|
|
; CHECK-NEXT: ja [[LOR_LHS_FALSE:LBB[0-9_]+]]
|
|
|
|
;
|
|
|
|
; CHECK: movl $24599, [[TMP2:%e[a-z]+]]
|
|
|
|
; CHECK-NEXT: btl [[TMP]], [[TMP2]]
|
2015-08-27 07:15:32 +08:00
|
|
|
; CHECK-NEXT: jae [[LOR_LHS_FALSE:LBB[0-9_]+]]
|
|
|
|
;
|
|
|
|
; CHECK: [[CLEANUP]]: ## %cleanup
|
|
|
|
; DISABLE: popq
|
|
|
|
; CHECK-NEXT: retq
|
2015-05-27 14:28:41 +08:00
|
|
|
;
|
|
|
|
; CHECK: [[LOR_LHS_FALSE]]: ## %lor.lhs.false
|
|
|
|
; CHECK: cmpl $134, %e[[BF_LOAD2]]
|
|
|
|
; CHECK-NEXT: je [[CLEANUP]]
|
|
|
|
;
|
|
|
|
; CHECK: cmpl $140, %e[[BF_LOAD2]]
|
|
|
|
; CHECK-NEXT: je [[CLEANUP]]
|
|
|
|
;
|
|
|
|
; ENABLE: pushq
|
|
|
|
; CHECK: callq _find_temp_slot_from_address
|
|
|
|
; CHECK-NEXT: testq %rax, %rax
|
|
|
|
;
|
|
|
|
; The adjustment must use LEA here (or be moved above the test), since it must not clobber the EFLAGS consumed by the je below.
|
|
|
|
; ENABLE-NEXT: leaq 8(%rsp), %rsp
|
|
|
|
;
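; For reference (illustrative comment, not a checked pattern): the naive
;   addq $8, %rsp        ; would clobber the EFLAGS set by the testq above
; must instead be emitted as
;   leaq 8(%rsp), %rsp   ; same stack adjustment, EFLAGS left intact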
|
|
|
|
; CHECK-NEXT: je [[CLEANUP]]
|
|
|
|
;
|
|
|
|
; CHECK: movb $1, 57(%rax)
|
|
|
|
define void @useLEA(%struct.rtx_def* readonly %x) {
|
|
|
|
entry:
|
|
|
|
%cmp = icmp eq %struct.rtx_def* %x, null
|
|
|
|
br i1 %cmp, label %cleanup, label %if.end
|
|
|
|
|
|
|
|
if.end: ; preds = %entry
|
|
|
|
%tmp = getelementptr inbounds %struct.rtx_def, %struct.rtx_def* %x, i64 0, i32 0
|
|
|
|
%bf.load = load i32, i32* %tmp, align 8
|
|
|
|
%bf.clear = and i32 %bf.load, 65535
|
|
|
|
%cmp1 = icmp eq i32 %bf.clear, 66
|
|
|
|
br i1 %cmp1, label %lor.lhs.false, label %cleanup
|
|
|
|
|
|
|
|
lor.lhs.false: ; preds = %if.end
|
|
|
|
%arrayidx = getelementptr inbounds %struct.rtx_def, %struct.rtx_def* %x, i64 0, i32 1, i64 0
|
|
|
|
%rtx = bitcast %union.rtunion_def* %arrayidx to %struct.rtx_def**
|
|
|
|
%tmp1 = load %struct.rtx_def*, %struct.rtx_def** %rtx, align 8
|
|
|
|
%tmp2 = getelementptr inbounds %struct.rtx_def, %struct.rtx_def* %tmp1, i64 0, i32 0
|
|
|
|
%bf.load2 = load i32, i32* %tmp2, align 8
|
|
|
|
%bf.clear3 = and i32 %bf.load2, 65535
|
|
|
|
switch i32 %bf.clear3, label %if.end.55 [
|
|
|
|
i32 67, label %cleanup
|
|
|
|
i32 68, label %cleanup
|
|
|
|
i32 54, label %cleanup
|
|
|
|
i32 55, label %cleanup
|
|
|
|
i32 58, label %cleanup
|
|
|
|
i32 134, label %cleanup
|
|
|
|
i32 56, label %cleanup
|
|
|
|
i32 140, label %cleanup
|
|
|
|
]
|
|
|
|
|
|
|
|
if.end.55: ; preds = %lor.lhs.false
|
|
|
|
%call = tail call fastcc %struct.temp_slot* @find_temp_slot_from_address(%struct.rtx_def* %tmp1) #2
|
|
|
|
%cmp59 = icmp eq %struct.temp_slot* %call, null
|
|
|
|
br i1 %cmp59, label %cleanup, label %if.then.60
|
|
|
|
|
|
|
|
if.then.60: ; preds = %if.end.55
|
|
|
|
%addr_taken = getelementptr inbounds %struct.temp_slot, %struct.temp_slot* %call, i64 0, i32 8
|
|
|
|
store i8 1, i8* %addr_taken, align 1
|
|
|
|
br label %cleanup
|
|
|
|
|
|
|
|
cleanup: ; preds = %if.then.60, %if.end.55, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %lor.lhs.false, %if.end, %entry
|
|
|
|
ret void
|
|
|
|
}
|
2015-07-11 06:09:55 +08:00
|
|
|
|
|
|
|
; Make sure we do not insert unreachable code after a noreturn function.
|
|
|
|
; Although it is not incorrect to insert such code, it is useless
|
|
|
|
; and it hurts the binary size.
|
|
|
|
;
|
|
|
|
; CHECK-LABEL: noreturn:
|
|
|
|
; DISABLE: pushq
|
|
|
|
;
|
|
|
|
; CHECK: testb %dil, %dil
|
|
|
|
; CHECK-NEXT: jne [[ABORT:LBB[0-9_]+]]
|
|
|
|
;
|
|
|
|
; CHECK: movl $42, %eax
|
|
|
|
;
|
|
|
|
; DISABLE-NEXT: popq
|
|
|
|
;
|
|
|
|
; CHECK-NEXT: retq
|
|
|
|
;
|
|
|
|
; CHECK: [[ABORT]]: ## %if.abort
|
|
|
|
;
|
|
|
|
; ENABLE: pushq
|
|
|
|
;
|
|
|
|
; CHECK: callq _abort
|
|
|
|
; ENABLE-NOT: popq
|
|
|
|
define i32 @noreturn(i8 signext %bad_thing) {
|
|
|
|
entry:
|
|
|
|
%tobool = icmp eq i8 %bad_thing, 0
|
|
|
|
br i1 %tobool, label %if.end, label %if.abort
|
|
|
|
|
|
|
|
if.abort:
|
|
|
|
tail call void @abort() #0
|
|
|
|
unreachable
|
|
|
|
|
|
|
|
if.end:
|
|
|
|
ret i32 42
|
|
|
|
}
|
|
|
|
|
|
|
|
declare void @abort() #0
|
|
|
|
|
|
|
|
attributes #0 = { noreturn nounwind }
|
2015-08-07 03:01:57 +08:00
|
|
|
|
|
|
|
|
|
|
|
; Make sure that we handle infinite loops properly. When checking that the Save
|
|
|
|
; and Restore blocks are control flow equivalent, the loop searches for the
|
|
|
|
; immediate (post) dominator for the (restore) save blocks. When either the Save
|
|
|
|
; or Restore block is located in an infinite loop the only immediate (post)
|
|
|
|
; dominator is itself. In this case, we cannot perform shrink wrapping, but we
|
|
|
|
; should return gracefully and continue compilation.
|
|
|
|
; The only condition for this test is that the compilation finishes correctly.
|
|
|
|
;
|
|
|
|
; CHECK-LABEL: infiniteloop
|
|
|
|
; CHECK: retq
|
|
|
|
define void @infiniteloop() {
|
|
|
|
entry:
|
|
|
|
br i1 undef, label %if.then, label %if.end
|
|
|
|
|
|
|
|
if.then:
|
|
|
|
%ptr = alloca i32, i32 4
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
for.body:                                        ; preds = %if.then, %for.body
|
|
|
|
%sum.03 = phi i32 [ 0, %if.then ], [ %add, %for.body ]
|
|
|
|
%call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
|
|
|
|
%add = add nsw i32 %call, %sum.03
|
|
|
|
store i32 %add, i32* %ptr
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
if.end:
|
|
|
|
ret void
|
|
|
|
}
|
2015-09-16 02:51:43 +08:00
|
|
|
|
|
|
|
; Another infinite loop test, this time with a body bigger than just one block.
|
|
|
|
; CHECK-LABEL: infiniteloop2
|
|
|
|
; CHECK: retq
|
|
|
|
define void @infiniteloop2() {
|
|
|
|
entry:
|
|
|
|
br i1 undef, label %if.then, label %if.end
|
|
|
|
|
|
|
|
if.then:
|
|
|
|
%ptr = alloca i32, i32 4
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
for.body:                                        ; preds = %if.then, %body1, %body2
|
|
|
|
%sum.03 = phi i32 [ 0, %if.then ], [ %add, %body1 ], [ 1, %body2]
|
|
|
|
%call = tail call i32 asm "movl $$1, $0", "=r,~{ebx}"()
|
|
|
|
%add = add nsw i32 %call, %sum.03
|
|
|
|
store i32 %add, i32* %ptr
|
|
|
|
br i1 undef, label %body1, label %body2
|
|
|
|
|
|
|
|
body1:
|
|
|
|
tail call void asm sideeffect "nop", "~{ebx}"()
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
body2:
|
|
|
|
tail call void asm sideeffect "nop", "~{ebx}"()
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
if.end:
|
|
|
|
ret void
|
|
|
|
}
|
2015-09-18 07:21:34 +08:00
|
|
|
|
|
|
|
; Another infinite loop test, this time with two nested infinite loops.
|
|
|
|
; CHECK-LABEL: infiniteloop3
|
|
|
|
; CHECK: retq
|
|
|
|
define void @infiniteloop3() {
|
|
|
|
entry:
|
|
|
|
br i1 undef, label %loop2a, label %body
|
|
|
|
|
|
|
|
body: ; preds = %entry
|
|
|
|
br i1 undef, label %loop2a, label %end
|
|
|
|
|
|
|
|
loop1: ; preds = %loop2a, %loop2b
|
|
|
|
%var.phi = phi i32* [ %next.phi, %loop2b ], [ %var, %loop2a ]
|
|
|
|
%next.phi = phi i32* [ %next.load, %loop2b ], [ %next.var, %loop2a ]
|
|
|
|
%0 = icmp eq i32* %var, null
|
|
|
|
%next.load = load i32*, i32** undef
|
|
|
|
br i1 %0, label %loop2a, label %loop2b
|
|
|
|
|
|
|
|
loop2a: ; preds = %loop1, %body, %entry
|
|
|
|
%var = phi i32* [ null, %body ], [ null, %entry ], [ %next.phi, %loop1 ]
|
|
|
|
%next.var = phi i32* [ undef, %body ], [ null, %entry ], [ %next.load, %loop1 ]
|
|
|
|
br label %loop1
|
|
|
|
|
|
|
|
loop2b: ; preds = %loop1
|
|
|
|
%gep1 = bitcast i32* %var.phi to i32*
|
|
|
|
%next.ptr = bitcast i32* %gep1 to i32**
|
|
|
|
store i32* %next.phi, i32** %next.ptr
|
|
|
|
br label %loop1
|
|
|
|
|
|
|
|
end:
|
|
|
|
ret void
|
|
|
|
}
|
2015-11-07 05:00:13 +08:00
|
|
|
|
|
|
|
; Check that we do not just bail out when a RegMask is present.
|
|
|
|
; In this case, the RegMask does not touch a CSR so we are good to go!
|
|
|
|
; CHECK-LABEL: regmask:
|
|
|
|
;
|
|
|
|
; Compare the arguments and jump to exit.
|
|
|
|
; No prologue needed.
|
|
|
|
; ENABLE: cmpl %esi, %edi
|
|
|
|
; ENABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
|
|
|
|
;
|
|
|
|
; Prologue code.
|
|
|
|
; (What we push does not matter. It should be some random scratch register.)
|
|
|
|
; CHECK: pushq
|
|
|
|
;
|
|
|
|
; Compare the arguments and jump to exit.
|
|
|
|
; After the prologue is set.
|
|
|
|
; DISABLE: cmpl %esi, %edi
|
|
|
|
; DISABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
|
|
|
|
;
|
|
|
|
; CHECK: nop
|
|
|
|
; Set the first argument to zero.
|
|
|
|
; CHECK: xorl %edi, %edi
|
|
|
|
; Set the second argument to addr.
|
|
|
|
; CHECK-NEXT: movq %rdx, %rsi
|
|
|
|
; CHECK-NEXT: callq _doSomething
|
|
|
|
; CHECK-NEXT: popq
|
|
|
|
; CHECK-NEXT: retq
|
|
|
|
;
|
|
|
|
; CHECK: [[EXIT_LABEL]]:
|
|
|
|
; Set the first argument to 6.
|
|
|
|
; CHECK-NEXT: movl $6, %edi
|
|
|
|
; Set the second argument to addr.
|
|
|
|
; CHECK-NEXT: movq %rdx, %rsi
|
|
|
|
;
|
|
|
|
; Without shrink-wrapping, we need to restore the stack before
|
|
|
|
; making the tail call.
|
|
|
|
; Epilogue code.
|
|
|
|
; DISABLE-NEXT: popq
|
|
|
|
;
|
|
|
|
; CHECK-NEXT: jmp _doSomething
|
|
|
|
define i32 @regmask(i32 %a, i32 %b, i32* %addr) {
|
|
|
|
%tmp2 = icmp slt i32 %a, %b
|
|
|
|
br i1 %tmp2, label %true, label %false
|
|
|
|
|
|
|
|
true:
|
|
|
|
; Clobber a CSR so that we check something on the regmask
|
|
|
|
; of the tail call.
|
|
|
|
tail call void asm sideeffect "nop", "~{ebx}"()
|
|
|
|
%tmp4 = call i32 @doSomething(i32 0, i32* %addr)
|
|
|
|
br label %end
|
|
|
|
|
|
|
|
false:
|
|
|
|
%tmp5 = tail call i32 @doSomething(i32 6, i32* %addr)
|
|
|
|
br label %end
|
|
|
|
|
|
|
|
end:
|
|
|
|
%tmp.0 = phi i32 [ %tmp4, %true ], [ %tmp5, %false ]
|
|
|
|
ret i32 %tmp.0
|
|
|
|
}
|
2015-12-02 03:49:31 +08:00
|
|
|
|
|
|
|
@b = internal unnamed_addr global i1 false
|
|
|
|
@c = internal unnamed_addr global i8 0, align 1
|
|
|
|
@a = common global i32 0, align 4
|
|
|
|
|
|
|
|
; Make sure the prologue does not clobber the EFLAGS when
|
|
|
|
; it is live across.
|
|
|
|
; PR25629.
|
|
|
|
; Note: The registers may change in the following patterns, but
|
|
|
|
; because they imply a register hierarchy (e.g., eax, al) it is
|
|
|
|
; tricky to write robust patterns.
|
|
|
|
;
|
|
|
|
; CHECK-LABEL: useLEAForPrologue:
|
|
|
|
;
|
|
|
|
; Prologue is at the beginning of the function when shrink-wrapping
|
|
|
|
; is disabled.
|
|
|
|
; DISABLE: pushq
|
|
|
|
; The stack adjustment can use a SUB instruction because we do not need to
|
|
|
|
; preserve the EFLAGS at this point.
|
|
|
|
; DISABLE-NEXT: subq $16, %rsp
|
|
|
|
;
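; (Illustrative note: had EFLAGS been live across this point, the same
; adjustment would have to be emitted as 'leaq -16(%rsp), %rsp', which
; subtracts from %rsp without touching EFLAGS; since nothing reads the
; flags here, the plain SUB form is fine.)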
|
|
|
|
; Load the value of b.
|
|
|
|
; Create the zero value for the select assignment.
|
2018-02-12 10:48:42 +08:00
|
|
|
; CHECK: xorl [[CMOVE_VAL:%eax]], [[CMOVE_VAL]]
|
|
|
|
; CHECK-NEXT: cmpb $0, _b(%rip)
|
2019-03-16 05:17:53 +08:00
|
|
|
; CHECK-NEXT: movl $48, [[IMM_VAL:%ecx]]
|
|
|
|
; CHECK-NEXT: cmovnel [[CMOVE_VAL]], [[IMM_VAL]]
|
|
|
|
; CHECK-NEXT: movb %cl, _c(%rip)
|
2015-12-02 03:49:31 +08:00
|
|
|
; CHECK-NEXT: je [[VARFUNC_CALL:LBB[0-9_]+]]
|
|
|
|
;
|
|
|
|
; The code of the loop is not interesting.
|
|
|
|
; [...]
|
|
|
|
;
|
|
|
|
; CHECK: [[VARFUNC_CALL]]:
|
|
|
|
; Set the null parameter.
|
|
|
|
; CHECK-NEXT: xorl %edi, %edi
|
|
|
|
; CHECK-NEXT: callq _varfunc
|
|
|
|
;
|
|
|
|
; Set the return value.
|
|
|
|
; CHECK-NEXT: xorl %eax, %eax
|
|
|
|
;
|
|
|
|
; Epilogue code.
|
|
|
|
; CHECK-NEXT: addq $16, %rsp
|
|
|
|
; CHECK-NEXT: popq
|
|
|
|
; CHECK-NEXT: retq
|
|
|
|
define i32 @useLEAForPrologue(i32 %d, i32 %a, i8 %c) #3 {
|
|
|
|
entry:
|
|
|
|
%tmp = alloca i3
|
|
|
|
%.b = load i1, i1* @b, align 1
|
|
|
|
%bool = select i1 %.b, i8 0, i8 48
|
|
|
|
store i8 %bool, i8* @c, align 1
|
|
|
|
br i1 %.b, label %for.body.lr.ph, label %for.end
|
|
|
|
|
|
|
|
for.body.lr.ph: ; preds = %entry
|
|
|
|
tail call void asm sideeffect "nop", "~{ebx}"()
|
|
|
|
br label %for.body
|
|
|
|
|
|
|
|
for.body: ; preds = %for.body.lr.ph, %for.body
|
|
|
|
%inc6 = phi i8 [ %c, %for.body.lr.ph ], [ %inc, %for.body ]
|
|
|
|
%cond5 = phi i32 [ %a, %for.body.lr.ph ], [ %conv3, %for.body ]
|
|
|
|
%cmp2 = icmp slt i32 %d, %cond5
|
|
|
|
%conv3 = zext i1 %cmp2 to i32
|
|
|
|
%inc = add i8 %inc6, 1
|
|
|
|
%cmp = icmp slt i8 %inc, 45
|
|
|
|
br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
|
|
|
|
|
|
|
|
for.cond.for.end_crit_edge: ; preds = %for.body
|
|
|
|
store i32 %conv3, i32* @a, align 4
|
|
|
|
br label %for.end
|
|
|
|
|
|
|
|
for.end: ; preds = %for.cond.for.end_crit_edge, %entry
|
|
|
|
%call = tail call i32 (i8*) @varfunc(i8* null)
|
|
|
|
ret i32 0
|
|
|
|
}
|
|
|
|
|
|
|
|
declare i32 @varfunc(i8* nocapture readonly)
|
|
|
|
|
2016-01-07 03:09:26 +08:00
|
|
|
@sum1 = external hidden thread_local global i32, align 4
|
|
|
|
|
|
|
|
|
|
|
|
; Function Attrs: nounwind
|
|
|
|
; Make sure the TLS call used to access @sum1 happens after the prologue
|
|
|
|
; and before the epilogue.
|
|
|
|
; TLS calls used to be wrongly modeled and shrink-wrapping would have inserted
|
|
|
|
; the prologue and epilogue just around the call to doSomething.
|
|
|
|
; PR25820.
|
|
|
|
;
|
|
|
|
; CHECK-LABEL: tlsCall:
|
|
|
|
; CHECK: pushq
|
|
|
|
; CHECK: testb $1, %dil
|
|
|
|
; CHECK: je [[ELSE_LABEL:LBB[0-9_]+]]
|
|
|
|
;
|
|
|
|
; master bb
|
|
|
|
; CHECK: movq _sum1@TLVP(%rip), %rdi
|
|
|
|
; CHECK-NEXT: callq *(%rdi)
|
|
|
|
; CHECK: jmp [[EXIT_LABEL:LBB[0-9_]+]]
|
|
|
|
;
|
|
|
|
; [[ELSE_LABEL]]:
|
|
|
|
; CHECK: callq _doSomething
|
|
|
|
;
|
|
|
|
; [[EXIT_LABEL]]:
|
|
|
|
; CHECK: popq
|
|
|
|
; CHECK-NEXT: retq
|
|
|
|
define i32 @tlsCall(i1 %bool1, i32 %arg, i32* readonly dereferenceable(4) %sum1) #3 {
|
|
|
|
entry:
|
|
|
|
br i1 %bool1, label %master, label %else
|
|
|
|
|
|
|
|
master:
|
|
|
|
%tmp1 = load i32, i32* %sum1, align 4
|
|
|
|
store i32 %tmp1, i32* @sum1, align 4
|
|
|
|
br label %exit
|
|
|
|
|
|
|
|
else:
|
|
|
|
%call = call i32 @doSomething(i32 0, i32* null)
|
|
|
|
br label %exit
|
|
|
|
|
|
|
|
exit:
|
|
|
|
%res = phi i32 [ %arg, %master], [ %call, %else ]
|
|
|
|
ret i32 %res
|
|
|
|
}
|
|
|
|
|
2015-12-02 03:49:31 +08:00
|
|
|
attributes #3 = { nounwind }
|
2016-01-07 09:23:49 +08:00
|
|
|
|
|
|
|
@irreducibleCFGa = common global i32 0, align 4
|
|
|
|
@irreducibleCFGf = common global i8 0, align 1
|
|
|
|
@irreducibleCFGb = common global i32 0, align 4
|
|
|
|
|
|
|
|
; Check that we do not run shrink-wrapping on irreducible CFGs until
|
|
|
|
; it is actually supported.
|
|
|
|
; At the moment, on those CFGs the loop information may be incorrect
|
|
|
|
; and since we use that information to do the placement, we may end up
|
|
|
|
; inserting the prologue/epilogue at incorrect places.
|
|
|
|
; PR25988.
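; For reference, the smallest irreducible shape (illustrative, simpler than
; the CFG below) is a cycle that can be entered at two distinct blocks:
;   entry -> A, entry -> B, A -> B, B -> A
; Neither A nor B dominates the other, so the cycle has no single loop
; header and cannot be described as a natural loop.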
|
|
|
|
;
|
|
|
|
; CHECK-LABEL: irreducibleCFG:
|
|
|
|
; CHECK: %entry
|
|
|
|
; Make sure the prologue happens in the entry block.
|
|
|
|
; CHECK-NEXT: pushq
|
|
|
|
; ...
|
|
|
|
; Make sure the epilogue happens in the exit block.
|
|
|
|
; CHECK-NOT: popq
|
|
|
|
; CHECK: popq
|
|
|
|
; CHECK-NEXT: popq
|
|
|
|
; CHECK-NEXT: retq
|
2018-06-05 08:27:24 +08:00
|
|
|
; Make sure we emit missed optimization remarks for this.
|
|
|
|
; REMARKS: Pass: shrink-wrap
|
|
|
|
; REMARKS-NEXT: Name: UnsupportedIrreducibleCFG
|
|
|
|
; REMARKS-NEXT: Function: irreducibleCFG
|
|
|
|
; REMARKS-NEXT: Args:
|
|
|
|
; REMARKS-NEXT: - String: Irreducible CFGs are not supported yet
|
|
|
|
|
2016-01-07 09:23:49 +08:00
|
|
|
define i32 @irreducibleCFG() #4 {
|
|
|
|
entry:
|
|
|
|
%i0 = load i32, i32* @irreducibleCFGa, align 4
|
|
|
|
%.pr = load i8, i8* @irreducibleCFGf, align 1
|
|
|
|
%bool = icmp eq i8 %.pr, 0
|
|
|
|
br i1 %bool, label %split, label %preheader
|
|
|
|
|
|
|
|
preheader:
|
|
|
|
br label %preheader
|
|
|
|
|
|
|
|
split:
|
|
|
|
%i1 = load i32, i32* @irreducibleCFGb, align 4
|
|
|
|
%tobool1.i = icmp ne i32 %i1, 0
|
|
|
|
br i1 %tobool1.i, label %for.body4.i, label %for.cond8.i.preheader
|
|
|
|
|
|
|
|
for.body4.i:
|
|
|
|
%call.i = tail call i32 (...) @something(i32 %i0)
|
|
|
|
br label %for.cond8
|
|
|
|
|
|
|
|
for.cond8:
|
|
|
|
%p1 = phi i32 [ %inc18.i, %for.inc ], [ 0, %for.body4.i ]
|
|
|
|
%.pr1.pr = load i32, i32* @irreducibleCFGb, align 4
|
|
|
|
br label %for.cond8.i.preheader
|
|
|
|
|
|
|
|
for.cond8.i.preheader:
|
|
|
|
%.pr1 = phi i32 [ %.pr1.pr, %for.cond8 ], [ %i1, %split ]
|
|
|
|
%p13 = phi i32 [ %p1, %for.cond8 ], [ 0, %split ]
|
|
|
|
br label %for.inc
|
|
|
|
|
|
|
|
fn1.exit:
|
|
|
|
ret i32 0
|
|
|
|
|
|
|
|
for.inc:
|
|
|
|
%inc18.i = add nuw nsw i32 %p13, 1
|
|
|
|
%cmp = icmp slt i32 %inc18.i, 7
|
|
|
|
br i1 %cmp, label %for.cond8, label %fn1.exit
|
|
|
|
}
|
|
|
|
|
|
|
|
attributes #4 = { "no-frame-pointer-elim"="true" }
|
2017-05-16 07:13:35 +08:00
|
|
|
|
|
|
|
@x = external global i32, align 4
|
|
|
|
@y = external global i32, align 4
|
|
|
|
|
|
|
|
; The post-dominator tree does not include the branch containing the infinite
|
|
|
|
; loop, which can result in a misplacement of the restore block if we're
|
|
|
|
; looking for the nearest common post-dominator of an "unreachable" block.
|
|
|
|
|
|
|
|
; CHECK-LABEL: infiniteLoopNoSuccessor:
|
2017-12-05 01:18:51 +08:00
|
|
|
; CHECK: ## %bb.0:
|
2017-05-16 07:13:35 +08:00
|
|
|
; Make sure the prologue happens in the entry block.
|
|
|
|
; CHECK-NEXT: pushq %rbp
|
|
|
|
; ...
|
|
|
|
; Make sure we don't shrink-wrap.
|
2017-12-05 01:18:51 +08:00
|
|
|
; CHECK: ## %bb.1
|
2017-05-16 07:13:35 +08:00
|
|
|
; CHECK-NOT: pushq %rbp
|
|
|
|
; ...
|
|
|
|
; Make sure the epilogue happens in the exit block.
|
2017-12-05 01:18:51 +08:00
|
|
|
; CHECK: ## %bb.5
|
2017-05-16 07:13:35 +08:00
|
|
|
; CHECK: popq %rbp
|
|
|
|
; CHECK-NEXT: retq
|
|
|
|
define void @infiniteLoopNoSuccessor() #5 {
|
|
|
|
%1 = load i32, i32* @x, align 4
|
|
|
|
%2 = icmp ne i32 %1, 0
|
|
|
|
br i1 %2, label %3, label %4
|
|
|
|
|
|
|
|
; <label>:3:
|
|
|
|
store i32 0, i32* @x, align 4
|
|
|
|
br label %4
|
|
|
|
|
|
|
|
; <label>:4:
|
|
|
|
call void (...) @somethingElse()
|
|
|
|
%5 = load i32, i32* @y, align 4
|
|
|
|
%6 = icmp ne i32 %5, 0
|
|
|
|
br i1 %6, label %10, label %7
|
|
|
|
|
|
|
|
; <label>:7:
|
|
|
|
%8 = call i32 (...) @something()
|
|
|
|
br label %9
|
|
|
|
|
|
|
|
; <label>:9:
|
|
|
|
call void (...) @somethingElse()
|
|
|
|
br label %9
|
|
|
|
|
|
|
|
; <label>:10:
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
declare void @somethingElse(...)
|
|
|
|
|
|
|
|
attributes #5 = { nounwind "no-frame-pointer-elim-non-leaf" }
|