; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=ANY,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefixes=ANY,SSE,SSE41

; There are at least 3 potential patterns corresponding to an unsigned saturated add: min, cmp with sum, cmp with not.
; Test each of those patterns with i8/i16/i32/i64.
; Test each of those with a constant operand and a variable operand.
; Test each of those with a 128-bit vector type.
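;
; As a quick reference, a sketch of the three idioms for a variable %x and a
; constant C (illustrative pseudo-IR, not a checked test; ~C is bitwise-not of C,
; i.e. UMAX - C). The constant tests below use C = 42, so ~C = -43:
;   using_min:        %s = select (icmp ult %x, ~C), %x, ~C  ; clamp at ~C first
;                     %r = add %s, C
;   using_cmp_sum:    %a = add %x, C
;                     %r = select (icmp ugt %x, %a), -1, %a  ; wrapped iff %x > %a
;   using_cmp_notval: %a = add %x, C
;                     %r = select (icmp ugt %x, ~C), -1, %a  ; wrapped iff %x > ~C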

define i8 @unsigned_sat_constant_i8_using_min(i8 %x) {
; ANY-LABEL: unsigned_sat_constant_i8_using_min:
; ANY:       # %bb.0:
; ANY-NEXT:    cmpb $-43, %dil
; ANY-NEXT:    movl $213, %eax
; ANY-NEXT:    cmovbl %edi, %eax
; ANY-NEXT:    addb $42, %al
; ANY-NEXT:    # kill: def $al killed $al killed $eax
; ANY-NEXT:    retq
  %c = icmp ult i8 %x, -43
  %s = select i1 %c, i8 %x, i8 -43
  %r = add i8 %s, 42
  ret i8 %r
}

define i8 @unsigned_sat_constant_i8_using_cmp_sum(i8 %x) {
; ANY-LABEL: unsigned_sat_constant_i8_using_cmp_sum:
; ANY:       # %bb.0:
; ANY-NEXT:    addb $42, %dil
; ANY-NEXT:    movzbl %dil, %ecx
; ANY-NEXT:    movl $255, %eax
; ANY-NEXT:    cmovael %ecx, %eax
; ANY-NEXT:    # kill: def $al killed $al killed $eax
; ANY-NEXT:    retq
  %a = add i8 %x, 42
  %c = icmp ugt i8 %x, %a
  %r = select i1 %c, i8 -1, i8 %a
  ret i8 %r
}

define i8 @unsigned_sat_constant_i8_using_cmp_notval(i8 %x) {
; ANY-LABEL: unsigned_sat_constant_i8_using_cmp_notval:
; ANY:       # %bb.0:
; ANY-NEXT:    addb $42, %dil
; ANY-NEXT:    movzbl %dil, %ecx
; ANY-NEXT:    movl $255, %eax
; ANY-NEXT:    cmovael %ecx, %eax
; ANY-NEXT:    # kill: def $al killed $al killed $eax
; ANY-NEXT:    retq
  %a = add i8 %x, 42
  %c = icmp ugt i8 %x, -43
  %r = select i1 %c, i8 -1, i8 %a
  ret i8 %r
}
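;
; Note: x86 cmov has no 8-bit form, so in the i8 tests above the select happens
; in a 32-bit register (cmovbl/cmovael) and %al is extracted afterwards, which
; is what the "# kill" annotations show.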

define i16 @unsigned_sat_constant_i16_using_min(i16 %x) {
; ANY-LABEL: unsigned_sat_constant_i16_using_min:
; ANY:       # %bb.0:
; ANY-NEXT:    cmpw $-43, %di
; ANY-NEXT:    movl $65493, %eax # imm = 0xFFD5
; ANY-NEXT:    cmovbl %edi, %eax
; ANY-NEXT:    addl $42, %eax
; ANY-NEXT:    # kill: def $ax killed $ax killed $eax
; ANY-NEXT:    retq
  %c = icmp ult i16 %x, -43
  %s = select i1 %c, i16 %x, i16 -43
  %r = add i16 %s, 42
  ret i16 %r
}

define i16 @unsigned_sat_constant_i16_using_cmp_sum(i16 %x) {
; ANY-LABEL: unsigned_sat_constant_i16_using_cmp_sum:
; ANY:       # %bb.0:
; ANY-NEXT:    addw $42, %di
; ANY-NEXT:    movl $65535, %eax # imm = 0xFFFF
; ANY-NEXT:    cmovael %edi, %eax
; ANY-NEXT:    # kill: def $ax killed $ax killed $eax
; ANY-NEXT:    retq
  %a = add i16 %x, 42
  %c = icmp ugt i16 %x, %a
  %r = select i1 %c, i16 -1, i16 %a
  ret i16 %r
}

define i16 @unsigned_sat_constant_i16_using_cmp_notval(i16 %x) {
; ANY-LABEL: unsigned_sat_constant_i16_using_cmp_notval:
; ANY:       # %bb.0:
; ANY-NEXT:    addw $42, %di
; ANY-NEXT:    movl $65535, %eax # imm = 0xFFFF
; ANY-NEXT:    cmovael %edi, %eax
; ANY-NEXT:    # kill: def $ax killed $ax killed $eax
; ANY-NEXT:    retq
  %a = add i16 %x, 42
  %c = icmp ugt i16 %x, -43
  %r = select i1 %c, i16 -1, i16 %a
  ret i16 %r
}

define i32 @unsigned_sat_constant_i32_using_min(i32 %x) {
; ANY-LABEL: unsigned_sat_constant_i32_using_min:
; ANY:       # %bb.0:
; ANY-NEXT:    cmpl $-43, %edi
; ANY-NEXT:    movl $-43, %eax
; ANY-NEXT:    cmovbl %edi, %eax
; ANY-NEXT:    addl $42, %eax
; ANY-NEXT:    retq
  %c = icmp ult i32 %x, -43
  %s = select i1 %c, i32 %x, i32 -43
  %r = add i32 %s, 42
  ret i32 %r
}

define i32 @unsigned_sat_constant_i32_using_cmp_sum(i32 %x) {
; ANY-LABEL: unsigned_sat_constant_i32_using_cmp_sum:
; ANY:       # %bb.0:
; ANY-NEXT:    addl $42, %edi
; ANY-NEXT:    movl $-1, %eax
; ANY-NEXT:    cmovael %edi, %eax
; ANY-NEXT:    retq
  %a = add i32 %x, 42
  %c = icmp ugt i32 %x, %a
  %r = select i1 %c, i32 -1, i32 %a
  ret i32 %r
}

define i32 @unsigned_sat_constant_i32_using_cmp_notval(i32 %x) {
; ANY-LABEL: unsigned_sat_constant_i32_using_cmp_notval:
; ANY:       # %bb.0:
; ANY-NEXT:    addl $42, %edi
; ANY-NEXT:    movl $-1, %eax
; ANY-NEXT:    cmovael %edi, %eax
; ANY-NEXT:    retq
  %a = add i32 %x, 42
  %c = icmp ugt i32 %x, -43
  %r = select i1 %c, i32 -1, i32 %a
  ret i32 %r
}

define i64 @unsigned_sat_constant_i64_using_min(i64 %x) {
; ANY-LABEL: unsigned_sat_constant_i64_using_min:
; ANY:       # %bb.0:
; ANY-NEXT:    cmpq $-43, %rdi
; ANY-NEXT:    movq $-43, %rax
; ANY-NEXT:    cmovbq %rdi, %rax
; ANY-NEXT:    addq $42, %rax
; ANY-NEXT:    retq
  %c = icmp ult i64 %x, -43
  %s = select i1 %c, i64 %x, i64 -43
  %r = add i64 %s, 42
  ret i64 %r
}

define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) {
; ANY-LABEL: unsigned_sat_constant_i64_using_cmp_sum:
; ANY:       # %bb.0:
; ANY-NEXT:    addq $42, %rdi
; ANY-NEXT:    movq $-1, %rax
; ANY-NEXT:    cmovaeq %rdi, %rax
; ANY-NEXT:    retq
  %a = add i64 %x, 42
  %c = icmp ugt i64 %x, %a
  %r = select i1 %c, i64 -1, i64 %a
  ret i64 %r
}

define i64 @unsigned_sat_constant_i64_using_cmp_notval(i64 %x) {
; ANY-LABEL: unsigned_sat_constant_i64_using_cmp_notval:
; ANY:       # %bb.0:
; ANY-NEXT:    addq $42, %rdi
; ANY-NEXT:    movq $-1, %rax
; ANY-NEXT:    cmovaeq %rdi, %rax
; ANY-NEXT:    retq
  %a = add i64 %x, 42
  %c = icmp ugt i64 %x, -43
  %r = select i1 %c, i64 -1, i64 %a
  ret i64 %r
}
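;
; The tests below repeat the same patterns with a variable operand %y in place
; of the constant, as promised in the header comment.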

define i8 @unsigned_sat_variable_i8_using_min(i8 %x, i8 %y) {
; ANY-LABEL: unsigned_sat_variable_i8_using_min:
; ANY: # %bb.0:
; ANY-NEXT: movl %esi, %eax
; ANY-NEXT: notb %al
; ANY-NEXT: cmpb %al, %dil
; ANY-NEXT: movzbl %al, %eax
; ANY-NEXT: cmovbl %edi, %eax
; ANY-NEXT: addb %sil, %al
; ANY-NEXT: # kill: def $al killed $al killed $eax
; ANY-NEXT: retq
  %noty = xor i8 %y, -1
  %c = icmp ult i8 %x, %noty
  %s = select i1 %c, i8 %x, i8 %noty
  %r = add i8 %s, %y
  ret i8 %r
}

define i8 @unsigned_sat_variable_i8_using_cmp_sum(i8 %x, i8 %y) {
; ANY-LABEL: unsigned_sat_variable_i8_using_cmp_sum:
; ANY: # %bb.0:
; ANY-NEXT: addb %sil, %dil
; ANY-NEXT: movzbl %dil, %ecx
; ANY-NEXT: movl $255, %eax
; ANY-NEXT: cmovael %ecx, %eax
; ANY-NEXT: # kill: def $al killed $al killed $eax
; ANY-NEXT: retq
  %a = add i8 %x, %y
  %c = icmp ugt i8 %x, %a
  %r = select i1 %c, i8 -1, i8 %a
  ret i8 %r
}

define i8 @unsigned_sat_variable_i8_using_cmp_notval(i8 %x, i8 %y) {
; ANY-LABEL: unsigned_sat_variable_i8_using_cmp_notval:
; ANY: # %bb.0:
; ANY-NEXT: # kill: def $esi killed $esi def $rsi
; ANY-NEXT: # kill: def $edi killed $edi def $rdi
; ANY-NEXT: leal (%rdi,%rsi), %eax
; ANY-NEXT: notb %sil
; ANY-NEXT: cmpb %sil, %dil
; ANY-NEXT: movzbl %al, %ecx
; ANY-NEXT: movl $255, %eax
; ANY-NEXT: cmovbel %ecx, %eax
; ANY-NEXT: # kill: def $al killed $al killed $eax
; ANY-NEXT: retq
  %noty = xor i8 %y, -1
  %a = add i8 %x, %y
  %c = icmp ugt i8 %x, %noty
  %r = select i1 %c, i8 -1, i8 %a
  ret i8 %r
}
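
; Note on the i8 codegen above: the "cmp with not" test x > ~y is equivalent
; to x + y wrapping (since ~y == UMAX - y), so the select produces -1 exactly
; when saturation is needed. The byte-sized selects are performed in 32-bit
; registers (movzbl / movl $255 / cmov); the '# kill' directives mark the
; truncation of $eax back to $al.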

define i16 @unsigned_sat_variable_i16_using_min(i16 %x, i16 %y) {
; ANY-LABEL: unsigned_sat_variable_i16_using_min:
; ANY: # %bb.0:
; ANY-NEXT: # kill: def $esi killed $esi def $rsi
; ANY-NEXT: movl %esi, %eax
; ANY-NEXT: notl %eax
; ANY-NEXT: cmpw %ax, %di
; ANY-NEXT: cmovbl %edi, %eax
; ANY-NEXT: leal (%rax,%rsi), %eax
; ANY-NEXT: # kill: def $ax killed $ax killed $eax
; ANY-NEXT: retq
  %noty = xor i16 %y, -1
  %c = icmp ult i16 %x, %noty
  %s = select i1 %c, i16 %x, i16 %noty
  %r = add i16 %s, %y
  ret i16 %r
}

define i16 @unsigned_sat_variable_i16_using_cmp_sum(i16 %x, i16 %y) {
; ANY-LABEL: unsigned_sat_variable_i16_using_cmp_sum:
; ANY: # %bb.0:
; ANY-NEXT: addw %si, %di
; ANY-NEXT: movl $65535, %eax # imm = 0xFFFF
; ANY-NEXT: cmovael %edi, %eax
; ANY-NEXT: # kill: def $ax killed $ax killed $eax
; ANY-NEXT: retq
  %a = add i16 %x, %y
  %c = icmp ugt i16 %x, %a
  %r = select i1 %c, i16 -1, i16 %a
  ret i16 %r
}

define i16 @unsigned_sat_variable_i16_using_cmp_notval(i16 %x, i16 %y) {
; ANY-LABEL: unsigned_sat_variable_i16_using_cmp_notval:
; ANY: # %bb.0:
; ANY-NEXT: # kill: def $esi killed $esi def $rsi
; ANY-NEXT: # kill: def $edi killed $edi def $rdi
; ANY-NEXT: leal (%rdi,%rsi), %ecx
; ANY-NEXT: notl %esi
; ANY-NEXT: cmpw %si, %di
; ANY-NEXT: movl $65535, %eax # imm = 0xFFFF
; ANY-NEXT: cmovbel %ecx, %eax
; ANY-NEXT: # kill: def $ax killed $ax killed $eax
; ANY-NEXT: retq
  %noty = xor i16 %y, -1
  %a = add i16 %x, %y
  %c = icmp ugt i16 %x, %noty
  %r = select i1 %c, i16 -1, i16 %a
  ret i16 %r
}

define i32 @unsigned_sat_variable_i32_using_min(i32 %x, i32 %y) {
; ANY-LABEL: unsigned_sat_variable_i32_using_min:
; ANY: # %bb.0:
; ANY-NEXT: # kill: def $esi killed $esi def $rsi
; ANY-NEXT: movl %esi, %eax
; ANY-NEXT: notl %eax
; ANY-NEXT: cmpl %eax, %edi
; ANY-NEXT: cmovbl %edi, %eax
; ANY-NEXT: leal (%rax,%rsi), %eax
; ANY-NEXT: retq
  %noty = xor i32 %y, -1
  %c = icmp ult i32 %x, %noty
  %s = select i1 %c, i32 %x, i32 %noty
  %r = add i32 %s, %y
  ret i32 %r
}

define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
; ANY-LABEL: unsigned_sat_variable_i32_using_cmp_sum:
; ANY: # %bb.0:
; ANY-NEXT: addl %esi, %edi
; ANY-NEXT: movl $-1, %eax
; ANY-NEXT: cmovael %edi, %eax
; ANY-NEXT: retq
  %a = add i32 %x, %y
  %c = icmp ugt i32 %x, %a
  %r = select i1 %c, i32 -1, i32 %a
  ret i32 %r
}

define i32 @unsigned_sat_variable_i32_using_cmp_notval(i32 %x, i32 %y) {
; ANY-LABEL: unsigned_sat_variable_i32_using_cmp_notval:
; ANY: # %bb.0:
; ANY-NEXT: # kill: def $esi killed $esi def $rsi
; ANY-NEXT: # kill: def $edi killed $edi def $rdi
; ANY-NEXT: leal (%rdi,%rsi), %ecx
; ANY-NEXT: notl %esi
; ANY-NEXT: cmpl %esi, %edi
; ANY-NEXT: movl $-1, %eax
; ANY-NEXT: cmovbel %ecx, %eax
; ANY-NEXT: retq
  %noty = xor i32 %y, -1
  %a = add i32 %x, %y
  %c = icmp ugt i32 %x, %noty
  %r = select i1 %c, i32 -1, i32 %a
  ret i32 %r
}

define i64 @unsigned_sat_variable_i64_using_min(i64 %x, i64 %y) {
; ANY-LABEL: unsigned_sat_variable_i64_using_min:
; ANY: # %bb.0:
; ANY-NEXT: movq %rsi, %rax
; ANY-NEXT: notq %rax
; ANY-NEXT: cmpq %rax, %rdi
; ANY-NEXT: cmovbq %rdi, %rax
; ANY-NEXT: leaq (%rax,%rsi), %rax
; ANY-NEXT: retq
  %noty = xor i64 %y, -1
  %c = icmp ult i64 %x, %noty
  %s = select i1 %c, i64 %x, i64 %noty
  %r = add i64 %s, %y
  ret i64 %r
}

define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) {
; ANY-LABEL: unsigned_sat_variable_i64_using_cmp_sum:
; ANY: # %bb.0:
; ANY-NEXT: addq %rsi, %rdi
; ANY-NEXT: movq $-1, %rax
; ANY-NEXT: cmovaeq %rdi, %rax
; ANY-NEXT: retq
  %a = add i64 %x, %y
  %c = icmp ugt i64 %x, %a
  %r = select i1 %c, i64 -1, i64 %a
  ret i64 %r
}

define i64 @unsigned_sat_variable_i64_using_cmp_notval(i64 %x, i64 %y) {
; ANY-LABEL: unsigned_sat_variable_i64_using_cmp_notval:
; ANY: # %bb.0:
; ANY-NEXT: leaq (%rdi,%rsi), %rcx
; ANY-NEXT: notq %rsi
; ANY-NEXT: cmpq %rsi, %rdi
; ANY-NEXT: movq $-1, %rax
; ANY-NEXT: cmovbeq %rcx, %rax
; ANY-NEXT: retq
  %noty = xor i64 %y, -1
  %a = add i64 %x, %y
  %c = icmp ugt i64 %x, %noty
  %r = select i1 %c, i64 -1, i64 %a
  ret i64 %r
}
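
; The 128-bit vector tests below repeat the same three patterns. For i8/i16
; elements, SSE has native unsigned saturating adds (paddusb/paddusw), so all
; three forms should collapse to one instruction. There is no i32 equivalent:
; SSE2 must emulate the unsigned min via a sign-bit flip plus
; pcmpgtd/pand/pandn/por, while SSE4.1 can use pminud. An intrinsics sketch
; of the byte case (illustrative only):
;   #include <emmintrin.h>
;   __m128i sat_add_v16i8(__m128i x) {
;     return _mm_adds_epu8(x, _mm_set1_epi8(42));   // paddusb
;   }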

define <16 x i8> @unsigned_sat_constant_v16i8_using_min(<16 x i8> %x) {
; ANY-LABEL: unsigned_sat_constant_v16i8_using_min:
; ANY: # %bb.0:
; ANY-NEXT: pminub {{.*}}(%rip), %xmm0
; ANY-NEXT: paddb {{.*}}(%rip), %xmm0
; ANY-NEXT: retq
  %c = icmp ult <16 x i8> %x, <i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43>
  %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> <i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43>
  %r = add <16 x i8> %s, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
  ret <16 x i8> %r
}

define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_sum(<16 x i8> %x) {
; ANY-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum:
; ANY: # %bb.0:
; ANY-NEXT: paddusb {{.*}}(%rip), %xmm0
; ANY-NEXT: retq
  %a = add <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
  %c = icmp ugt <16 x i8> %x, %a
  %r = select <16 x i1> %c, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %a
  ret <16 x i8> %r
}

define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_notval(<16 x i8> %x) {
; ANY-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval:
; ANY: # %bb.0:
; ANY-NEXT: paddusb {{.*}}(%rip), %xmm0
; ANY-NEXT: retq
  %a = add <16 x i8> %x, <i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42, i8 42>
  %c = icmp ugt <16 x i8> %x, <i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43, i8 -43>
  %r = select <16 x i1> %c, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %a
  ret <16 x i8> %r
}
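
; For i8 elements, the sum and notval forms above collapse to a single
; paddusb; the min form currently stays as pminub+paddb.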

define <8 x i16> @unsigned_sat_constant_v8i16_using_min(<8 x i16> %x) {
; SSE2-LABEL: unsigned_sat_constant_v8i16_using_min:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: pminsw {{.*}}(%rip), %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_constant_v8i16_using_min:
; SSE41: # %bb.0:
; SSE41-NEXT: pminuw {{.*}}(%rip), %xmm0
; SSE41-NEXT: paddw {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
  %c = icmp ult <8 x i16> %x, <i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43>
  %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> <i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43>
  %r = add <8 x i16> %s, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
  ret <8 x i16> %r
}
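
; SSE2 has no unsigned word min, so the min form above bias-flips by 0x8000
; and uses the signed pminsw; SSE4.1 can use pminuw directly.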

define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_sum(<8 x i16> %x) {
; ANY-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum:
; ANY: # %bb.0:
; ANY-NEXT: paddusw {{.*}}(%rip), %xmm0
; ANY-NEXT: retq
  %a = add <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
  %c = icmp ugt <8 x i16> %x, %a
  %r = select <8 x i1> %c, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %a
  ret <8 x i16> %r
}

define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_notval(<8 x i16> %x) {
; ANY-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval:
; ANY: # %bb.0:
; ANY-NEXT: paddusw {{.*}}(%rip), %xmm0
; ANY-NEXT: retq
  %a = add <8 x i16> %x, <i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42, i16 42>
  %c = icmp ugt <8 x i16> %x, <i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43, i16 -43>
  %r = select <8 x i1> %c, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %a
  ret <8 x i16> %r
}

define <4 x i32> @unsigned_sat_constant_v4i32_using_min(<4 x i32> %x) {
; SSE2-LABEL: unsigned_sat_constant_v4i32_using_min:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483605,2147483605,2147483605,2147483605]
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_constant_v4i32_using_min:
; SSE41: # %bb.0:
; SSE41-NEXT: pminud {{.*}}(%rip), %xmm0
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
  %c = icmp ult <4 x i32> %x, <i32 -43, i32 -43, i32 -43, i32 -43>
  %s = select <4 x i1> %c, <4 x i32> %x, <4 x i32> <i32 -43, i32 -43, i32 -43, i32 -43>
  %r = add <4 x i32> %s, <i32 42, i32 42, i32 42, i32 42>
  ret <4 x i32> %r
}
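
; Unsigned dword min only exists from SSE4.1 (pminud), so the SSE2 expansion
; above builds the select from a sign-bit-flipped pcmpgtd plus and/andn/or.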

define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_sum(<4 x i32> %x) {
; SSE2-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [42,42,42,42]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [42,42,42,42]
; SSE41-NEXT: paddd %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pminud %xmm2, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm0, %xmm1
; SSE41-NEXT: por %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
  %a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
  %c = icmp ugt <4 x i32> %x, %a
  %r = select <4 x i1> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %a
  ret <4 x i32> %r
}
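
; There is no paddusd, so the dword cmp_sum form cannot become one
; instruction; SSE4.1 instead detects overflow as x != umin(x, sum) via
; pminud+pcmpeqd and an all-ones pxor to invert the mask.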

define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval(<4 x i32> %x) {
; SSE2-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [42,42,42,42]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [42,42,42,42]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967254,4294967254,4294967254,4294967254]
; SSE41-NEXT: pmaxud %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
  %a = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
  %c = icmp ugt <4 x i32> %x, <i32 -43, i32 -43, i32 -43, i32 -43>
  %r = select <4 x i1> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %a
  ret <4 x i32> %r
}
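
; For the notval form, SSE2 performs the unsigned 'ugt' compare with
; sign-flipped operands and pcmpgtd; SSE4.1 emulates it as
; x == umax(x, threshold+1) using pmaxud+pcmpeqd.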

define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat(<4 x i32> %x) {
; SSE2-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [43,44,45,46]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE2-NEXT: pcmpgtd {{.*}}(%rip), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [43,44,45,46]
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967253,4294967252,4294967251,4294967250]
; SSE41-NEXT: pmaxud %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm2, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: retq
  %a = add <4 x i32> %x, <i32 43, i32 44, i32 45, i32 46>
  %c = icmp ugt <4 x i32> %x, <i32 -44, i32 -45, i32 -46, i32 -47>
  %r = select <4 x i1> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %a
  ret <4 x i32> %r
}
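
; Nonsplat constants should still match: each lane's threshold above is the
; lanewise ~C, e.g. ~43 == -44 through ~46 == -47.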

define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
; SSE2-LABEL: unsigned_sat_constant_v2i64_using_min:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372034707292117,9223372034707292117]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn {{.*}}(%rip), %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: paddq {{.*}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_constant_v2i64_using_min:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551573,18446744073709551573]
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372034707292117,9223372034707292117]
; SSE41-NEXT: movdqa %xmm3, %xmm4
; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: paddq {{.*}}(%rip), %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
  %c = icmp ult <2 x i64> %x, <i64 -43, i64 -43>
  %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> <i64 -43, i64 -43>
  %r = add <2 x i64> %s, <i64 42, i64 42>
  ret <2 x i64> %r
}
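
; There is no unsigned 64-bit element min or compare at these subtarget
; levels, so the v2i64 compare is synthesized from dword pcmpgtd/pcmpeqd
; results recombined with pshufd.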

define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
; ANY-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
; ANY: # %bb.0:
; ANY-NEXT: movdqa {{.*#+}} xmm1 = [42,42]
; ANY-NEXT: paddq %xmm0, %xmm1
; ANY-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; ANY-NEXT: pxor %xmm2, %xmm0
; ANY-NEXT: pxor %xmm1, %xmm2
; ANY-NEXT: movdqa %xmm0, %xmm3
; ANY-NEXT: pcmpgtd %xmm2, %xmm3
; ANY-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; ANY-NEXT: pcmpeqd %xmm0, %xmm2
; ANY-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; ANY-NEXT: pand %xmm4, %xmm2
; ANY-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
; ANY-NEXT: por %xmm1, %xmm0
; ANY-NEXT: por %xmm2, %xmm0
; ANY-NEXT: retq
  %a = add <2 x i64> %x, <i64 42, i64 42>
  %c = icmp ugt <2 x i64> %x, %a
  %r = select <2 x i1> %c, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %a
  ret <2 x i64> %r
}

define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
; ANY-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
; ANY: # %bb.0:
; ANY-NEXT: movdqa {{.*#+}} xmm1 = [42,42]
; ANY-NEXT: paddq %xmm0, %xmm1
; ANY-NEXT: pxor {{.*}}(%rip), %xmm0
; ANY-NEXT: movdqa {{.*#+}} xmm2 = [9223372034707292117,9223372034707292117]
; ANY-NEXT: movdqa %xmm0, %xmm3
; ANY-NEXT: pcmpgtd %xmm2, %xmm3
; ANY-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; ANY-NEXT: pcmpeqd %xmm2, %xmm0
; ANY-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; ANY-NEXT: pand %xmm4, %xmm2
; ANY-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
; ANY-NEXT: por %xmm1, %xmm0
; ANY-NEXT: por %xmm2, %xmm0
; ANY-NEXT: retq
  %a = add <2 x i64> %x, <i64 42, i64 42>
  %c = icmp ugt <2 x i64> %x, <i64 -43, i64 -43>
  %r = select <2 x i1> %c, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %a
  ret <2 x i64> %r
}

define <16 x i8> @unsigned_sat_variable_v16i8_using_min(<16 x i8> %x, <16 x i8> %y) {
; ANY-LABEL: unsigned_sat_variable_v16i8_using_min:
; ANY: # %bb.0:
; ANY-NEXT: pcmpeqd %xmm2, %xmm2
; ANY-NEXT: pxor %xmm1, %xmm2
; ANY-NEXT: pminub %xmm2, %xmm0
; ANY-NEXT: paddb %xmm1, %xmm0
; ANY-NEXT: retq
  %noty = xor <16 x i8> %y, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %c = icmp ult <16 x i8> %x, %noty
  %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> %noty
  %r = add <16 x i8> %s, %y
  ret <16 x i8> %r
}
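
; With a variable %y, the min form first materializes ~y (all-ones via
; pcmpeqd, then pxor) before clamping with the unsigned min.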

define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_sum(<16 x i8> %x, <16 x i8> %y) {
; ANY-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum:
; ANY: # %bb.0:
; ANY-NEXT: paddusb %xmm1, %xmm0
; ANY-NEXT: retq
  %a = add <16 x i8> %x, %y
  %c = icmp ugt <16 x i8> %x, %a
  %r = select <16 x i1> %c, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %a
  ret <16 x i8> %r
}
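
; The variable 'cmp with sum' form is the ideal case for i8: it is recognized
; as a single paddusb.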

define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_notval(<16 x i8> %x, <16 x i8> %y) {
; ANY-LABEL: unsigned_sat_variable_v16i8_using_cmp_notval:
; ANY: # %bb.0:
; ANY-NEXT: pcmpeqd %xmm2, %xmm2
; ANY-NEXT: movdqa %xmm0, %xmm3
; ANY-NEXT: paddb %xmm1, %xmm3
; ANY-NEXT: pxor %xmm2, %xmm1
; ANY-NEXT: pminub %xmm0, %xmm1
; ANY-NEXT: pcmpeqb %xmm1, %xmm0
; ANY-NEXT: pxor %xmm2, %xmm0
; ANY-NEXT: por %xmm3, %xmm0
; ANY-NEXT: retq
  %noty = xor <16 x i8> %y, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %a = add <16 x i8> %x, %y
  %c = icmp ugt <16 x i8> %x, %noty
  %r = select <16 x i1> %c, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %a
  ret <16 x i8> %r
}

define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: unsigned_sat_variable_v8i16_using_min:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767]
; SSE2-NEXT: pxor %xmm1, %xmm3
; SSE2-NEXT: pminsw %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: paddw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_variable_v8i16_using_min:
; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: pminuw %xmm2, %xmm0
; SSE41-NEXT: paddw %xmm1, %xmm0
; SSE41-NEXT: retq
  %noty = xor <8 x i16> %y, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  %c = icmp ult <8 x i16> %x, %noty
  %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> %noty
  %r = add <8 x i16> %s, %y
  ret <8 x i16> %r
}

define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i16> %y) {
; ANY-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum:
; ANY: # %bb.0:
; ANY-NEXT: paddusw %xmm1, %xmm0
; ANY-NEXT: retq
  %a = add <8 x i16> %x, %y
  %c = icmp ugt <8 x i16> %x, %a
  %r = select <8 x i1> %c, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %a
  ret <8 x i16> %r
}

define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_notval(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: unsigned_sat_variable_v8i16_using_cmp_notval:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: paddw %xmm1, %xmm2
; SSE2-NEXT: pxor {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_variable_v8i16_using_cmp_notval:
; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: paddw %xmm1, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pminuw %xmm0, %xmm1
; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: retq
  %noty = xor <8 x i16> %y, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  %a = add <8 x i16> %x, %y
  %c = icmp ugt <8 x i16> %x, %noty
  %r = select <8 x i1> %c, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %a
  ret <8 x i16> %r
}

define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: unsigned_sat_variable_v4i32_using_min:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_variable_v4i32_using_min:
; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
; SSE41-NEXT: pxor %xmm1, %xmm2
; SSE41-NEXT: pminud %xmm2, %xmm0
; SSE41-NEXT: paddd %xmm1, %xmm0
; SSE41-NEXT: retq
  %noty = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
  %c = icmp ult <4 x i32> %x, %noty
  %s = select <4 x i1> %c, <4 x i32> %x, <4 x i32> %noty
  %r = add <4 x i32> %s, %y
  ret <4 x i32> %r
}

define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
; SSE2: # %bb.0:
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
; SSE41: # %bb.0:
; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pminud %xmm1, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
; SSE41-NEXT: pxor %xmm0, %xmm2
; SSE41-NEXT: por %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
  %a = add <4 x i32> %x, %y
  %c = icmp ugt <4 x i32> %x, %a
  %r = select <4 x i1> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %a
  ret <4 x i32> %r
}
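
; As in the constant case, SSE4.1 detects dword overflow as
; x != umin(x, sum), inverting the pcmpeqd result with an all-ones pxor.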

define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_notval(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: paddd %xmm1, %xmm2
; SSE2-NEXT: pxor {{.*}}(%rip), %xmm1
; SSE2-NEXT: pxor {{.*}}(%rip), %xmm0
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval:
; SSE41: # %bb.0:
; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: paddd %xmm1, %xmm3
; SSE41-NEXT: pxor %xmm2, %xmm1
; SSE41-NEXT: pminud %xmm0, %xmm1
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: retq
  %noty = xor <4 x i32> %y, <i32 -1, i32 -1, i32 -1, i32 -1>
  %a = add <4 x i32> %x, %y
  %c = icmp ugt <4 x i32> %x, %noty
  %r = select <4 x i1> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %a
  ret <4 x i32> %r
}

define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64> %y) {
; SSE2-LABEL: unsigned_sat_variable_v2i64_using_min:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: pxor %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372034707292159,9223372034707292159]
; SSE2-NEXT: pxor %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm3, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pxor %xmm2, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pandn %xmm4, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: unsigned_sat_variable_v2i64_using_min:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: pcmpeqd %xmm3, %xmm3
; SSE41-NEXT: pxor %xmm1, %xmm3
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT: pxor %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372034707292159,9223372034707292159]
; SSE41-NEXT: pxor %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
; SSE41-NEXT: paddq %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: retq
  %noty = xor <2 x i64> %y, <i64 -1, i64 -1>
  %c = icmp ult <2 x i64> %x, %noty
  %s = select <2 x i1> %c, <2 x i64> %x, <2 x i64> %noty
  %r = add <2 x i64> %s, %y
  ret <2 x i64> %r
}

define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i64> %y) {
; ANY-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
; ANY: # %bb.0:
; ANY-NEXT: paddq %xmm0, %xmm1
; ANY-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; ANY-NEXT: pxor %xmm2, %xmm0
; ANY-NEXT: pxor %xmm1, %xmm2
; ANY-NEXT: movdqa %xmm0, %xmm3
; ANY-NEXT: pcmpgtd %xmm2, %xmm3
; ANY-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; ANY-NEXT: pcmpeqd %xmm0, %xmm2
; ANY-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; ANY-NEXT: pand %xmm4, %xmm2
; ANY-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
; ANY-NEXT: por %xmm1, %xmm0
; ANY-NEXT: por %xmm2, %xmm0
; ANY-NEXT: retq
  %a = add <2 x i64> %x, %y
  %c = icmp ugt <2 x i64> %x, %a
  %r = select <2 x i1> %c, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %a
  ret <2 x i64> %r
}

define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_notval(<2 x i64> %x, <2 x i64> %y) {
; ANY-LABEL: unsigned_sat_variable_v2i64_using_cmp_notval:
; ANY: # %bb.0:
; ANY-NEXT: movdqa %xmm0, %xmm2
; ANY-NEXT: paddq %xmm1, %xmm2
; ANY-NEXT: pxor {{.*}}(%rip), %xmm1
; ANY-NEXT: pxor {{.*}}(%rip), %xmm0
; ANY-NEXT: movdqa %xmm0, %xmm3
; ANY-NEXT: pcmpgtd %xmm1, %xmm3
; ANY-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; ANY-NEXT: pcmpeqd %xmm1, %xmm0
; ANY-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; ANY-NEXT: pand %xmm4, %xmm1
; ANY-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
; ANY-NEXT: por %xmm2, %xmm0
; ANY-NEXT: por %xmm1, %xmm0
; ANY-NEXT: retq
  %noty = xor <2 x i64> %y, <i64 -1, i64 -1>
  %a = add <2 x i64> %x, %y
  %c = icmp ugt <2 x i64> %x, %noty
  %r = select <2 x i1> %c, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %a
  ret <2 x i64> %r
}