forked from OSchip/llvm-project
[x86] replace div/rem with shift/mask for better shuffle combining perf
We know that shuffle masks are power-of-2 sizes, but there's no way (?) for LLVM to know that, so hack combineX86ShufflesRecursively() to be much faster by replacing div/rem with shift/mask. This makes the motivating compile-time test case from PR32037 ( https://bugs.llvm.org/show_bug.cgi?id=32037 ) compile about 9% faster overall. Differential Revision: https://reviews.llvm.org/D34174 llvm-svn: 305398
This commit is contained in:
parent
4a911c867f
commit
ce0b99563a
|
@ -27970,28 +27970,45 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
|
||||||
OpMask.size() % RootMask.size() == 0) ||
|
OpMask.size() % RootMask.size() == 0) ||
|
||||||
OpMask.size() == RootMask.size()) &&
|
OpMask.size() == RootMask.size()) &&
|
||||||
"The smaller number of elements must divide the larger.");
|
"The smaller number of elements must divide the larger.");
|
||||||
int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
|
|
||||||
int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
|
// This function can be performance-critical, so we rely on the power-of-2
|
||||||
int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
|
// knowledge that we have about the mask sizes to replace div/rem ops with
|
||||||
assert(((RootRatio == 1 && OpRatio == 1) ||
|
// bit-masks and shifts.
|
||||||
(RootRatio == 1) != (OpRatio == 1)) &&
|
assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
|
||||||
|
assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
|
||||||
|
unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
|
||||||
|
unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
|
||||||
|
|
||||||
|
unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
|
||||||
|
unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
|
||||||
|
unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
|
||||||
|
assert((RootRatio == 1 || OpRatio == 1) &&
|
||||||
"Must not have a ratio for both incoming and op masks!");
|
"Must not have a ratio for both incoming and op masks!");
|
||||||
|
|
||||||
SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
|
assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
|
||||||
|
assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
|
||||||
|
assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
|
||||||
|
unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
|
||||||
|
unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
|
||||||
|
|
||||||
|
SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
|
||||||
|
|
||||||
// Merge this shuffle operation's mask into our accumulated mask. Note that
|
// Merge this shuffle operation's mask into our accumulated mask. Note that
|
||||||
// this shuffle's mask will be the first applied to the input, followed by the
|
// this shuffle's mask will be the first applied to the input, followed by the
|
||||||
// root mask to get us all the way to the root value arrangement. The reason
|
// root mask to get us all the way to the root value arrangement. The reason
|
||||||
// for this order is that we are recursing up the operation chain.
|
// for this order is that we are recursing up the operation chain.
|
||||||
for (int i = 0; i < MaskWidth; ++i) {
|
for (unsigned i = 0; i < MaskWidth; ++i) {
|
||||||
int RootIdx = i / RootRatio;
|
unsigned RootIdx = i >> RootRatioLog2;
|
||||||
if (RootMask[RootIdx] < 0) {
|
if (RootMask[RootIdx] < 0) {
|
||||||
// This is a zero or undef lane, we're done.
|
// This is a zero or undef lane, we're done.
|
||||||
Mask[i] = RootMask[RootIdx];
|
Mask[i] = RootMask[RootIdx];
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
|
// TODO: Here and below, we could convert multiply to shift-left for
|
||||||
|
// performance because we know that our mask sizes are power-of-2.
|
||||||
|
unsigned RootMaskedIdx =
|
||||||
|
RootMask[RootIdx] * RootRatio + (i & (RootRatio - 1));
|
||||||
|
|
||||||
// Just insert the scaled root mask value if it references an input other
|
// Just insert the scaled root mask value if it references an input other
|
||||||
// than the SrcOp we're currently inserting.
|
// than the SrcOp we're currently inserting.
|
||||||
|
@ -28001,9 +28018,9 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
RootMaskedIdx %= MaskWidth;
|
RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
|
||||||
|
|
||||||
int OpIdx = RootMaskedIdx / OpRatio;
|
unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
|
||||||
if (OpMask[OpIdx] < 0) {
|
if (OpMask[OpIdx] < 0) {
|
||||||
// The incoming lanes are zero or undef, it doesn't matter which ones we
|
// The incoming lanes are zero or undef, it doesn't matter which ones we
|
||||||
// are using.
|
// are using.
|
||||||
|
@ -28012,8 +28029,9 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
|
// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
|
||||||
int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
|
unsigned OpMaskedIdx =
|
||||||
OpMaskedIdx %= MaskWidth;
|
OpMask[OpIdx] * OpRatio + (RootMaskedIdx & (OpRatio - 1));
|
||||||
|
OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
|
||||||
|
|
||||||
if (OpMask[OpIdx] < (int)OpMask.size()) {
|
if (OpMask[OpIdx] < (int)OpMask.size()) {
|
||||||
assert(0 <= InputIdx0 && "Unknown target shuffle input");
|
assert(0 <= InputIdx0 && "Unknown target shuffle input");
|
||||||
|
|
Loading…
Reference in New Issue