[x86] limit vector increment fold to allow load folding
The tests are based on the example from:
https://llvm.org/PR52032

I suspect that it looks worse than it actually is. :)
That is, llvm-mca says there's no uop/timing difference with the
load folding and pcmpeq vs. broadcast on Haswell (and probably
other targets). The load-folding definitely makes the code smaller,
so it's good for that at least.

So this requires carving a narrow hole in the transform to get just
this case without changing others that look good as-is (in other
words, the transform still seems good for most examples).

Differential Revision: https://reviews.llvm.org/D112464
parent 837518d6a0
commit 285b8abce4
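For context, a minimal IR sketch of the shape in question (a hypothetical reduced function, not one of the committed tests); the comments contrast the two competing AVX lowerings that the test updates below check for:

; Hypothetical reduced example; the function name is illustrative.
define <8 x i32> @inc_from_mem(<8 x i32>* %p) {
  %v = load <8 x i32>, <8 x i32>* %p, align 4
  ; Folded to 'sub x, <-1,...>', AVX gets the free all-ones idiom but
  ; must first move the loaded operand into a register:
  ;   vpcmpeqd %ymm1, %ymm1, %ymm1
  ;   vmovdqu  (%rdi), %ymm0
  ;   vpsubd   %ymm1, %ymm0, %ymm0
  ; Left as 'add x, <1,...>', the operand load folds into vpaddd, at the
  ; cost of materializing the splat-1 constant with a broadcast load; that
  ; only pays off when the constant already has other uses (see below):
  ;   vpbroadcastd {1,1,1,1,1,1,1,1} into %ymm1
  ;   vpaddd       (%rdi), %ymm1, %ymm0
  %r = add <8 x i32> %v, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i32> %r
}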
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -880,16 +880,31 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
       continue;
     }
 
-    /// Convert vector increment or decrement to sub/add with an all-ones
-    /// constant:
-    /// add X, <1, 1...> --> sub X, <-1, -1...>
-    /// sub X, <1, 1...> --> add X, <-1, -1...>
-    /// The all-ones vector constant can be materialized using a pcmpeq
-    /// instruction that is commonly recognized as an idiom (has no register
-    /// dependency), so that's better/smaller than loading a splat 1 constant.
+    // Convert vector increment or decrement to sub/add with an all-ones
+    // constant:
+    // add X, <1, 1...> --> sub X, <-1, -1...>
+    // sub X, <1, 1...> --> add X, <-1, -1...>
+    // The all-ones vector constant can be materialized using a pcmpeq
+    // instruction that is commonly recognized as an idiom (has no register
+    // dependency), so that's better/smaller than loading a splat 1 constant.
+    //
+    // But don't do this if it would inhibit a potentially profitable load
+    // folding opportunity for the other operand. That only occurs with the
+    // intersection of:
+    // (1) The other operand (op0) is load foldable.
+    // (2) The op is an add (otherwise, we are *creating* an add and can still
+    //     load fold the other op).
+    // (3) The target has AVX (otherwise, we have a destructive add and can't
+    //     load fold the other op without killing the constant op).
+    // (4) The constant 1 vector has multiple uses (so it is profitable to load
+    //     into a register anyway).
+    auto mayPreventLoadFold = [&]() {
+      return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
+             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
+             !N->getOperand(1).hasOneUse();
+    };
     if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
-        N->getSimpleValueType(0).isVector()) {
+        N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
       APInt SplatVal;
       if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
           SplatVal.isOne()) {
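To make the four conditions concrete, this is roughly the shape that trips the new predicate (an illustrative sketch in the spirit of the PR52032 tests below, not the committed test body): op0 of each 'add' is a one-use load, the opcode is ISD::ADD, the target has AVX, and the splat-1 constant is shared by both adds, so mayPreventLoadFold() returns true and the fold is skipped.

; Illustrative sketch (hypothetical name): the splat-1 constant has two
; uses, so loading it once via broadcast pays off, and both operand
; loads can then fold into vpaddd.
define void @two_increments(<8 x i32>* %p) {
  %p1 = getelementptr <8 x i32>, <8 x i32>* %p, i64 1
  %a = load <8 x i32>, <8 x i32>* %p, align 4
  %b = load <8 x i32>, <8 x i32>* %p1, align 4
  %a1 = add <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %b1 = add <8 x i32> %b, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <8 x i32> %a1, <8 x i32>* %p, align 4
  store <8 x i32> %b1, <8 x i32>* %p1, align 4
  ret void
}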
llvm/test/CodeGen/X86/combine-sub.ll
@@ -276,6 +276,10 @@ define <4 x i32> @combine_vec_neg_xor_consts(<4 x i32> %x) {
   ret <4 x i32> %sub
 }
 
+; With AVX, this could use broadcast (an extra load) and
+; load-folded 'add', but currently we favor the virtually
+; free pcmpeq instruction.
+
 define void @PR52032_oneuse_constant(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032_oneuse_constant:
 ; SSE:       # %bb.0:
@@ -302,6 +306,9 @@ define void @PR52032_oneuse_constant(<8 x i32>* %p) {
   ret void
 }
 
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032:
 ; SSE:       # %bb.0:
@@ -322,12 +329,10 @@ define void @PR52032(<8 x i32>* %p) {
 ;
 ; AVX-LABEL: PR52032:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX-NEXT:    vmovdqu 32(%rdi), %ymm2
-; AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %ymm0, %ymm1
 ; AVX-NEXT:    vmovdqu %ymm1, (%rdi)
-; AVX-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vpaddd 32(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovdqu %ymm0, 32(%rdi)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -341,6 +346,10 @@ define void @PR52032(<8 x i32>* %p) {
   ret void
 }
 
+; Same as above, but 128-bit ops:
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032_2(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_2:
 ; SSE:       # %bb.0:
@@ -355,12 +364,10 @@ define void @PR52032_2(<4 x i32>* %p) {
 ;
 ; AVX-LABEL: PR52032_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqu (%rdi), %xmm1
-; AVX-NEXT:    vmovdqu 16(%rdi), %xmm2
-; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %xmm0, %xmm1
 ; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpaddd 16(%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
 ; AVX-NEXT:    retq
   %i3 = load <4 x i32>, <4 x i32>* %p, align 4
@@ -373,6 +380,8 @@ define void @PR52032_2(<4 x i32>* %p) {
   ret void
 }
 
+; If we are starting with a 'sub', it is always better to do the transform.
+
 define void @PR52032_3(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_3:
 ; SSE:       # %bb.0:
@@ -403,6 +412,8 @@ define void @PR52032_3(<4 x i32>* %p) {
   ret void
 }
 
+; If there's no chance of profitable load folding (because of extra uses), we convert 'add' to 'sub'.
+
 define void @PR52032_4(<4 x i32>* %p, <4 x i32>* %q) {
 ; SSE-LABEL: PR52032_4:
 ; SSE:       # %bb.0: