; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s
%v8i8 = type { i8, i8, i8, i8, i8, i8, i8, i8 }
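
; These tests contain chains of byte loads, zexts, shifts, and 'or' reductions
; that the backend can combine into a single wide load (plus bswap/movbe for
; the big-endian cases). SLP deliberately bails out on these patterns (see
; D67841 and D78997), and the CHECK lines verify that the IR stays scalar.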
; https://bugs.llvm.org/show_bug.cgi?id=43146
define i64 @load_bswap(%v8i8* %p) {
; CHECK-LABEL: @load_bswap(
; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds [[V8I8:%.*]], %v8i8* [[P:%.*]], i64 0, i32 0
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 3
; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 4
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
; CHECK-NEXT: ret i64 [[OR01234567]]
;
%g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
%g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
%g2 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 2
%g3 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 3
%g4 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 4
%g5 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 5
%g6 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 6
%g7 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 7
%t0 = load i8, i8* %g0
%t1 = load i8, i8* %g1
%t2 = load i8, i8* %g2
%t3 = load i8, i8* %g3
%t4 = load i8, i8* %g4
%t5 = load i8, i8* %g5
%t6 = load i8, i8* %g6
%t7 = load i8, i8* %g7
%z0 = zext i8 %t0 to i64
%z1 = zext i8 %t1 to i64
%z2 = zext i8 %t2 to i64
%z3 = zext i8 %t3 to i64
%z4 = zext i8 %t4 to i64
%z5 = zext i8 %t5 to i64
%z6 = zext i8 %t6 to i64
%z7 = zext i8 %t7 to i64
%sh0 = shl nuw i64 %z0, 56
%sh1 = shl nuw nsw i64 %z1, 48
%sh2 = shl nuw nsw i64 %z2, 40
%sh3 = shl nuw nsw i64 %z3, 32
%sh4 = shl nuw nsw i64 %z4, 24
%sh5 = shl nuw nsw i64 %z5, 16
%sh6 = shl nuw nsw i64 %z6, 8
; %sh7 = shl nuw nsw i64 %z7, 0 <-- missing phantom shift
%or01 = or i64 %sh0, %sh1
%or012 = or i64 %or01, %sh2
%or0123 = or i64 %or012, %sh3
%or01234 = or i64 %or0123, %sh4
%or012345 = or i64 %or01234, %sh5
%or0123456 = or i64 %or012345, %sh6
%or01234567 = or i64 %or0123456, %z7
ret i64 %or01234567
}
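
; Same big-endian pattern as above, but with an explicit no-op 'shl ..., 0'
; on the last byte, so every lane has a shift.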
define i64 @load_bswap_nop_shift(%v8i8* %p) {
; CHECK-LABEL: @load_bswap_nop_shift(
; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds [[V8I8:%.*]], %v8i8* [[P:%.*]], i64 0, i32 0
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 3
; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 4
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
; CHECK-NEXT: [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
; CHECK-NEXT: ret i64 [[OR01234567]]
;
%g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
%g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
%g2 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 2
%g3 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 3
%g4 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 4
%g5 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 5
%g6 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 6
%g7 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 7
%t0 = load i8, i8* %g0
%t1 = load i8, i8* %g1
%t2 = load i8, i8* %g2
%t3 = load i8, i8* %g3
%t4 = load i8, i8* %g4
%t5 = load i8, i8* %g5
%t6 = load i8, i8* %g6
%t7 = load i8, i8* %g7
%z0 = zext i8 %t0 to i64
%z1 = zext i8 %t1 to i64
%z2 = zext i8 %t2 to i64
%z3 = zext i8 %t3 to i64
%z4 = zext i8 %t4 to i64
%z5 = zext i8 %t5 to i64
%z6 = zext i8 %t6 to i64
%z7 = zext i8 %t7 to i64
%sh0 = shl nuw i64 %z0, 56
%sh1 = shl nuw nsw i64 %z1, 48
%sh2 = shl nuw nsw i64 %z2, 40
%sh3 = shl nuw nsw i64 %z3, 32
%sh4 = shl nuw nsw i64 %z4, 24
%sh5 = shl nuw nsw i64 %z5, 16
%sh6 = shl nuw nsw i64 %z6, 8
%sh7 = shl nuw nsw i64 %z7, 0
%or01 = or i64 %sh0, %sh1
%or012 = or i64 %or01, %sh2
%or0123 = or i64 %or012, %sh3
%or01234 = or i64 %or0123, %sh4
%or012345 = or i64 %or01234, %sh5
%or0123456 = or i64 %or012345, %sh6
%or01234567 = or i64 %or0123456, %sh7
ret i64 %or01234567
}
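
; Little-endian byte assembly: the backend should be able to fold this chain
; into a single 8-byte load, so SLP must leave it scalar.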
; https://bugs.llvm.org/show_bug.cgi?id=42708
define i64 @load64le(i8* %arg) {
; CHECK-LABEL: @load64le(
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i8, i8* [[ARG:%.*]], i64 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 3
; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 4
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[Z0]]
; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
; CHECK-NEXT: ret i64 [[O7]]
;
%g1 = getelementptr inbounds i8, i8* %arg, i64 1
%g2 = getelementptr inbounds i8, i8* %arg, i64 2
%g3 = getelementptr inbounds i8, i8* %arg, i64 3
%g4 = getelementptr inbounds i8, i8* %arg, i64 4
%g5 = getelementptr inbounds i8, i8* %arg, i64 5
%g6 = getelementptr inbounds i8, i8* %arg, i64 6
%g7 = getelementptr inbounds i8, i8* %arg, i64 7
%ld0 = load i8, i8* %arg, align 1
%ld1 = load i8, i8* %g1, align 1
%ld2 = load i8, i8* %g2, align 1
%ld3 = load i8, i8* %g3, align 1
%ld4 = load i8, i8* %g4, align 1
%ld5 = load i8, i8* %g5, align 1
%ld6 = load i8, i8* %g6, align 1
%ld7 = load i8, i8* %g7, align 1
%z0 = zext i8 %ld0 to i64
%z1 = zext i8 %ld1 to i64
%z2 = zext i8 %ld2 to i64
%z3 = zext i8 %ld3 to i64
%z4 = zext i8 %ld4 to i64
%z5 = zext i8 %ld5 to i64
%z6 = zext i8 %ld6 to i64
%z7 = zext i8 %ld7 to i64
; %s0 = shl nuw nsw i64 %z0, 0 <-- missing phantom shift
%s1 = shl nuw nsw i64 %z1, 8
%s2 = shl nuw nsw i64 %z2, 16
%s3 = shl nuw nsw i64 %z3, 24
%s4 = shl nuw nsw i64 %z4, 32
%s5 = shl nuw nsw i64 %z5, 40
%s6 = shl nuw nsw i64 %z6, 48
%s7 = shl nuw i64 %z7, 56
%o1 = or i64 %s1, %z0
%o2 = or i64 %o1, %s2
%o3 = or i64 %o2, %s3
%o4 = or i64 %o3, %s4
%o5 = or i64 %o4, %s5
%o6 = or i64 %o5, %s6
%o7 = or i64 %o6, %s7
ret i64 %o7
}
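
; Same little-endian pattern with an explicit no-op shift by 0 on the low byte.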
define i64 @load64le_nop_shift(i8* %arg) {
; CHECK-LABEL: @load64le_nop_shift(
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i8, i8* [[ARG:%.*]], i64 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 3
; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 4
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[S0]]
; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
; CHECK-NEXT: ret i64 [[O7]]
;
%g1 = getelementptr inbounds i8, i8* %arg, i64 1
%g2 = getelementptr inbounds i8, i8* %arg, i64 2
%g3 = getelementptr inbounds i8, i8* %arg, i64 3
%g4 = getelementptr inbounds i8, i8* %arg, i64 4
%g5 = getelementptr inbounds i8, i8* %arg, i64 5
%g6 = getelementptr inbounds i8, i8* %arg, i64 6
%g7 = getelementptr inbounds i8, i8* %arg, i64 7
%ld0 = load i8, i8* %arg, align 1
%ld1 = load i8, i8* %g1, align 1
%ld2 = load i8, i8* %g2, align 1
%ld3 = load i8, i8* %g3, align 1
%ld4 = load i8, i8* %g4, align 1
%ld5 = load i8, i8* %g5, align 1
%ld6 = load i8, i8* %g6, align 1
%ld7 = load i8, i8* %g7, align 1
%z0 = zext i8 %ld0 to i64
%z1 = zext i8 %ld1 to i64
%z2 = zext i8 %ld2 to i64
%z3 = zext i8 %ld3 to i64
%z4 = zext i8 %ld4 to i64
%z5 = zext i8 %ld5 to i64
%z6 = zext i8 %ld6 to i64
%z7 = zext i8 %ld7 to i64
%s0 = shl nuw nsw i64 %z0, 0
%s1 = shl nuw nsw i64 %z1, 8
%s2 = shl nuw nsw i64 %z2, 16
%s3 = shl nuw nsw i64 %z3, 24
%s4 = shl nuw nsw i64 %z4, 32
%s5 = shl nuw nsw i64 %z5, 40
%s6 = shl nuw nsw i64 %z6, 48
%s7 = shl nuw i64 %z7, 56
%o1 = or i64 %s1, %s0
%o2 = or i64 %o1, %s2
%o3 = or i64 %o2, %s3
%o4 = or i64 %o3, %s4
%o5 = or i64 %o4, %s5
%o6 = or i64 %o5, %s6
%o7 = or i64 %o6, %s7
ret i64 %o7
}
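
; PR39538: sixteen bytes are assembled into four big-endian i32 values and
; stored. Keeping this scalar lets SDAG load-combine and form bswap/movbe
; stores (see D78997).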
define void @PR39538(i8* %t0, i32* %t1) {
; CHECK-LABEL: @PR39538(
; CHECK-NEXT: [[T6:%.*]] = getelementptr inbounds i8, i8* [[T0:%.*]], i64 1
; CHECK-NEXT: [[T11:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 2
; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 3
; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 4
; CHECK-NEXT: [[T24:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 5
; CHECK-NEXT: [[T29:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 6
; CHECK-NEXT: [[T34:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 7
; CHECK-NEXT: [[T39:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 8
; CHECK-NEXT: [[T43:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 9
; CHECK-NEXT: [[T48:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 10
; CHECK-NEXT: [[T53:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 11
; CHECK-NEXT: [[T58:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 12
; CHECK-NEXT: [[T62:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 13
; CHECK-NEXT: [[T67:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 14
; CHECK-NEXT: [[T72:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 15
; CHECK-NEXT: [[T38:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 1
; CHECK-NEXT: [[T57:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 2
; CHECK-NEXT: [[T76:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 3
; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[T0]], align 1
; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[T6]], align 1
; CHECK-NEXT: [[T12:%.*]] = load i8, i8* [[T11]], align 1
; CHECK-NEXT: [[T17:%.*]] = load i8, i8* [[T16]], align 1
; CHECK-NEXT: [[T21:%.*]] = load i8, i8* [[T20]], align 1
; CHECK-NEXT: [[T25:%.*]] = load i8, i8* [[T24]], align 1
; CHECK-NEXT: [[T30:%.*]] = load i8, i8* [[T29]], align 1
; CHECK-NEXT: [[T35:%.*]] = load i8, i8* [[T34]], align 1
; CHECK-NEXT: [[T40:%.*]] = load i8, i8* [[T39]], align 1
; CHECK-NEXT: [[T44:%.*]] = load i8, i8* [[T43]], align 1
; CHECK-NEXT: [[T49:%.*]] = load i8, i8* [[T48]], align 1
; CHECK-NEXT: [[T54:%.*]] = load i8, i8* [[T53]], align 1
; CHECK-NEXT: [[T59:%.*]] = load i8, i8* [[T58]], align 1
; CHECK-NEXT: [[T63:%.*]] = load i8, i8* [[T62]], align 1
; CHECK-NEXT: [[T68:%.*]] = load i8, i8* [[T67]], align 1
; CHECK-NEXT: [[T73:%.*]] = load i8, i8* [[T72]], align 1
; CHECK-NEXT: [[T4:%.*]] = zext i8 [[T3]] to i32
; CHECK-NEXT: [[T8:%.*]] = zext i8 [[T7]] to i32
; CHECK-NEXT: [[T13:%.*]] = zext i8 [[T12]] to i32
; CHECK-NEXT: [[T18:%.*]] = zext i8 [[T17]] to i32
; CHECK-NEXT: [[T22:%.*]] = zext i8 [[T21]] to i32
; CHECK-NEXT: [[T26:%.*]] = zext i8 [[T25]] to i32
; CHECK-NEXT: [[T31:%.*]] = zext i8 [[T30]] to i32
; CHECK-NEXT: [[T36:%.*]] = zext i8 [[T35]] to i32
; CHECK-NEXT: [[T41:%.*]] = zext i8 [[T40]] to i32
; CHECK-NEXT: [[T45:%.*]] = zext i8 [[T44]] to i32
; CHECK-NEXT: [[T50:%.*]] = zext i8 [[T49]] to i32
; CHECK-NEXT: [[T55:%.*]] = zext i8 [[T54]] to i32
; CHECK-NEXT: [[T60:%.*]] = zext i8 [[T59]] to i32
; CHECK-NEXT: [[T64:%.*]] = zext i8 [[T63]] to i32
; CHECK-NEXT: [[T69:%.*]] = zext i8 [[T68]] to i32
; CHECK-NEXT: [[T74:%.*]] = zext i8 [[T73]] to i32
; CHECK-NEXT: [[T5:%.*]] = shl nuw i32 [[T4]], 24
; CHECK-NEXT: [[T23:%.*]] = shl nuw i32 [[T22]], 24
; CHECK-NEXT: [[T42:%.*]] = shl nuw i32 [[T41]], 24
; CHECK-NEXT: [[T61:%.*]] = shl nuw i32 [[T60]], 24
; CHECK-NEXT: [[T9:%.*]] = shl nuw nsw i32 [[T8]], 16
; CHECK-NEXT: [[T27:%.*]] = shl nuw nsw i32 [[T26]], 16
; CHECK-NEXT: [[T46:%.*]] = shl nuw nsw i32 [[T45]], 16
; CHECK-NEXT: [[T65:%.*]] = shl nuw nsw i32 [[T64]], 16
; CHECK-NEXT: [[T14:%.*]] = shl nuw nsw i32 [[T13]], 8
; CHECK-NEXT: [[T32:%.*]] = shl nuw nsw i32 [[T31]], 8
; CHECK-NEXT: [[T51:%.*]] = shl nuw nsw i32 [[T50]], 8
; CHECK-NEXT: [[T70:%.*]] = shl nuw nsw i32 [[T69]], 8
; CHECK-NEXT: [[T10:%.*]] = or i32 [[T9]], [[T5]]
; CHECK-NEXT: [[T15:%.*]] = or i32 [[T10]], [[T14]]
; CHECK-NEXT: [[T19:%.*]] = or i32 [[T15]], [[T18]]
; CHECK-NEXT: [[T28:%.*]] = or i32 [[T27]], [[T23]]
; CHECK-NEXT: [[T33:%.*]] = or i32 [[T28]], [[T32]]
; CHECK-NEXT: [[T37:%.*]] = or i32 [[T33]], [[T36]]
; CHECK-NEXT: [[T47:%.*]] = or i32 [[T46]], [[T42]]
; CHECK-NEXT: [[T52:%.*]] = or i32 [[T47]], [[T51]]
; CHECK-NEXT: [[T56:%.*]] = or i32 [[T52]], [[T55]]
; CHECK-NEXT: [[T66:%.*]] = or i32 [[T65]], [[T61]]
; CHECK-NEXT: [[T71:%.*]] = or i32 [[T66]], [[T70]]
; CHECK-NEXT: [[T75:%.*]] = or i32 [[T71]], [[T74]]
; CHECK-NEXT: store i32 [[T19]], i32* [[T1]], align 4
; CHECK-NEXT: store i32 [[T37]], i32* [[T38]], align 4
; CHECK-NEXT: store i32 [[T56]], i32* [[T57]], align 4
; CHECK-NEXT: store i32 [[T75]], i32* [[T76]], align 4
; CHECK-NEXT: ret void
;
%t6 = getelementptr inbounds i8, i8* %t0, i64 1
%t11 = getelementptr inbounds i8, i8* %t0, i64 2
%t16 = getelementptr inbounds i8, i8* %t0, i64 3
%t20 = getelementptr inbounds i8, i8* %t0, i64 4
%t24 = getelementptr inbounds i8, i8* %t0, i64 5
%t29 = getelementptr inbounds i8, i8* %t0, i64 6
%t34 = getelementptr inbounds i8, i8* %t0, i64 7
%t39 = getelementptr inbounds i8, i8* %t0, i64 8
%t43 = getelementptr inbounds i8, i8* %t0, i64 9
%t48 = getelementptr inbounds i8, i8* %t0, i64 10
%t53 = getelementptr inbounds i8, i8* %t0, i64 11
%t58 = getelementptr inbounds i8, i8* %t0, i64 12
%t62 = getelementptr inbounds i8, i8* %t0, i64 13
%t67 = getelementptr inbounds i8, i8* %t0, i64 14
%t72 = getelementptr inbounds i8, i8* %t0, i64 15
%t38 = getelementptr inbounds i32, i32* %t1, i64 1
%t57 = getelementptr inbounds i32, i32* %t1, i64 2
%t76 = getelementptr inbounds i32, i32* %t1, i64 3
%t3 = load i8, i8* %t0, align 1
%t7 = load i8, i8* %t6, align 1
%t12 = load i8, i8* %t11, align 1
%t17 = load i8, i8* %t16, align 1
%t21 = load i8, i8* %t20, align 1
%t25 = load i8, i8* %t24, align 1
%t30 = load i8, i8* %t29, align 1
%t35 = load i8, i8* %t34, align 1
%t40 = load i8, i8* %t39, align 1
%t44 = load i8, i8* %t43, align 1
%t49 = load i8, i8* %t48, align 1
%t54 = load i8, i8* %t53, align 1
%t59 = load i8, i8* %t58, align 1
%t63 = load i8, i8* %t62, align 1
%t68 = load i8, i8* %t67, align 1
%t73 = load i8, i8* %t72, align 1
%t4 = zext i8 %t3 to i32
%t8 = zext i8 %t7 to i32
%t13 = zext i8 %t12 to i32
%t18 = zext i8 %t17 to i32
%t22 = zext i8 %t21 to i32
%t26 = zext i8 %t25 to i32
%t31 = zext i8 %t30 to i32
%t36 = zext i8 %t35 to i32
%t41 = zext i8 %t40 to i32
%t45 = zext i8 %t44 to i32
%t50 = zext i8 %t49 to i32
%t55 = zext i8 %t54 to i32
%t60 = zext i8 %t59 to i32
%t64 = zext i8 %t63 to i32
%t69 = zext i8 %t68 to i32
%t74 = zext i8 %t73 to i32
%t5 = shl nuw i32 %t4, 24
%t23 = shl nuw i32 %t22, 24
%t42 = shl nuw i32 %t41, 24
%t61 = shl nuw i32 %t60, 24
%t9 = shl nuw nsw i32 %t8, 16
%t27 = shl nuw nsw i32 %t26, 16
%t46 = shl nuw nsw i32 %t45, 16
%t65 = shl nuw nsw i32 %t64, 16
%t14 = shl nuw nsw i32 %t13, 8
%t32 = shl nuw nsw i32 %t31, 8
%t51 = shl nuw nsw i32 %t50, 8
%t70 = shl nuw nsw i32 %t69, 8
%t10 = or i32 %t9, %t5
%t15 = or i32 %t10, %t14
%t19 = or i32 %t15, %t18
%t28 = or i32 %t27, %t23
%t33 = or i32 %t28, %t32
%t37 = or i32 %t33, %t36
%t47 = or i32 %t46, %t42
%t52 = or i32 %t47, %t51
%t56 = or i32 %t52, %t55
%t66 = or i32 %t65, %t61
%t71 = or i32 %t66, %t70
%t75 = or i32 %t71, %t74
store i32 %t19, i32* %t1, align 4
store i32 %t37, i32* %t38, align 4
store i32 %t56, i32* %t57, align 4
store i32 %t75, i32* %t76, align 4
ret void
}
; Do not crash on constant expressions.
@g1 = external dso_local unnamed_addr constant [8 x i8], align 1
@g2 = external dso_local unnamed_addr constant [5 x i8], align 1
define void @load_combine_constant_expression(i64* %t1) {
; CHECK-LABEL: @load_combine_constant_expression(
; CHECK-NEXT: store i64 or (i64 shl (i64 zext (i32 ptrtoint ([8 x i8]* @g1 to i32) to i64), i64 32), i64 zext (i32 ptrtoint ([5 x i8]* @g2 to i32) to i64)), i64* [[T1:%.*]], align 4
; CHECK-NEXT: [[T3:%.*]] = getelementptr i64, i64* [[T1]], i64 1
; CHECK-NEXT: store i64 or (i64 shl (i64 zext (i32 ptrtoint ([8 x i8]* @g1 to i32) to i64), i64 32), i64 zext (i32 ptrtoint ([5 x i8]* @g2 to i32) to i64)), i64* [[T3]], align 4
; CHECK-NEXT: ret void
;
store i64 or (i64 shl (i64 zext (i32 ptrtoint ([8 x i8]* @g1 to i32) to i64), i64 32), i64 zext (i32 ptrtoint ([5 x i8]* @g2 to i32) to i64)), i64* %t1, align 4
%t3 = getelementptr i64, i64* %t1, i64 1
store i64 or (i64 shl (i64 zext (i32 ptrtoint ([8 x i8]* @g1 to i32) to i64), i64 32), i64 zext (i32 ptrtoint ([5 x i8]* @g2 to i32) to i64)), i64* %t3, align 4
ret void
}