; llvm-project/llvm/test/Transforms/SLPVectorizer/X86/bad-reduction.ll

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s

%v8i8 = type { i8, i8, i8, i8, i8, i8, i8, i8 }

; https://bugs.llvm.org/show_bug.cgi?id=43146
define i64 @load_bswap(%v8i8* %p) {
; CHECK-LABEL: @load_bswap(
; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds [[V8I8:%.*]], %v8i8* [[P:%.*]], i64 0, i32 0
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 3
; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 4
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[Z7]]
; CHECK-NEXT: ret i64 [[OR01234567]]
;
%g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
%g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
%g2 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 2
%g3 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 3
%g4 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 4
%g5 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 5
%g6 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 6
%g7 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 7
%t0 = load i8, i8* %g0
%t1 = load i8, i8* %g1
%t2 = load i8, i8* %g2
%t3 = load i8, i8* %g3
%t4 = load i8, i8* %g4
%t5 = load i8, i8* %g5
%t6 = load i8, i8* %g6
%t7 = load i8, i8* %g7
%z0 = zext i8 %t0 to i64
%z1 = zext i8 %t1 to i64
%z2 = zext i8 %t2 to i64
%z3 = zext i8 %t3 to i64
%z4 = zext i8 %t4 to i64
%z5 = zext i8 %t5 to i64
%z6 = zext i8 %t6 to i64
%z7 = zext i8 %t7 to i64
%sh0 = shl nuw i64 %z0, 56
%sh1 = shl nuw nsw i64 %z1, 48
%sh2 = shl nuw nsw i64 %z2, 40
%sh3 = shl nuw nsw i64 %z3, 32
%sh4 = shl nuw nsw i64 %z4, 24
%sh5 = shl nuw nsw i64 %z5, 16
%sh6 = shl nuw nsw i64 %z6, 8
; %sh7 = shl nuw nsw i64 %z7, 0 <-- missing phantom shift
%or01 = or i64 %sh0, %sh1
%or012 = or i64 %or01, %sh2
%or0123 = or i64 %or012, %sh3
%or01234 = or i64 %or0123, %sh4
%or012345 = or i64 %or01234, %sh5
%or0123456 = or i64 %or012345, %sh6
%or01234567 = or i64 %or0123456, %z7
ret i64 %or01234567
}
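
; Same as above, but with the otherwise-redundant shl-by-0 spelled out, so every
; byte is combined through an explicit shift before the final chain of 'or's.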
define i64 @load_bswap_nop_shift(%v8i8* %p) {
; CHECK-LABEL: @load_bswap_nop_shift(
; CHECK-NEXT: [[G0:%.*]] = getelementptr inbounds [[V8I8:%.*]], %v8i8* [[P:%.*]], i64 0, i32 0
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 3
; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 4
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds [[V8I8]], %v8i8* [[P]], i64 0, i32 7
; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[G0]]
; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[G1]]
; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[G2]]
; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[G3]]
; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[G4]]
; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[G5]]
; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[G6]]
; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[G7]]
; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[T0]] to i64
; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[T1]] to i64
; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[T2]] to i64
; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[T3]] to i64
; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[T4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[T5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[T6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[T7]] to i64
; CHECK-NEXT: [[SH0:%.*]] = shl nuw i64 [[Z0]], 56
; CHECK-NEXT: [[SH1:%.*]] = shl nuw nsw i64 [[Z1]], 48
; CHECK-NEXT: [[SH2:%.*]] = shl nuw nsw i64 [[Z2]], 40
; CHECK-NEXT: [[SH3:%.*]] = shl nuw nsw i64 [[Z3]], 32
; CHECK-NEXT: [[SH4:%.*]] = shl nuw nsw i64 [[Z4]], 24
; CHECK-NEXT: [[SH5:%.*]] = shl nuw nsw i64 [[Z5]], 16
; CHECK-NEXT: [[SH6:%.*]] = shl nuw nsw i64 [[Z6]], 8
; CHECK-NEXT: [[SH7:%.*]] = shl nuw nsw i64 [[Z7]], 0
; CHECK-NEXT: [[OR01:%.*]] = or i64 [[SH0]], [[SH1]]
; CHECK-NEXT: [[OR012:%.*]] = or i64 [[OR01]], [[SH2]]
; CHECK-NEXT: [[OR0123:%.*]] = or i64 [[OR012]], [[SH3]]
; CHECK-NEXT: [[OR01234:%.*]] = or i64 [[OR0123]], [[SH4]]
; CHECK-NEXT: [[OR012345:%.*]] = or i64 [[OR01234]], [[SH5]]
; CHECK-NEXT: [[OR0123456:%.*]] = or i64 [[OR012345]], [[SH6]]
; CHECK-NEXT: [[OR01234567:%.*]] = or i64 [[OR0123456]], [[SH7]]
; CHECK-NEXT: ret i64 [[OR01234567]]
;
%g0 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 0
%g1 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 1
%g2 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 2
%g3 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 3
%g4 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 4
%g5 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 5
%g6 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 6
%g7 = getelementptr inbounds %v8i8, %v8i8* %p, i64 0, i32 7
%t0 = load i8, i8* %g0
%t1 = load i8, i8* %g1
%t2 = load i8, i8* %g2
%t3 = load i8, i8* %g3
%t4 = load i8, i8* %g4
%t5 = load i8, i8* %g5
%t6 = load i8, i8* %g6
%t7 = load i8, i8* %g7
%z0 = zext i8 %t0 to i64
%z1 = zext i8 %t1 to i64
%z2 = zext i8 %t2 to i64
%z3 = zext i8 %t3 to i64
%z4 = zext i8 %t4 to i64
%z5 = zext i8 %t5 to i64
%z6 = zext i8 %t6 to i64
%z7 = zext i8 %t7 to i64
%sh0 = shl nuw i64 %z0, 56
%sh1 = shl nuw nsw i64 %z1, 48
%sh2 = shl nuw nsw i64 %z2, 40
%sh3 = shl nuw nsw i64 %z3, 32
%sh4 = shl nuw nsw i64 %z4, 24
%sh5 = shl nuw nsw i64 %z5, 16
%sh6 = shl nuw nsw i64 %z6, 8
%sh7 = shl nuw nsw i64 %z7, 0
%or01 = or i64 %sh0, %sh1
%or012 = or i64 %or01, %sh2
%or0123 = or i64 %or012, %sh3
%or01234 = or i64 %or0123, %sh4
%or012345 = or i64 %or01234, %sh5
%or0123456 = or i64 %or012345, %sh6
%or01234567 = or i64 %or0123456, %sh7
ret i64 %or01234567
}
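
; The same 8 bytes combined in little-endian order; the backend can turn this
; reduction into a plain 8-byte load, so SLP should leave the scalar code alone.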
; https://bugs.llvm.org/show_bug.cgi?id=42708
define i64 @load64le(i8* %arg) {
; CHECK-LABEL: @load64le(
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i8, i8* [[ARG:%.*]], i64 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 3
; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 4
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[Z0]]
; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
; CHECK-NEXT: ret i64 [[O7]]
;
%g1 = getelementptr inbounds i8, i8* %arg, i64 1
%g2 = getelementptr inbounds i8, i8* %arg, i64 2
%g3 = getelementptr inbounds i8, i8* %arg, i64 3
%g4 = getelementptr inbounds i8, i8* %arg, i64 4
%g5 = getelementptr inbounds i8, i8* %arg, i64 5
%g6 = getelementptr inbounds i8, i8* %arg, i64 6
%g7 = getelementptr inbounds i8, i8* %arg, i64 7
%ld0 = load i8, i8* %arg, align 1
%ld1 = load i8, i8* %g1, align 1
%ld2 = load i8, i8* %g2, align 1
%ld3 = load i8, i8* %g3, align 1
%ld4 = load i8, i8* %g4, align 1
%ld5 = load i8, i8* %g5, align 1
%ld6 = load i8, i8* %g6, align 1
%ld7 = load i8, i8* %g7, align 1
%z0 = zext i8 %ld0 to i64
%z1 = zext i8 %ld1 to i64
%z2 = zext i8 %ld2 to i64
%z3 = zext i8 %ld3 to i64
%z4 = zext i8 %ld4 to i64
%z5 = zext i8 %ld5 to i64
%z6 = zext i8 %ld6 to i64
%z7 = zext i8 %ld7 to i64
; %s0 = shl nuw nsw i64 %z0, 0 <-- missing phantom shift
%s1 = shl nuw nsw i64 %z1, 8
%s2 = shl nuw nsw i64 %z2, 16
%s3 = shl nuw nsw i64 %z3, 24
%s4 = shl nuw nsw i64 %z4, 32
%s5 = shl nuw nsw i64 %z5, 40
%s6 = shl nuw nsw i64 %z6, 48
%s7 = shl nuw i64 %z7, 56
%o1 = or i64 %s1, %z0
%o2 = or i64 %o1, %s2
%o3 = or i64 %o2, %s3
%o4 = or i64 %o3, %s4
%o5 = or i64 %o4, %s5
%o6 = or i64 %o5, %s6
%o7 = or i64 %o6, %s7
ret i64 %o7
}
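
; Little-endian variant with the explicit shl-by-0 on the low byte.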
define i64 @load64le_nop_shift(i8* %arg) {
; CHECK-LABEL: @load64le_nop_shift(
; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds i8, i8* [[ARG:%.*]], i64 1
; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 2
; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 3
; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 4
; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 5
; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 6
; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds i8, i8* [[ARG]], i64 7
; CHECK-NEXT: [[LD0:%.*]] = load i8, i8* [[ARG]], align 1
; CHECK-NEXT: [[LD1:%.*]] = load i8, i8* [[G1]], align 1
; CHECK-NEXT: [[LD2:%.*]] = load i8, i8* [[G2]], align 1
; CHECK-NEXT: [[LD3:%.*]] = load i8, i8* [[G3]], align 1
; CHECK-NEXT: [[LD4:%.*]] = load i8, i8* [[G4]], align 1
; CHECK-NEXT: [[LD5:%.*]] = load i8, i8* [[G5]], align 1
; CHECK-NEXT: [[LD6:%.*]] = load i8, i8* [[G6]], align 1
; CHECK-NEXT: [[LD7:%.*]] = load i8, i8* [[G7]], align 1
; CHECK-NEXT: [[Z0:%.*]] = zext i8 [[LD0]] to i64
; CHECK-NEXT: [[Z1:%.*]] = zext i8 [[LD1]] to i64
; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[LD2]] to i64
; CHECK-NEXT: [[Z3:%.*]] = zext i8 [[LD3]] to i64
; CHECK-NEXT: [[Z4:%.*]] = zext i8 [[LD4]] to i64
; CHECK-NEXT: [[Z5:%.*]] = zext i8 [[LD5]] to i64
; CHECK-NEXT: [[Z6:%.*]] = zext i8 [[LD6]] to i64
; CHECK-NEXT: [[Z7:%.*]] = zext i8 [[LD7]] to i64
; CHECK-NEXT: [[S0:%.*]] = shl nuw nsw i64 [[Z0]], 0
; CHECK-NEXT: [[S1:%.*]] = shl nuw nsw i64 [[Z1]], 8
; CHECK-NEXT: [[S2:%.*]] = shl nuw nsw i64 [[Z2]], 16
; CHECK-NEXT: [[S3:%.*]] = shl nuw nsw i64 [[Z3]], 24
; CHECK-NEXT: [[S4:%.*]] = shl nuw nsw i64 [[Z4]], 32
; CHECK-NEXT: [[S5:%.*]] = shl nuw nsw i64 [[Z5]], 40
; CHECK-NEXT: [[S6:%.*]] = shl nuw nsw i64 [[Z6]], 48
; CHECK-NEXT: [[S7:%.*]] = shl nuw i64 [[Z7]], 56
; CHECK-NEXT: [[O1:%.*]] = or i64 [[S1]], [[S0]]
; CHECK-NEXT: [[O2:%.*]] = or i64 [[O1]], [[S2]]
; CHECK-NEXT: [[O3:%.*]] = or i64 [[O2]], [[S3]]
; CHECK-NEXT: [[O4:%.*]] = or i64 [[O3]], [[S4]]
; CHECK-NEXT: [[O5:%.*]] = or i64 [[O4]], [[S5]]
; CHECK-NEXT: [[O6:%.*]] = or i64 [[O5]], [[S6]]
; CHECK-NEXT: [[O7:%.*]] = or i64 [[O6]], [[S7]]
; CHECK-NEXT: ret i64 [[O7]]
;
%g1 = getelementptr inbounds i8, i8* %arg, i64 1
%g2 = getelementptr inbounds i8, i8* %arg, i64 2
%g3 = getelementptr inbounds i8, i8* %arg, i64 3
%g4 = getelementptr inbounds i8, i8* %arg, i64 4
%g5 = getelementptr inbounds i8, i8* %arg, i64 5
%g6 = getelementptr inbounds i8, i8* %arg, i64 6
%g7 = getelementptr inbounds i8, i8* %arg, i64 7
%ld0 = load i8, i8* %arg, align 1
%ld1 = load i8, i8* %g1, align 1
%ld2 = load i8, i8* %g2, align 1
%ld3 = load i8, i8* %g3, align 1
%ld4 = load i8, i8* %g4, align 1
%ld5 = load i8, i8* %g5, align 1
%ld6 = load i8, i8* %g6, align 1
%ld7 = load i8, i8* %g7, align 1
%z0 = zext i8 %ld0 to i64
%z1 = zext i8 %ld1 to i64
%z2 = zext i8 %ld2 to i64
%z3 = zext i8 %ld3 to i64
%z4 = zext i8 %ld4 to i64
%z5 = zext i8 %ld5 to i64
%z6 = zext i8 %ld6 to i64
%z7 = zext i8 %ld7 to i64
%s0 = shl nuw nsw i64 %z0, 0
%s1 = shl nuw nsw i64 %z1, 8
%s2 = shl nuw nsw i64 %z2, 16
%s3 = shl nuw nsw i64 %z3, 24
%s4 = shl nuw nsw i64 %z4, 32
%s5 = shl nuw nsw i64 %z5, 40
%s6 = shl nuw nsw i64 %z6, 48
%s7 = shl nuw i64 %z7, 56
%o1 = or i64 %s1, %s0
%o2 = or i64 %o1, %s2
%o3 = or i64 %o2, %s3
%o4 = or i64 %o3, %s4
%o5 = or i64 %o4, %s5
%o6 = or i64 %o5, %s6
%o7 = or i64 %o6, %s7
ret i64 %o7
}
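
; From https://bugs.llvm.org/show_bug.cgi?id=39538: four big-endian (bswap-style)
; 32-bit values assembled from 16 consecutive bytes and stored; vectorizing the
; 'or' reductions here would block the backend's load combining.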
define void @PR39538(i8* %t0, i32* %t1) {
; CHECK-LABEL: @PR39538(
; CHECK-NEXT: [[T6:%.*]] = getelementptr inbounds i8, i8* [[T0:%.*]], i64 1
; CHECK-NEXT: [[T11:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 2
; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 3
; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 4
; CHECK-NEXT: [[T24:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 5
; CHECK-NEXT: [[T29:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 6
; CHECK-NEXT: [[T34:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 7
; CHECK-NEXT: [[T39:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 8
; CHECK-NEXT: [[T43:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 9
; CHECK-NEXT: [[T48:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 10
; CHECK-NEXT: [[T53:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 11
; CHECK-NEXT: [[T58:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 12
; CHECK-NEXT: [[T62:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 13
; CHECK-NEXT: [[T67:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 14
; CHECK-NEXT: [[T72:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 15
; CHECK-NEXT: [[T38:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 1
; CHECK-NEXT: [[T57:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 2
; CHECK-NEXT: [[T76:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 3
; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[T0]], align 1
; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[T6]], align 1
; CHECK-NEXT: [[T12:%.*]] = load i8, i8* [[T11]], align 1
; CHECK-NEXT: [[T17:%.*]] = load i8, i8* [[T16]], align 1
; CHECK-NEXT: [[T21:%.*]] = load i8, i8* [[T20]], align 1
; CHECK-NEXT: [[T25:%.*]] = load i8, i8* [[T24]], align 1
; CHECK-NEXT: [[T30:%.*]] = load i8, i8* [[T29]], align 1
; CHECK-NEXT: [[T35:%.*]] = load i8, i8* [[T34]], align 1
; CHECK-NEXT: [[T40:%.*]] = load i8, i8* [[T39]], align 1
; CHECK-NEXT: [[T44:%.*]] = load i8, i8* [[T43]], align 1
; CHECK-NEXT: [[T49:%.*]] = load i8, i8* [[T48]], align 1
; CHECK-NEXT: [[T54:%.*]] = load i8, i8* [[T53]], align 1
; CHECK-NEXT: [[T59:%.*]] = load i8, i8* [[T58]], align 1
; CHECK-NEXT: [[T63:%.*]] = load i8, i8* [[T62]], align 1
; CHECK-NEXT: [[T68:%.*]] = load i8, i8* [[T67]], align 1
; CHECK-NEXT: [[T73:%.*]] = load i8, i8* [[T72]], align 1
; CHECK-NEXT: [[T4:%.*]] = zext i8 [[T3]] to i32
; CHECK-NEXT: [[T8:%.*]] = zext i8 [[T7]] to i32
; CHECK-NEXT: [[T13:%.*]] = zext i8 [[T12]] to i32
; CHECK-NEXT: [[T18:%.*]] = zext i8 [[T17]] to i32
; CHECK-NEXT: [[T22:%.*]] = zext i8 [[T21]] to i32
; CHECK-NEXT: [[T26:%.*]] = zext i8 [[T25]] to i32
; CHECK-NEXT: [[T31:%.*]] = zext i8 [[T30]] to i32
; CHECK-NEXT: [[T36:%.*]] = zext i8 [[T35]] to i32
; CHECK-NEXT: [[T41:%.*]] = zext i8 [[T40]] to i32
; CHECK-NEXT: [[T45:%.*]] = zext i8 [[T44]] to i32
; CHECK-NEXT: [[T50:%.*]] = zext i8 [[T49]] to i32
; CHECK-NEXT: [[T55:%.*]] = zext i8 [[T54]] to i32
; CHECK-NEXT: [[T60:%.*]] = zext i8 [[T59]] to i32
; CHECK-NEXT: [[T64:%.*]] = zext i8 [[T63]] to i32
; CHECK-NEXT: [[T69:%.*]] = zext i8 [[T68]] to i32
; CHECK-NEXT: [[T74:%.*]] = zext i8 [[T73]] to i32
; CHECK-NEXT: [[T5:%.*]] = shl nuw i32 [[T4]], 24
; CHECK-NEXT: [[T23:%.*]] = shl nuw i32 [[T22]], 24
; CHECK-NEXT: [[T42:%.*]] = shl nuw i32 [[T41]], 24
; CHECK-NEXT: [[T61:%.*]] = shl nuw i32 [[T60]], 24
; CHECK-NEXT: [[T9:%.*]] = shl nuw nsw i32 [[T8]], 16
; CHECK-NEXT: [[T27:%.*]] = shl nuw nsw i32 [[T26]], 16
; CHECK-NEXT: [[T46:%.*]] = shl nuw nsw i32 [[T45]], 16
; CHECK-NEXT: [[T65:%.*]] = shl nuw nsw i32 [[T64]], 16
; CHECK-NEXT: [[T14:%.*]] = shl nuw nsw i32 [[T13]], 8
; CHECK-NEXT: [[T32:%.*]] = shl nuw nsw i32 [[T31]], 8
; CHECK-NEXT: [[T51:%.*]] = shl nuw nsw i32 [[T50]], 8
; CHECK-NEXT: [[T70:%.*]] = shl nuw nsw i32 [[T69]], 8
; CHECK-NEXT: [[T10:%.*]] = or i32 [[T9]], [[T5]]
; CHECK-NEXT: [[T15:%.*]] = or i32 [[T10]], [[T14]]
; CHECK-NEXT: [[T19:%.*]] = or i32 [[T15]], [[T18]]
; CHECK-NEXT: [[T28:%.*]] = or i32 [[T27]], [[T23]]
; CHECK-NEXT: [[T33:%.*]] = or i32 [[T28]], [[T32]]
; CHECK-NEXT: [[T37:%.*]] = or i32 [[T33]], [[T36]]
; CHECK-NEXT: [[T47:%.*]] = or i32 [[T46]], [[T42]]
; CHECK-NEXT: [[T52:%.*]] = or i32 [[T47]], [[T51]]
; CHECK-NEXT: [[T56:%.*]] = or i32 [[T52]], [[T55]]
; CHECK-NEXT: [[T66:%.*]] = or i32 [[T65]], [[T61]]
; CHECK-NEXT: [[T71:%.*]] = or i32 [[T66]], [[T70]]
; CHECK-NEXT: [[T75:%.*]] = or i32 [[T71]], [[T74]]
; CHECK-NEXT: store i32 [[T19]], i32* [[T1]], align 4
; CHECK-NEXT: store i32 [[T37]], i32* [[T38]], align 4
; CHECK-NEXT: store i32 [[T56]], i32* [[T57]], align 4
; CHECK-NEXT: store i32 [[T75]], i32* [[T76]], align 4
; CHECK-NEXT: ret void
;
%t6 = getelementptr inbounds i8, i8* %t0, i64 1
%t11 = getelementptr inbounds i8, i8* %t0, i64 2
%t16 = getelementptr inbounds i8, i8* %t0, i64 3
%t20 = getelementptr inbounds i8, i8* %t0, i64 4
%t24 = getelementptr inbounds i8, i8* %t0, i64 5
%t29 = getelementptr inbounds i8, i8* %t0, i64 6
%t34 = getelementptr inbounds i8, i8* %t0, i64 7
%t39 = getelementptr inbounds i8, i8* %t0, i64 8
%t43 = getelementptr inbounds i8, i8* %t0, i64 9
%t48 = getelementptr inbounds i8, i8* %t0, i64 10
%t53 = getelementptr inbounds i8, i8* %t0, i64 11
%t58 = getelementptr inbounds i8, i8* %t0, i64 12
%t62 = getelementptr inbounds i8, i8* %t0, i64 13
%t67 = getelementptr inbounds i8, i8* %t0, i64 14
%t72 = getelementptr inbounds i8, i8* %t0, i64 15
%t38 = getelementptr inbounds i32, i32* %t1, i64 1
%t57 = getelementptr inbounds i32, i32* %t1, i64 2
%t76 = getelementptr inbounds i32, i32* %t1, i64 3
%t3 = load i8, i8* %t0, align 1
%t7 = load i8, i8* %t6, align 1
%t12 = load i8, i8* %t11, align 1
%t17 = load i8, i8* %t16, align 1
%t21 = load i8, i8* %t20, align 1
%t25 = load i8, i8* %t24, align 1
%t30 = load i8, i8* %t29, align 1
%t35 = load i8, i8* %t34, align 1
%t40 = load i8, i8* %t39, align 1
%t44 = load i8, i8* %t43, align 1
%t49 = load i8, i8* %t48, align 1
%t54 = load i8, i8* %t53, align 1
%t59 = load i8, i8* %t58, align 1
%t63 = load i8, i8* %t62, align 1
%t68 = load i8, i8* %t67, align 1
%t73 = load i8, i8* %t72, align 1
%t4 = zext i8 %t3 to i32
%t8 = zext i8 %t7 to i32
%t13 = zext i8 %t12 to i32
%t18 = zext i8 %t17 to i32
%t22 = zext i8 %t21 to i32
%t26 = zext i8 %t25 to i32
%t31 = zext i8 %t30 to i32
%t36 = zext i8 %t35 to i32
%t41 = zext i8 %t40 to i32
%t45 = zext i8 %t44 to i32
%t50 = zext i8 %t49 to i32
%t55 = zext i8 %t54 to i32
%t60 = zext i8 %t59 to i32
%t64 = zext i8 %t63 to i32
%t69 = zext i8 %t68 to i32
%t74 = zext i8 %t73 to i32
%t5 = shl nuw i32 %t4, 24
%t23 = shl nuw i32 %t22, 24
%t42 = shl nuw i32 %t41, 24
%t61 = shl nuw i32 %t60, 24
%t9 = shl nuw nsw i32 %t8, 16
%t27 = shl nuw nsw i32 %t26, 16
%t46 = shl nuw nsw i32 %t45, 16
%t65 = shl nuw nsw i32 %t64, 16
%t14 = shl nuw nsw i32 %t13, 8
%t32 = shl nuw nsw i32 %t31, 8
%t51 = shl nuw nsw i32 %t50, 8
%t70 = shl nuw nsw i32 %t69, 8
%t10 = or i32 %t9, %t5
%t15 = or i32 %t10, %t14
%t19 = or i32 %t15, %t18
%t28 = or i32 %t27, %t23
%t33 = or i32 %t28, %t32
%t37 = or i32 %t33, %t36
%t47 = or i32 %t46, %t42
%t52 = or i32 %t47, %t51
%t56 = or i32 %t52, %t55
%t66 = or i32 %t65, %t61
%t71 = or i32 %t66, %t70
%t75 = or i32 %t71, %t74
store i32 %t19, i32* %t1, align 4
store i32 %t37, i32* %t38, align 4
store i32 %t56, i32* %t57, align 4
store i32 %t75, i32* %t76, align 4
ret void
}

; Do not crash on constant expressions.
@g1 = external dso_local unnamed_addr constant [8 x i8], align 1
@g2 = external dso_local unnamed_addr constant [5 x i8], align 1
define void @load_combine_constant_expression(i64* %t1) {
; CHECK-LABEL: @load_combine_constant_expression(
; CHECK-NEXT: store i64 or (i64 shl (i64 zext (i32 ptrtoint ([8 x i8]* @g1 to i32) to i64), i64 32), i64 zext (i32 ptrtoint ([5 x i8]* @g2 to i32) to i64)), i64* [[T1:%.*]], align 4
; CHECK-NEXT: [[T3:%.*]] = getelementptr i64, i64* [[T1]], i64 1
; CHECK-NEXT: store i64 or (i64 shl (i64 zext (i32 ptrtoint ([8 x i8]* @g1 to i32) to i64), i64 32), i64 zext (i32 ptrtoint ([5 x i8]* @g2 to i32) to i64)), i64* [[T3]], align 4
; CHECK-NEXT: ret void
;
store i64 or (i64 shl (i64 zext (i32 ptrtoint ([8 x i8]* @g1 to i32) to i64), i64 32), i64 zext (i32 ptrtoint ([5 x i8]* @g2 to i32) to i64)), i64* %t1, align 4
%t3 = getelementptr i64, i64* %t1, i64 1
store i64 or (i64 shl (i64 zext (i32 ptrtoint ([8 x i8]* @g1 to i32) to i64), i64 32), i64 zext (i32 ptrtoint ([5 x i8]* @g2 to i32) to i64)), i64* %t3, align 4
ret void
}