From d5387ec2679b5449b81449c5ab3cb0e2fd754010 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Thu, 11 Feb 2021 16:42:35 +0000
Subject: [PATCH] [LV] Add tests showing suboptimal vectorization for narrow types.

This patch adds additional test cases showing missing/sub-optimal
vectorization for loops which contain small and wider memory ops on
AArch64.
---
 ...rization-factor-for-unprofitable-memops.ll | 124 ++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
new file mode 100644
index 000000000000..3aef7a456bcf
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll
@@ -0,0 +1,124 @@
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s
+
+; Test cases for extending the vectorization factor, if small memory operations
+; are not profitable.
+
+; Test with a loop that contains memory accesses of i8 and i32 types. The
+; default maximum VF for NEON is 4, but vectorizing 4 x i8 is not
+; profitable. But we can extend the VF to 8 or 16, at which point the
+; i8 memory accesses become profitable.
+define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
+; CHECK-LABEL: @test_load_i8_store_i32(
+; CHECK-NOT: x i8>
+; Each iteration stores zext(src[iv]) + off to dst[iv]; no i8 vector type may be printed.
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv
+  %lv = load i8, i8* %gep.src, align 1 ; narrow (i8) memory access
+  %lv.ext = zext i8 %lv to i32
+  %add = add i32 %lv.ext, %off
+  %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv
+  store i32 %add, i32* %gep.dst ; wide (i32) memory access
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N ; leave the loop once %iv.next reaches %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Same as test_load_i8_store_i32, but with types flipped for load and store.
+define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
+; CHECK-LABEL: @test_load_i32_store_i8(
+; CHECK: <4 x i8>
+; Each iteration stores trunc(src[iv] + off) to dst[iv]; a <4 x i8> type must be printed.
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+  %lv = load i32, i32* %gep.src, align 1 ; wide (i32) memory access
+  %add = add i32 %lv, %off
+  %add.trunc = trunc i32 %add to i8
+  %gep.dst = getelementptr inbounds i8, i8* %dst, i64 %iv
+  store i8 %add.trunc, i8* %gep.dst ; narrow (i8) memory access
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N ; leave the loop once %iv.next reaches %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; All memory operations use i32; all memory operations are profitable with VF 4.
+define void @test_load_i32_store_i32(i32* noalias %src, i32* noalias %dst, i8 %off, i64 %N) {
+; CHECK-LABEL: @test_load_i32_store_i32(
+; CHECK: vector.body:
+; CHECK: <4 x i32>
+; The add is done on i8 (trunc before, zext after), but the load and the store are both i32.
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
+  %lv = load i32, i32* %gep.src, align 1
+  %lv.trunc = trunc i32 %lv to i8
+  %add = add i8 %lv.trunc, %off
+  %add.ext = zext i8 %add to i32
+  %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv
+  store i32 %add.ext, i32* %gep.dst
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test with loop body that requires a large number of vector registers if the
+; vectorization factor is large. Make sure the register estimates limit the
+; vectorization factor.
+define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64 %off, i64 %off.2, i64 %N) {
+; CHECK-LABEL: @test_load_i8_store_i64_large(
+; CHECK: <2 x i64>
+; One i8 load (zero-extended to i64) and two i64 loads feed five i64 stores per iteration.
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+  %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv
+  %gep.dst.3 = getelementptr inbounds i64, i64* %dst.3, i64 %iv
+  %lv.dst.3 = load i64, i64* %gep.dst.3, align 1
+  %gep.dst.5 = getelementptr inbounds i64, i64* %dst.5, i64 %iv
+  %lv.dst.5 = load i64, i64* %gep.dst.5, align 1 ; reads %dst.5, the location stored to at the end of the body
+
+  %lv = load i8, i8* %gep.src, align 1 ; the only narrow (i8) memory access
+  %lv.ext = zext i8 %lv to i64
+  %add = add i64 %lv.ext, %off
+  %add.2 = add i64 %add, %off.2
+  %gep.dst = getelementptr inbounds i64, i64* %dst, i64 %iv
+  %gep.dst.2 = getelementptr inbounds i64, i64* %dst.2, i64 %iv
+
+  %add.3 = add i64 %add.2, %lv.dst.3
+  %add.4 = add i64 %add.3, %add
+  %gep.dst.4 = getelementptr inbounds i64, i64* %dst.4, i64 %iv
+  %add.5 = add i64 %add.2, %lv.dst.5
+  store i64 %add.2, i64* %gep.dst.2
+  store i64 %add, i64* %gep.dst
+  store i64 %add.3, i64* %gep.dst.3
+  store i64 %add.4, i64* %gep.dst.4
+  store i64 %add.5, i64* %gep.dst.5
+
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %N
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}