[AArch64][SLP] Precommit tests which would be better not to SLP vectorize. NFC.

Sjoerd Meijer 2020-11-27 13:41:35 +00:00
parent 7793db35ca
commit a2016dc887
1 changed file with 108 additions and 0 deletions


@@ -0,0 +1,108 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -basic-aa -slp-vectorizer -S | FileCheck %s

target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"

; These examples correspond to input code like:
;
;   void t(long * __restrict a, long * __restrict b) {
;     a[0] *= b[0];
;     a[1] *= b[1];
;   }
;
; If we SLP vectorise this we end up with something like the following, because
; AArch64 has no 64-bit vector multiply (mul.2d): the elements have to be moved
; to the general-purpose registers, multiplied there, and moved back again:
;
;   ldr q0, [x1]
;   ldr q1, [x0]
;   fmov x8, d0
;   mov x10, v0.d[1]
;   fmov x9, d1
;   mov x11, v1.d[1]
;   mul x8, x9, x8
;   mul x9, x11, x10
;   fmov d0, x8
;   mov v0.d[1], x9
;   str q0, [x0]
;   ret
;
; but if we don't SLP vectorise these examples we get code like this, which is
; smaller and faster:
;
;   ldp x8, x9, [x1]
;   ldp x10, x11, [x0]
;   mul x9, x11, x9
;   mul x8, x10, x8
;   stp x8, x9, [x0]
;   ret
;
; FIXME: don't SLP vectorise this.
define void @mul(i64* noalias nocapture %a, i64* noalias nocapture readonly %b) {
; CHECK-LABEL: @mul(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 1
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[B]] to <2 x i64>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A]] to <2 x i64>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i64> [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[A]] to <2 x i64>*
; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i64, i64* %b, align 8
  %1 = load i64, i64* %a, align 8
  %mul = mul nsw i64 %1, %0
  store i64 %mul, i64* %a, align 8
  %arrayidx2 = getelementptr inbounds i64, i64* %b, i64 1
  %2 = load i64, i64* %arrayidx2, align 8
  %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 1
  %3 = load i64, i64* %arrayidx3, align 8
  %mul4 = mul nsw i64 %3, %2
  store i64 %mul4, i64* %arrayidx3, align 8
  ret void
}

; Similar example, but now a multiply-accumulate:
;
;   void x(long * __restrict a, long * __restrict b) {
;     a[0] *= b[0];
;     a[1] *= b[1];
;     a[0] += b[0];
;     a[1] += b[1];
;   }
;
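; As in the first example, this would be better left scalar: without a 64-bit
; vector multiply the vectorised form needs the GPR<->SIMD moves shown above,
; whereas the scalar form can use madd. A rough sketch of the scalar code we
; would hope for (hypothetical, not verified compiler output):
;
;   ldp x8, x9, [x1]
;   ldp x10, x11, [x0]
;   madd x8, x10, x8, x8
;   madd x9, x11, x9, x9
;   stp x8, x9, [x0]
;   ret
;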
define void @mac(i64* noalias nocapture %a, i64* noalias nocapture readonly %b) {
; CHECK-LABEL: @mac(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 1
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[B]] to <2 x i64>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 1
; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i64* [[A]] to <2 x i64>*
; CHECK-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP2]], align 8
; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <2 x i64> [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i64> [[TMP4]], [[TMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[A]] to <2 x i64>*
; CHECK-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8
; CHECK-NEXT:    ret void
;
entry:
  %0 = load i64, i64* %b, align 8
  %1 = load i64, i64* %a, align 8
  %mul = mul nsw i64 %1, %0
  %arrayidx2 = getelementptr inbounds i64, i64* %b, i64 1
  %2 = load i64, i64* %arrayidx2, align 8
  %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 1
  %3 = load i64, i64* %arrayidx3, align 8
  %mul4 = mul nsw i64 %3, %2
  %add = add nsw i64 %mul, %0
  store i64 %add, i64* %a, align 8
  %add9 = add nsw i64 %mul4, %2
  store i64 %add9, i64* %arrayidx3, align 8
  ret void
}