[ARM] Extra tests for MVE vhadd and vmulh. NFC

2021-05-20 14:13:39 +01:00 · 2021-05-20 14:13:39 +01:00 · bdd82c3f51
parent ac00cb0d2a
commit bdd82c3f51
2 changed files with 1425 additions and 0 deletions
--- a/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vhadd.ll
--- a/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vmulh.ll
@ -237,3 +237,306 @@ entry:
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
 }
+
+define void @vmulh_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
+; CHECK-LABEL: vmulh_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    mov.w lr, #64
+; CHECK-NEXT:  .LBB12_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
+; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
+; CHECK-NEXT:    vmullt.s8 q2, q1, q0
+; CHECK-NEXT:    vmullb.s8 q0, q1, q0
+; CHECK-NEXT:    vshr.u16 q2, q2, #8
+; CHECK-NEXT:    vshr.u16 q0, q0, #8
+; CHECK-NEXT:    vmovnt.i16 q0, q2
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
+; CHECK-NEXT:    le lr, .LBB12_1
+; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i8, i8* %x, i32 %index
+  %1 = bitcast i8* %0 to <16 x i8>*
+  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
+  %2 = sext <16 x i8> %wide.load to <16 x i16>
+  %3 = getelementptr inbounds i8, i8* %y, i32 %index
+  %4 = bitcast i8* %3 to <16 x i8>*
+  %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1
+  %5 = sext <16 x i8> %wide.load17 to <16 x i16>
+  %6 = mul nsw <16 x i16> %5, %2
+  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = getelementptr inbounds i8, i8* %z, i32 %index
+  %10 = bitcast i8* %9 to <16 x i8>*
+  store <16 x i8> %8, <16 x i8>* %10, align 1
+  %index.next = add i32 %index, 16
+  %11 = icmp eq i32 %index.next, 1024
+  br i1 %11, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @vmulh_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
+; CHECK-LABEL: vmulh_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    mov.w lr, #128
+; CHECK-NEXT:  .LBB13_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
+; CHECK-NEXT:    vmullt.s16 q2, q1, q0
+; CHECK-NEXT:    vmullb.s16 q0, q1, q0
+; CHECK-NEXT:    vshr.u32 q2, q2, #16
+; CHECK-NEXT:    vshr.u32 q0, q0, #16
+; CHECK-NEXT:    vmovnt.i32 q0, q2
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
+; CHECK-NEXT:    le lr, .LBB13_1
+; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i16, i16* %x, i32 %index
+  %1 = bitcast i16* %0 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
+  %2 = sext <8 x i16> %wide.load to <8 x i32>
+  %3 = getelementptr inbounds i16, i16* %y, i32 %index
+  %4 = bitcast i16* %3 to <8 x i16>*
+  %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2
+  %5 = sext <8 x i16> %wide.load17 to <8 x i32>
+  %6 = mul nsw <8 x i32> %5, %2
+  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %9 = getelementptr inbounds i16, i16* %z, i32 %index
+  %10 = bitcast i16* %9 to <8 x i16>*
+  store <8 x i16> %8, <8 x i16>* %10, align 2
+  %index.next = add i32 %index, 8
+  %11 = icmp eq i32 %index.next, 1024
+  br i1 %11, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @vmulh_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
+; CHECK-LABEL: vmulh_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    mov.w lr, #256
+; CHECK-NEXT:  .LBB14_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
+; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
+; CHECK-NEXT:    vmov.f32 s0, s4
+; CHECK-NEXT:    vmov.f32 s12, s8
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov.f32 s14, s9
+; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    vmov r3, s12
+; CHECK-NEXT:    vmov.f32 s16, s6
+; CHECK-NEXT:    vmov.f32 s18, s7
+; CHECK-NEXT:    vmov.f32 s4, s10
+; CHECK-NEXT:    vmov.f32 s6, s11
+; CHECK-NEXT:    vmullb.s32 q2, q1, q4
+; CHECK-NEXT:    smmul r12, r3, r12
+; CHECK-NEXT:    vmov r3, s9
+; CHECK-NEXT:    vmov q1[2], q1[0], r12, r3
+; CHECK-NEXT:    vmov r12, s2
+; CHECK-NEXT:    vmov r3, s14
+; CHECK-NEXT:    smmul r12, r3, r12
+; CHECK-NEXT:    vmov r3, s11
+; CHECK-NEXT:    vmov q1[3], q1[1], r12, r3
+; CHECK-NEXT:    vstrb.8 q1, [r2], #16
+; CHECK-NEXT:    le lr, .LBB14_1
+; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %x, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = sext <4 x i32> %wide.load to <4 x i64>
+  %3 = getelementptr inbounds i32, i32* %y, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
+  %5 = sext <4 x i32> %wide.load17 to <4 x i64>
+  %6 = mul nsw <4 x i64> %5, %2
+  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
+  %8 = trunc <4 x i64> %7 to <4 x i32>
+  %9 = getelementptr inbounds i32, i32* %z, i32 %index
+  %10 = bitcast i32* %9 to <4 x i32>*
+  store <4 x i32> %8, <4 x i32>* %10, align 4
+  %index.next = add i32 %index, 4
+  %11 = icmp eq i32 %index.next, 1024
+  br i1 %11, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @vmulh_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
+; CHECK-LABEL: vmulh_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    mov.w lr, #64
+; CHECK-NEXT:  .LBB15_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
+; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
+; CHECK-NEXT:    vmullt.u8 q2, q1, q0
+; CHECK-NEXT:    vmullb.u8 q0, q1, q0
+; CHECK-NEXT:    vshr.u16 q2, q2, #8
+; CHECK-NEXT:    vshr.u16 q0, q0, #8
+; CHECK-NEXT:    vmovnt.i16 q0, q2
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
+; CHECK-NEXT:    le lr, .LBB15_1
+; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i8, i8* %x, i32 %index
+  %1 = bitcast i8* %0 to <16 x i8>*
+  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
+  %2 = zext <16 x i8> %wide.load to <16 x i16>
+  %3 = getelementptr inbounds i8, i8* %y, i32 %index
+  %4 = bitcast i8* %3 to <16 x i8>*
+  %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1
+  %5 = zext <16 x i8> %wide.load17 to <16 x i16>
+  %6 = mul nuw <16 x i16> %5, %2
+  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+  %8 = trunc <16 x i16> %7 to <16 x i8>
+  %9 = getelementptr inbounds i8, i8* %z, i32 %index
+  %10 = bitcast i8* %9 to <16 x i8>*
+  store <16 x i8> %8, <16 x i8>* %10, align 1
+  %index.next = add i32 %index, 16
+  %11 = icmp eq i32 %index.next, 1024
+  br i1 %11, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @vmulh_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
+; CHECK-LABEL: vmulh_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    mov.w lr, #128
+; CHECK-NEXT:  .LBB16_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
+; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
+; CHECK-NEXT:    vmullt.u16 q2, q1, q0
+; CHECK-NEXT:    vmullb.u16 q0, q1, q0
+; CHECK-NEXT:    vshr.u32 q2, q2, #16
+; CHECK-NEXT:    vshr.u32 q0, q0, #16
+; CHECK-NEXT:    vmovnt.i32 q0, q2
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
+; CHECK-NEXT:    le lr, .LBB16_1
+; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i16, i16* %x, i32 %index
+  %1 = bitcast i16* %0 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
+  %2 = zext <8 x i16> %wide.load to <8 x i32>
+  %3 = getelementptr inbounds i16, i16* %y, i32 %index
+  %4 = bitcast i16* %3 to <8 x i16>*
+  %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2
+  %5 = zext <8 x i16> %wide.load17 to <8 x i32>
+  %6 = mul nuw <8 x i32> %5, %2
+  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %8 = trunc <8 x i32> %7 to <8 x i16>
+  %9 = getelementptr inbounds i16, i16* %z, i32 %index
+  %10 = bitcast i16* %9 to <8 x i16>*
+  store <8 x i16> %8, <8 x i16>* %10, align 2
+  %index.next = add i32 %index, 8
+  %11 = icmp eq i32 %index.next, 1024
+  br i1 %11, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define void @vmulh_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
+; CHECK-LABEL: vmulh_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    mov.w lr, #256
+; CHECK-NEXT:  .LBB17_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
+; CHECK-NEXT:    vldrw.u32 q3, [r1], #16
+; CHECK-NEXT:    vmov.f32 s8, s2
+; CHECK-NEXT:    vmov.f32 s16, s14
+; CHECK-NEXT:    vmov.f32 s10, s3
+; CHECK-NEXT:    vmov.f32 s18, s15
+; CHECK-NEXT:    vmov.f32 s2, s1
+; CHECK-NEXT:    vmullb.u32 q1, q4, q2
+; CHECK-NEXT:    vmov.f32 s14, s13
+; CHECK-NEXT:    vmullb.u32 q2, q3, q0
+; CHECK-NEXT:    vmov.f32 s0, s9
+; CHECK-NEXT:    vmov.f32 s1, s11
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov.f32 s3, s7
+; CHECK-NEXT:    vstrb.8 q0, [r2], #16
+; CHECK-NEXT:    le lr, .LBB17_1
+; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %x, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = zext <4 x i32> %wide.load to <4 x i64>
+  %3 = getelementptr inbounds i32, i32* %y, i32 %index
+  %4 = bitcast i32* %3 to <4 x i32>*
+  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
+  %5 = zext <4 x i32> %wide.load17 to <4 x i64>
+  %6 = mul nuw <4 x i64> %5, %2
+  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
+  %8 = trunc <4 x i64> %7 to <4 x i32>
+  %9 = getelementptr inbounds i32, i32* %z, i32 %index
+  %10 = bitcast i32* %9 to <4 x i32>*
+  store <4 x i32> %8, <4 x i32>* %10, align 4
+  %index.next = add i32 %index, 4
+  %11 = icmp eq i32 %index.next, 1024
+  br i1 %11, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}