llvm-project/llvm/test/CodeGen/Thumb2/mve-intrinsics/idup.ll

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

776 lines
29 KiB
LLVM
Raw Normal View History

[ARM,MVE] Add intrinsics for v[id]dupq and v[id]wdupq. Summary: These instructions generate a vector of consecutive elements starting from a given base value and incrementing by 1, 2, 4 or 8. The `wdup` versions also wrap the values back to zero when they reach a given limit value. The instruction updates the scalar base register so that another use of the same instruction will continue the sequence from where the previous one left off. At the IR level, I've represented these instructions as a family of target-specific intrinsics with two return values (the constructed vector and the updated base). The user-facing ACLE API provides a set of intrinsics that throw away the written-back base and another set that receive it as a pointer so they can update it, plus the usual predicated versions. Because the intrinsics return two values (as do the underlying instructions), the isel has to be done in C++. This is the first family of MVE intrinsics that use the `imm_1248` immediate type in the clang Tablegen framework, so naturally, I found I'd given it the wrong C integer type. Also added some tests of the check that the immediate has a legal value, because this is the first time those particular checks have been exercised. Finally, I also had to fix a bug in MveEmitter which failed an assertion when I nested two `seq` nodes (the inner one used to extract the two values from the pair returned by the IR intrinsic, and the outer one put on by the predication multiclass). Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard Reviewed By: dmgreen Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D73357
2020-01-31 18:53:31 +08:00
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
; Unpredicated VIDUP on <16 x i8> with immediate step 4. Only element 0 (the
; vector) of the intrinsic's two-value result is used, so the expected code is
; the single vidup instruction with no store of the updated base.
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_n_u8(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vidup.u8 q0, r0, #4
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %a, i32 4)
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Unpredicated VIDUP on <8 x i16> with immediate step 1; the updated base
; (element 1 of the result pair) is discarded.
define arm_aapcs_vfpcc <8 x i16> @test_vidupq_n_u16(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vidup.u16 q0, r0, #1
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %a, i32 1)
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Unpredicated VIDUP on <4 x i32> with immediate step 4; the updated base
; (element 1 of the result pair) is discarded.
define arm_aapcs_vfpcc <4 x i32> @test_vidupq_n_u32(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vidup.u32 q0, r0, #4
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %a, i32 4)
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Unpredicated VDDUP on <16 x i8> with immediate step 2; only the vector half
; of the result pair is returned.
define arm_aapcs_vfpcc <16 x i8> @test_vddupq_n_u8(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vddup.u8 q0, r0, #2
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %a, i32 2)
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Unpredicated VDDUP on <8 x i16> with immediate step 4; only the vector half
; of the result pair is returned.
define arm_aapcs_vfpcc <8 x i16> @test_vddupq_n_u16(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vddup.u16 q0, r0, #4
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %a, i32 4)
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Unpredicated VDDUP on <4 x i32> with immediate step 2; only the vector half
; of the result pair is returned.
define arm_aapcs_vfpcc <4 x i32> @test_vddupq_n_u32(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vddup.u32 q0, r0, #2
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %a, i32 2)
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Unpredicated VIWDUP on <16 x i8> with immediate step 4. %a and %b are the two
; scalar register operands (%b is presumably the wrap limit -- confirm against
; the intrinsic definition); only the vector result is used.
define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_n_u8(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: viwdup.u8 q0, r0, r1, #4
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %a, i32 %b, i32 4)
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Unpredicated VIWDUP on <8 x i16> with immediate step 2; both scalar operands
; are passed in registers and only the vector result is used.
define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_n_u16(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: viwdup.u16 q0, r0, r1, #2
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %a, i32 %b, i32 2)
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Unpredicated VIWDUP on <4 x i32> with immediate step 8; both scalar operands
; are passed in registers and only the vector result is used.
define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_n_u32(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: viwdup.u32 q0, r0, r1, #8
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %a, i32 %b, i32 8)
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Unpredicated VDWDUP on <16 x i8> with immediate step 4; both scalar operands
; are passed in registers and only the vector result is used.
define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_n_u8(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vdwdup.u8 q0, r0, r1, #4
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %a, i32 %b, i32 4)
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Unpredicated VDWDUP on <8 x i16> with immediate step 8; both scalar operands
; are passed in registers and only the vector result is used.
define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_n_u16(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vdwdup.u16 q0, r0, r1, #8
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %a, i32 %b, i32 8)
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Unpredicated VDWDUP on <4 x i32> with immediate step 1; both scalar operands
; are passed in registers and only the vector result is used.
define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_n_u32(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vdwdup.u32 q0, r0, r1, #1
; CHECK-NEXT: bx lr
entry:
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %a, i32 %b, i32 1)
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Write-back VIDUP on <16 x i8> with immediate step 8. The base is loaded from
; %a, and element 1 of the intrinsic result (the updated base) is stored back
; to %a, so the expected code wraps the vidup in a load/store of the same
; register.
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_wb_u8(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vidup.u8 q0, r2, #8
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %base, i32 8)
  %next = extractvalue { <16 x i8>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Write-back VIDUP on <8 x i16> with immediate step 1: base loaded from %a,
; updated base (result element 1) stored back to %a.
define arm_aapcs_vfpcc <8 x i16> @test_vidupq_wb_u16(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vidup.u16 q0, r2, #1
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %base, i32 1)
  %next = extractvalue { <8 x i16>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Write-back VIDUP on <4 x i32> with immediate step 4: base loaded from %a,
; updated base (result element 1) stored back to %a.
define arm_aapcs_vfpcc <4 x i32> @test_vidupq_wb_u32(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vidup.u32 q0, r2, #4
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %base, i32 4)
  %next = extractvalue { <4 x i32>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Write-back VDDUP on <16 x i8> with immediate step 2: base loaded from %a,
; updated base (result element 1) stored back to %a.
define arm_aapcs_vfpcc <16 x i8> @test_vddupq_wb_u8(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vddup.u8 q0, r2, #2
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %base, i32 2)
  %next = extractvalue { <16 x i8>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Write-back VDDUP on <8 x i16> with immediate step 8: base loaded from %a,
; updated base (result element 1) stored back to %a.
define arm_aapcs_vfpcc <8 x i16> @test_vddupq_wb_u16(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vddup.u16 q0, r2, #8
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %base, i32 8)
  %next = extractvalue { <8 x i16>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Write-back VDDUP on <4 x i32> with immediate step 2: base loaded from %a,
; updated base (result element 1) stored back to %a.
define arm_aapcs_vfpcc <4 x i32> @test_vddupq_wb_u32(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vddup.u32 q0, r2, #2
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %base, i32 2)
  %next = extractvalue { <4 x i32>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Write-back VDWDUP on <16 x i8> with immediate step 4: base loaded from %a,
; %b passed through as the second scalar operand, updated base (result
; element 1) stored back to %a.
define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_wb_u8(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vdwdup.u8 q0, r2, r1, #4
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %base, i32 %b, i32 4)
  %next = extractvalue { <16 x i8>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Write-back VDWDUP on <8 x i16> with immediate step 4: base loaded from %a,
; %b passed through as the second scalar operand, updated base stored back.
define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_wb_u16(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vdwdup.u16 q0, r2, r1, #4
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %base, i32 %b, i32 4)
  %next = extractvalue { <8 x i16>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Write-back VIWDUP on <16 x i8> with immediate step 1: base loaded from %a,
; %b passed through as the second scalar operand, updated base stored back.
define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_wb_u8(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: viwdup.u8 q0, r2, r1, #1
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %base, i32 %b, i32 1)
  %next = extractvalue { <16 x i8>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Write-back VIWDUP on <8 x i16> with immediate step 1: base loaded from %a,
; %b passed through as the second scalar operand, updated base stored back.
define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_wb_u16(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: viwdup.u16 q0, r2, r1, #1
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %base, i32 %b, i32 1)
  %next = extractvalue { <8 x i16>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Write-back VIWDUP on <4 x i32> with immediate step 8: base loaded from %a,
; %b passed through as the second scalar operand, updated base stored back.
define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_wb_u32(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: viwdup.u32 q0, r2, r1, #8
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %base, i32 %b, i32 8)
  %next = extractvalue { <4 x i32>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Write-back VDWDUP on <4 x i32> with immediate step 2: base loaded from %a,
; %b passed through as the second scalar operand, updated base stored back.
define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_wb_u32(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vdwdup.u32 q0, r2, r1, #2
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %base, i32 %b, i32 2)
  %next = extractvalue { <4 x i32>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Predicated VIDUP on <16 x i8> with immediate step 8. The i16 mask %p is
; zero-extended and converted to a <16 x i1> predicate with pred.i2v, then
; passed together with %inactive to the predicated intrinsic. Only the vector
; result is used; the expected code is a vmsr/vpst pair guarding vidupt.
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u8 q0, r0, #8
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %mask)
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %pred)
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Predicated VIDUP on <8 x i16> with immediate step 8; %p is widened and turned
; into a <8 x i1> predicate, and only the vector result is used.
define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u16 q0, r0, #8
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %mask)
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 8, <8 x i1> %pred)
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Predicated VIDUP on <4 x i32> with immediate step 2; %p is widened and turned
; into a <4 x i1> predicate, and only the vector result is used.
define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u32 q0, r0, #2
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %mask)
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 2, <4 x i1> %pred)
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Predicated VDDUP on <16 x i8> with immediate step 8; %p is widened and turned
; into a <16 x i1> predicate, and only the vector result is used.
define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u8 q0, r0, #8
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %mask)
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %pred)
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Predicated VDDUP on <8 x i16> with immediate step 2; %p is widened and turned
; into a <8 x i1> predicate, and only the vector result is used.
define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u16 q0, r0, #2
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %mask)
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 2, <8 x i1> %pred)
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Predicated VDDUP on <4 x i32> with immediate step 8; %p is widened and turned
; into a <4 x i1> predicate, and only the vector result is used.
define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u32 q0, r0, #8
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %mask)
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 8, <4 x i1> %pred)
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Predicated VIWDUP on <16 x i8> with immediate step 8. %a and %b are the two
; scalar register operands, %p supplies the predicate via pred.i2v, and only
; the vector result is used.
define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u8 q0, r0, r1, #8
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %mask)
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 8, <16 x i1> %pred)
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Predicated VIWDUP on <8 x i16> with immediate step 8; two scalar register
; operands, predicate derived from %p, vector result only.
define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u16 q0, r0, r1, #8
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %mask)
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 8, <8 x i1> %pred)
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Predicated VIWDUP on <4 x i32> with immediate step 4; two scalar register
; operands, predicate derived from %p, vector result only.
define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u32 q0, r0, r1, #4
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %mask)
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %pred)
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Predicated VDWDUP on <16 x i8> with immediate step 1; two scalar register
; operands, predicate derived from %p, vector result only.
define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u8 q0, r0, r1, #1
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %mask)
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 1, <16 x i1> %pred)
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Predicated VDWDUP on <8 x i16> with immediate step 2; two scalar register
; operands, predicate derived from %p, vector result only.
define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u16 q0, r0, r1, #2
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %mask)
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 2, <8 x i1> %pred)
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Predicated VDWDUP on <4 x i32> with immediate step 4; two scalar register
; operands, predicate derived from %p, vector result only.
define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u32 q0, r0, r1, #4
; CHECK-NEXT: bx lr
entry:
  %mask = zext i16 %p to i32
  %pred = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %mask)
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %pred)
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Predicated write-back VIDUP on <16 x i8> with immediate step 8. The base is
; loaded from %a, the i16 mask %p becomes a <16 x i1> predicate via pred.i2v,
; and element 1 of the intrinsic result (the updated base) is stored back to
; %a. Expected code: load, vmsr/vpst, vidupt, store.
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u8 q0, r2, #8
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %mask)
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %base, i32 8, <16 x i1> %pred)
  %next = extractvalue { <16 x i8>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Predicated write-back VIDUP on <8 x i16> with immediate step 2: base loaded
; from %a, predicate derived from %p, updated base stored back to %a.
define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u16 q0, r2, #2
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %mask)
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %base, i32 2, <8 x i1> %pred)
  %next = extractvalue { <8 x i16>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Predicated write-back VIDUP on <4 x i32> with immediate step 8: base loaded
; from %a, predicate derived from %p, updated base stored back to %a.
define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u32 q0, r2, #8
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %mask)
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %base, i32 8, <4 x i1> %pred)
  %next = extractvalue { <4 x i32>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Predicated write-back VDDUP on <16 x i8> with immediate step 1: base loaded
; from %a, predicate derived from %p, updated base stored back to %a.
define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u8 q0, r2, #1
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %mask)
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %base, i32 1, <16 x i1> %pred)
  %next = extractvalue { <16 x i8>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Predicated write-back VDDUP on <8 x i16> with immediate step 1: base loaded
; from %a, predicate derived from %p, updated base stored back to %a.
define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u16 q0, r2, #1
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %mask)
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %base, i32 1, <8 x i1> %pred)
  %next = extractvalue { <8 x i16>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Predicated write-back VDDUP on <4 x i32> with immediate step 4: base loaded
; from %a, predicate derived from %p, updated base stored back to %a.
define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u32 q0, r2, #4
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %mask)
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %base, i32 4, <4 x i1> %pred)
  %next = extractvalue { <4 x i32>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Predicated write-back VIWDUP on <16 x i8> with immediate step 8. The base is
; loaded from %a, %b is the second scalar operand, %p supplies the predicate,
; and the updated base (result element 1) is stored back to %a. The expected
; code keeps the base in r12 across the instruction.
define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u8 q0, r12, r1, #8
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %mask)
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %base, i32 %b, i32 8, <16 x i1> %pred)
  %next = extractvalue { <16 x i8>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Predicated write-back VIWDUP on <8 x i16> with immediate step 8: base loaded
; from %a, %b as second scalar operand, predicate from %p, updated base stored
; back to %a.
define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u16 q0, r12, r1, #8
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %mask)
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %base, i32 %b, i32 8, <8 x i1> %pred)
  %next = extractvalue { <8 x i16>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Predicated write-back VIWDUP on <4 x i32> with immediate step 4: base loaded
; from %a, %b as second scalar operand, predicate from %p, updated base stored
; back to %a.
define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u32 q0, r12, r1, #4
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %mask)
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %base, i32 %b, i32 4, <4 x i1> %pred)
  %next = extractvalue { <4 x i32>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Predicated write-back VDWDUP on <16 x i8> with immediate step 1: base loaded
; from %a, %b as second scalar operand, predicate from %p, updated base stored
; back to %a.
define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u8 q0, r12, r1, #1
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %mask)
  %pair = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %base, i32 %b, i32 1, <16 x i1> %pred)
  %next = extractvalue { <16 x i8>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <16 x i8>, i32 } %pair, 0
  ret <16 x i8> %vec
}
; Predicated write-back VDWDUP on <8 x i16> with immediate step 4: base loaded
; from %a, %b as second scalar operand, predicate from %p, updated base stored
; back to %a.
define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u16 q0, r12, r1, #4
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %mask)
  %pair = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %base, i32 %b, i32 4, <8 x i1> %pred)
  %next = extractvalue { <8 x i16>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <8 x i16>, i32 } %pair, 0
  ret <8 x i16> %vec
}
; Predicated write-back VDWDUP on <4 x i32> with immediate step 4: base loaded
; from %a, %b as second scalar operand, predicate from %p, updated base stored
; back to %a.
define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u32 q0, r12, r1, #4
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
  %base = load i32, i32* %a, align 4
  %mask = zext i16 %p to i32
  %pred = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %mask)
  %pair = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %base, i32 %b, i32 4, <4 x i1> %pred)
  %next = extractvalue { <4 x i32>, i32 } %pair, 1
  store i32 %next, i32* %a, align 4
  %vec = extractvalue { <4 x i32>, i32 } %pair, 0
  ret <4 x i32> %vec
}
; Intrinsic declarations used by the tests in this file.

; Conversion of an i32 mask value to a vector-of-i1 predicate, one declaration
; per lane count.
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
; Unpredicated vidup/vddup: two i32 operands (the tests pass the base and an
; immediate step); the result is a { vector, i32 } pair.
declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32, i32)
; Unpredicated viwdup/vdwdup: three i32 operands (the tests pass the base, a
; second scalar, and an immediate step); the result is a { vector, i32 } pair.
declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32, i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32, i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32, i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32, i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32, i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32, i32, i32)
; Predicated vidup/vddup: a leading vector operand (the tests pass %inactive)
; and a trailing vector-of-i1 predicate surround the two i32 operands.
declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
; Predicated viwdup/vdwdup: a leading vector operand and a trailing
; vector-of-i1 predicate surround the three i32 operands.
declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)