llvm-project/llvm/test/CodeGen/Thumb2/mve-basic.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -o - %s | FileCheck %s
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -o - %s | FileCheck %s

; Passes two <4 x i32> arguments straight into an inline-asm `vadd.i32`
; whose output and both inputs use the "t" constraint, and returns the
; result by value. The function uses the arm_aapcs_vfpcc calling
; convention; the expected output below shows the operands already in
; q0/q1 and the result left in q0, with no instructions other than the
; asm body and the return.
define arm_aapcs_vfpcc <4 x i32> @vector_add_by_value(<4 x i32> %lhs, <4 x i32>%rhs) {
; CHECK-LABEL: vector_add_by_value:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    @APP
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    @NO_APP
; CHECK-NEXT:    bx lr
  %result = tail call <4 x i32> asm "vadd.i32 $0,$1,$2", "=t,t,t"(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %result
}

; Same inline-asm add as the by-value variant, but every vector travels
; through memory: both operands are loaded from pointer arguments and the
; result is stored through a third pointer (all accesses are align 16).
; No calling convention is specified on this function. The expected
; output loads from [r1] and [r2] with vldrw.u32 and stores to [r0] with
; vstrw.32 around the asm body.
define void @vector_add_by_reference(<4 x i32>* %resultp, <4 x i32>* %lhsp, <4 x i32>* %rhsp) {
; CHECK-LABEL: vector_add_by_reference:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    @APP
; CHECK-NEXT:    vadd.i32 q0, q0, q1
; CHECK-NEXT:    @NO_APP
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
  %lhs = load <4 x i32>, <4 x i32>* %lhsp, align 16
  %rhs = load <4 x i32>, <4 x i32>* %rhsp, align 16
  %result = tail call <4 x i32> asm "vadd.i32 $0,$1,$2", "=t,t,t"(<4 x i32> %lhs, <4 x i32> %rhs)
  store <4 x i32> %result, <4 x i32>* %resultp, align 16
  ret void
}

; Copies one <2 x double> value from %from to %to (both align 16) with no
; arithmetic in between. Both RUN lines (+mve.fp and +mve) share the same
; expected output: a single q-register load/store pair using the 32-bit
; element forms vldrw.u32 / vstrw.32.
define void @vector_f64_copy(<2 x double>* %from, <2 x double>* %to) {
; CHECK-LABEL: vector_f64_copy:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vstrw.32 q0, [r1]
; CHECK-NEXT:    bx lr
  %v = load <2 x double>, <2 x double>* %from, align 16
  store <2 x double> %v, <2 x double>* %to, align 16
  ret void
}

; Stores a <16 x i8> argument into a local alloca and immediately reloads
; it as the return value. The function carries attribute group #0
; (noinline optnone), so the store/load pair through the stack slot is
; kept, as the expected output shows. That output sets up r7 as a frame
; pointer, reserves 16 bytes, masks the low four bits of the copied sp
; with bfc before writing it back, and accesses the slot through r0 with
; vstrw.32 / vldrw.u32.
; NOTE(review): the alloca and both accesses request align 8, while the
; expected prologue clears four low bits of sp; presumably the slot is
; realigned to 16 bytes regardless - confirm this is intended.
define arm_aapcs_vfpcc <16 x i8> @stack_slot_handling(<16 x i8> %a) #0 {
; CHECK-LABEL: stack_slot_handling:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    push {r4, r6, r7, lr}
; CHECK-NEXT:    add r7, sp, #8
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    mov r4, sp
; CHECK-NEXT:    bfc r4, #0, #4
; CHECK-NEXT:    mov sp, r4
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    sub.w r4, r7, #8
; CHECK-NEXT:    mov sp, r4
; CHECK-NEXT:    pop {r4, r6, r7, pc}
entry:
  %a.addr = alloca <16 x i8>, align 8
  store <16 x i8> %a, <16 x i8>* %a.addr, align 8
  %0 = load <16 x i8>, <16 x i8>* %a.addr, align 8
  ret <16 x i8> %0
}

; Attribute group referenced by @stack_slot_handling: the function is
; marked noinline and optnone.
attributes #0 = { noinline optnone }
[ARM] Code-generation infrastructure for MVE.

This provides the low-level support to start using MVE vector types in LLVM IR: loading and storing them, passing them to __asm__ statements containing hand-written MVE vector instructions, and, if you have the hard-float ABI turned on, using them as function parameters.

(In the soft-float ABI, vector types are passed in integer registers, and combining all those 32-bit integers into a q-reg requires support for selection DAG nodes like insert_vector_elt and build_vector which aren't implemented yet for MVE. In fact I've also had to add `arm_aapcs_vfpcc` to a couple of existing tests to avoid that problem.)

Specifically, this commit adds support for:
* spills, reloads and register moves for MVE vector registers
* ditto for the VPT predication mask that lives in VPR.P0
* make all the MVE vector types legal in ISel, and provide selection DAG patterns for BITCAST, LOAD and STORE
* make loads and stores of scalar FP types conditional on `hasFPRegs()` rather than `hasVFP2Base()`. As a result a few existing tests needed their llc command lines updating to use `-mattr=-fpregs` as their method of turning off all hardware FP support.

Reviewers: dmgreen, samparker, SjoerdMeijer
Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D60708
llvm-svn: 364329
2019-06-26 00:48:46 +08:00
			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -o - %s \| FileCheck %s`
			`; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -o - %s \| FileCheck %s`

			`define arm_aapcs_vfpcc <4 x i32> @vector_add_by_value(<4 x i32> %lhs, <4 x i32>%rhs) {`
			`; CHECK-LABEL: vector_add_by_value:`
			`; CHECK: @ %bb.0:`
			`; CHECK-NEXT: @APP`
			`; CHECK-NEXT: vadd.i32 q0, q0, q1`
			`; CHECK-NEXT: @NO_APP`
			`; CHECK-NEXT: bx lr`
			`%result = tail call <4 x i32> asm "vadd.i32 $0,$1,$2", "=t,t,t"(<4 x i32> %lhs, <4 x i32> %rhs)`
			`ret <4 x i32> %result`
			`}`

			`define void @vector_add_by_reference(<4 x i32>* %resultp, <4 x i32>* %lhsp, <4 x i32>* %rhsp) {`
			`; CHECK-LABEL: vector_add_by_reference:`
			`; CHECK: @ %bb.0:`
			`; CHECK-NEXT: vldrw.u32 q0, [r1]`
			`; CHECK-NEXT: vldrw.u32 q1, [r2]`
			`; CHECK-NEXT: @APP`
			`; CHECK-NEXT: vadd.i32 q0, q0, q1`
			`; CHECK-NEXT: @NO_APP`
			`; CHECK-NEXT: vstrw.32 q0, [r0]`
			`; CHECK-NEXT: bx lr`
			`%lhs = load <4 x i32>, <4 x i32>* %lhsp, align 16`
			`%rhs = load <4 x i32>, <4 x i32>* %rhsp, align 16`
			`%result = tail call <4 x i32> asm "vadd.i32 $0,$1,$2", "=t,t,t"(<4 x i32> %lhs, <4 x i32> %rhs)`
			`store <4 x i32> %result, <4 x i32>* %resultp, align 16`
			`ret void`
			`}`
[ARM] MVE loads and stores

This fills in the gaps for basic MVE loads and stores, allowing unaligned access and adding far too many tests. These will become important as narrowing/expanding and pre/post inc are added.

Big endian might still not be handled very well, because we have not yet added bitcasts (and I'm not sure how we want it to work yet). I've included the alignment code anyway, which maps with our current patterns. We plan to return to that later.

Code written by Simon Tatham, with additional tests from me and Mikhail Maltsev.

Differential Revision: https://reviews.llvm.org/D63838
llvm-svn: 364633
2019-06-28 16:41:40 +08:00
			`define void @vector_f64_copy(<2 x double>* %from, <2 x double>* %to) {`
			`; CHECK-LABEL: vector_f64_copy:`
			`; CHECK: @ %bb.0:`
			`; CHECK-NEXT: vldrw.u32 q0, [r0]`
			`; CHECK-NEXT: vstrw.32 q0, [r1]`
			`; CHECK-NEXT: bx lr`
			`%v = load <2 x double>, <2 x double>* %from, align 16`
			`store <2 x double> %v, <2 x double>* %to, align 16`
			`ret void`
			`}`

			`define arm_aapcs_vfpcc <16 x i8> @stack_slot_handling(<16 x i8> %a) #0 {`
			`; CHECK-LABEL: stack_slot_handling:`
			`; CHECK: @ %bb.0: @ %entry`
			`; CHECK-NEXT: push {r4, r6, r7, lr}`
			`; CHECK-NEXT: add r7, sp, #8`
			`; CHECK-NEXT: sub sp, #16`
			`; CHECK-NEXT: mov r4, sp`
			`; CHECK-NEXT: bfc r4, #0, #4`
			`; CHECK-NEXT: mov sp, r4`
			`; CHECK-NEXT: mov r0, sp`
			`; CHECK-NEXT: vstrw.32 q0, [r0]`
			`; CHECK-NEXT: vldrw.u32 q0, [r0]`
			`; CHECK-NEXT: sub.w r4, r7, #8`
			`; CHECK-NEXT: mov sp, r4`
			`; CHECK-NEXT: pop {r4, r6, r7, pc}`
			`entry:`
			`%a.addr = alloca <16 x i8>, align 8`
			`store <16 x i8> %a, <16 x i8>* %a.addr, align 8`
			`%0 = load <16 x i8>, <16 x i8>* %a.addr, align 8`
			`ret <16 x i8> %0`
			`}`

			`attributes #0 = { noinline optnone }`