diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index e7e652ddc9bf..e4a47edc7508 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -235,6 +235,12 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + + // No native support for these. + setOperationAction(ISD::UDIV, VT, Expand); + setOperationAction(ISD::SDIV, VT, Expand); + setOperationAction(ISD::UREM, VT, Expand); + setOperationAction(ISD::SREM, VT, Expand); } const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 }; @@ -252,6 +258,12 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { setOperationAction(ISD::BITCAST, VT, Legal); setOperationAction(ISD::LOAD, VT, Legal); setOperationAction(ISD::STORE, VT, Legal); + + if (HasMVEFP) { + // No native support for these. + setOperationAction(ISD::FDIV, VT, Expand); + setOperationAction(ISD::FREM, VT, Expand); + } } // We 'support' these types up to bitcast/load/store level, regardless of diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll new file mode 100644 index 000000000000..7b3b8135c4fb --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll @@ -0,0 +1,1243 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve,+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVE +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-MVEFP + +define arm_aapcs_vfpcc <4 x i32> @udiv_i32(<4 x i32> %in1, <4 x i32> %in2) { +; CHECK-LABEL: udiv_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: udiv r1, r2, r1 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %out = udiv <4 x i32> %in1, %in2 + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @sdiv_i32(<4 x i32> %in1, <4 x i32> %in2) { +; CHECK-LABEL: sdiv_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: sdiv r0, r1, r0 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: sdiv r1, r2, r1 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: sdiv r0, r1, r0 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: sdiv r0, r1, r0 +; CHECK-NEXT: vmov.32 q2[3], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %out = sdiv <4 x i32> %in1, %in2 + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @urem_i32(<4 x i32> %in1, <4 x i32> %in2) { +; CHECK-LABEL: urem_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: udiv r2, r1, r0 +; CHECK-NEXT: mls r12, r2, r0, r1 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: udiv r3, r2, r1 +; CHECK-NEXT: mls lr, r3, r1, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: udiv r0, r3, r2 +; CHECK-NEXT: mls r0, r0, r2, r3 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: udiv r1, r3, r2 +; CHECK-NEXT: vmov.32 q0[1], lr +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: mls r1, r1, r2, r3 +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: pop {r7, pc} +entry: + %out = urem <4 x i32> %in1, %in2 + ret <4 x i32> %out +} + +define arm_aapcs_vfpcc <4 x i32> @srem_i32(<4 x i32> %in1, <4 x i32> %in2) { +; CHECK-LABEL: srem_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: sdiv r2, r1, r0 +; CHECK-NEXT: mls r12, r2, r0, r1 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: sdiv r3, r2, r1 +; CHECK-NEXT: mls lr, r3, r1, r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: sdiv r0, r3, r2 +; CHECK-NEXT: mls r0, r0, r2, r3 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov.32 q0[0], r12 +; CHECK-NEXT: sdiv r1, r3, r2 +; CHECK-NEXT: vmov.32 q0[1], lr +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: mls r1, r1, r2, r3 +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: pop {r7, pc} +entry: + %out = srem <4 x i32> %in1, %in2 + ret <4 x i32> %out +} + + +define arm_aapcs_vfpcc <8 x i16> @udiv_i16(<8 x i16> %in1, <8 x i16> %in2) { +; CHECK-LABEL: udiv_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u16 r1, q1[1] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: vmov.16 q2[0], r0 +; CHECK-NEXT: udiv r1, r2, r1 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: vmov.16 q2[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.16 q2[2], r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.16 q2[3], r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.16 q2[4], r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: vmov.u16 r0, q1[7] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %out = udiv <8 x i16> %in1, %in2 + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @sdiv_i16(<8 x i16> %in1, <8 x i16> %in2) { +; CHECK-LABEL: sdiv_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: sdiv r12, r1, r0 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: vmov.u16 r4, q1[6] +; CHECK-NEXT: sdiv r3, r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[1] +; CHECK-NEXT: vmov.u16 r2, q0[1] +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: vmov.u16 r5, q0[6] +; CHECK-NEXT: sdiv r0, r2, r1 +; CHECK-NEXT: vmov.u16 r1, q1[0] +; CHECK-NEXT: vmov.u16 r2, q0[0] +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r4, r4 +; CHECK-NEXT: sdiv r1, r2, r1 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: sxth.w lr, r2 +; CHECK-NEXT: vmov.16 q2[1], r0 +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: vmov.16 q2[2], r3 +; CHECK-NEXT: vmov.u16 r3, q1[4] +; CHECK-NEXT: sxth r6, r2 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: sxth r3, r3 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r1, r1 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: vmov.16 q2[3], r12 +; CHECK-NEXT: sdiv r2, r2, r3 +; CHECK-NEXT: sxth r5, r5 +; CHECK-NEXT: vmov.16 q2[4], r2 +; CHECK-NEXT: sdiv r0, r0, r1 +; CHECK-NEXT: vmov.16 q2[5], r0 +; CHECK-NEXT: sdiv r0, r5, r4 +; CHECK-NEXT: vmov.16 q2[6], r0 +; CHECK-NEXT: sdiv r0, r6, lr +; CHECK-NEXT: vmov.16 q2[7], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %out = sdiv <8 x i16> %in1, %in2 + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @urem_i16(<8 x i16> %in1, <8 x i16> %in2) { +; CHECK-LABEL: urem_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vmov.u16 r0, q1[6] +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: udiv r2, r1, r0 +; CHECK-NEXT: mls r12, r2, r0, r1 +; CHECK-NEXT: vmov.u16 r1, q1[7] +; CHECK-NEXT: vmov.u16 r2, q0[7] +; CHECK-NEXT: udiv r3, r2, r1 +; CHECK-NEXT: mls lr, r3, r1, r2 +; CHECK-NEXT: vmov.u16 r2, q1[4] +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: udiv r0, r3, r2 +; CHECK-NEXT: mls r2, r0, r2, r3 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: udiv r1, r3, r0 +; CHECK-NEXT: mls r0, r1, r0, r3 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: vmov.u16 r3, q0[2] +; CHECK-NEXT: udiv r4, r3, r1 +; CHECK-NEXT: mls r1, r4, r1, r3 +; CHECK-NEXT: vmov.u16 r3, q1[3] +; CHECK-NEXT: vmov.u16 r4, q0[3] +; CHECK-NEXT: udiv r5, r4, r3 +; CHECK-NEXT: mls r3, r5, r3, r4 +; CHECK-NEXT: vmov.u16 r4, q1[0] +; CHECK-NEXT: vmov.u16 r5, q0[0] +; CHECK-NEXT: udiv r6, r5, r4 +; CHECK-NEXT: mls r4, r6, r4, r5 +; CHECK-NEXT: vmov.u16 r6, q0[1] +; CHECK-NEXT: vmov.u16 r5, q1[1] +; CHECK-NEXT: udiv r7, r6, r5 +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: mls r5, r7, r5, r6 +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vmov.16 q0[3], r3 +; CHECK-NEXT: vmov.16 q0[4], r2 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.16 q0[6], r12 +; CHECK-NEXT: vmov.16 q0[7], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +entry: + %out = urem <8 x i16> %in1, %in2 + ret <8 x i16> %out +} + +define arm_aapcs_vfpcc <8 x i16> @srem_i16(<8 x i16> %in1, <8 x i16> %in2) { +; CHECK-LABEL: srem_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: vmov.u16 r5, q1[6] +; CHECK-NEXT: vmov.u16 r6, q0[6] +; CHECK-NEXT: sxth r5, r5 +; CHECK-NEXT: sxth r6, r6 +; CHECK-NEXT: vmov.u16 r0, q1[0] +; CHECK-NEXT: sdiv r7, r6, r5 +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: sxth.w r8, r0 +; CHECK-NEXT: vmov.u16 r0, q1[3] +; CHECK-NEXT: mls r12, r7, r5, r6 +; CHECK-NEXT: vmov.u16 r7, q0[7] +; CHECK-NEXT: sxth r3, r0 +; CHECK-NEXT: vmov.u16 r0, q1[2] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sxth r7, r7 +; CHECK-NEXT: sxth r4, r0 +; CHECK-NEXT: vmov.u16 r0, q1[5] +; CHECK-NEXT: sdiv r6, r7, r2 +; CHECK-NEXT: mls lr, r6, r2, r7 +; CHECK-NEXT: vmov.u16 r2, q0[4] +; CHECK-NEXT: sxth r1, r0 +; CHECK-NEXT: vmov.u16 r0, q1[4] +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sdiv r5, r2, r0 +; CHECK-NEXT: vmov.u16 r6, q0[1] +; CHECK-NEXT: mls r0, r5, r0, r2 +; CHECK-NEXT: vmov.u16 r2, q0[5] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sdiv r5, r2, r1 +; CHECK-NEXT: sxth r6, r6 +; CHECK-NEXT: mls r1, r5, r1, r2 +; CHECK-NEXT: vmov.u16 r2, q0[2] +; CHECK-NEXT: sxth r2, r2 +; CHECK-NEXT: sdiv r5, r2, r4 +; CHECK-NEXT: mls r2, r5, r4, r2 +; CHECK-NEXT: vmov.u16 r4, q0[3] +; CHECK-NEXT: sxth r4, r4 +; CHECK-NEXT: sdiv r5, r4, r3 +; CHECK-NEXT: mls r3, r5, r3, r4 +; CHECK-NEXT: vmov.u16 r4, q0[0] +; CHECK-NEXT: sxth r4, r4 +; CHECK-NEXT: sdiv r5, r4, r8 +; CHECK-NEXT: mls r4, r5, r8, r4 +; CHECK-NEXT: vmov.u16 r5, q1[1] +; CHECK-NEXT: sxth r5, r5 +; CHECK-NEXT: sdiv r7, r6, r5 +; CHECK-NEXT: vmov.16 q0[0], r4 +; CHECK-NEXT: mls r5, r7, r5, r6 +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r3 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r12 +; CHECK-NEXT: vmov.16 q0[7], lr +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +entry: + %out = srem <8 x i16> %in1, %in2 + ret <8 x i16> %out +} + + +define arm_aapcs_vfpcc <16 x i8> @udiv_i8(<16 x i8> %in1, <16 x i8> %in2) { +; CHECK-LABEL: udiv_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q1[1] +; CHECK-NEXT: vmov.u8 r2, q0[1] +; CHECK-NEXT: vmov.8 q2[0], r0 +; CHECK-NEXT: udiv r1, r2, r1 +; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: vmov.8 q2[1], r1 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov.8 q2[2], r0 +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.8 q2[3], r0 +; CHECK-NEXT: vmov.u8 r0, q1[4] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: vmov.8 q2[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: vmov.8 q2[5], r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: vmov.8 q2[6], r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: vmov.8 q2[7], r0 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: vmov.8 q2[8], r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: vmov.8 q2[9], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: vmov.8 q2[10], r0 +; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov.8 q2[11], r0 +; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: vmov.8 q2[12], r0 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.8 q2[13], r0 +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.8 q2[14], r0 +; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: udiv r0, r1, r0 +; CHECK-NEXT: vmov.8 q2[15], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %out = udiv <16 x i8> %in1, %in2 + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @sdiv_i8(<16 x i8> %in1, <16 x i8> %in2) { +; CHECK-LABEL: sdiv_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vmov.u8 r0, q1[1] +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: sdiv r0, r1, r0 +; CHECK-NEXT: vmov.u8 r1, q1[0] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vmov.u8 r4, q1[3] +; CHECK-NEXT: sdiv r1, r2, r1 +; CHECK-NEXT: vmov.u8 r5, q0[3] +; CHECK-NEXT: vmov.8 q2[0], r1 +; CHECK-NEXT: vmov.u8 r1, q1[2] +; CHECK-NEXT: vmov.8 q2[1], r0 +; CHECK-NEXT: vmov.u8 r0, q0[2] +; CHECK-NEXT: vmov.u8 r2, q1[11] +; CHECK-NEXT: vmov.u8 r3, q0[11] +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sdiv r0, r0, r1 +; CHECK-NEXT: sxtb.w r12, r2 +; CHECK-NEXT: sxtb.w lr, r3 +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: vmov.u8 r3, q0[4] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.8 q2[2], r0 +; CHECK-NEXT: sdiv r0, r5, r4 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.8 q2[3], r0 +; CHECK-NEXT: sdiv r0, r3, r2 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: vmov.8 q2[4], r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sdiv r12, lr, r12 +; CHECK-NEXT: sdiv lr, r1, r0 +; CHECK-NEXT: vmov.u8 r0, q1[9] +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sdiv r2, r1, r0 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: vmov.u8 r3, q0[7] +; CHECK-NEXT: sdiv r1, r1, r0 +; CHECK-NEXT: vmov.u8 r0, q1[7] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: sdiv r4, r3, r0 +; CHECK-NEXT: vmov.u8 r0, q1[6] +; CHECK-NEXT: vmov.u8 r3, q0[6] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.u8 r6, q0[12] +; CHECK-NEXT: sdiv r5, r3, r0 +; CHECK-NEXT: vmov.u8 r0, q1[5] +; CHECK-NEXT: vmov.u8 r3, q0[5] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: sxtb r6, r6 +; CHECK-NEXT: sdiv r0, r3, r0 +; CHECK-NEXT: vmov.u8 r3, q1[15] +; CHECK-NEXT: vmov.8 q2[5], r0 +; CHECK-NEXT: sxtb r7, r3 +; CHECK-NEXT: vmov.8 q2[6], r5 +; CHECK-NEXT: vmov.u8 r3, q1[12] +; CHECK-NEXT: vmov.8 q2[7], r4 +; CHECK-NEXT: sxtb r3, r3 +; CHECK-NEXT: vmov.8 q2[8], r1 +; CHECK-NEXT: vmov.u8 r1, q1[13] +; CHECK-NEXT: vmov.8 q2[9], r2 +; CHECK-NEXT: vmov.u8 r2, q0[13] +; CHECK-NEXT: vmov.8 q2[10], lr +; CHECK-NEXT: vmov.u8 r5, q1[14] +; CHECK-NEXT: vmov.u8 r4, q0[14] +; CHECK-NEXT: sxtb r1, r1 +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.8 q2[11], r12 +; CHECK-NEXT: sdiv r3, r6, r3 +; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov.8 q2[12], r3 +; CHECK-NEXT: sdiv r1, r2, r1 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: vmov.8 q2[13], r1 +; CHECK-NEXT: sdiv r1, r4, r5 +; CHECK-NEXT: sdiv r0, r0, r7 +; CHECK-NEXT: vmov.8 q2[14], r1 +; CHECK-NEXT: vmov.8 q2[15], r0 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +entry: + %out = sdiv <16 x i8> %in1, %in2 + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @urem_i8(<16 x i8> %in1, <16 x i8> %in2) { +; CHECK-LABEL: urem_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: vmov.u8 r0, q1[14] +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: udiv r2, r1, r0 +; CHECK-NEXT: mls r12, r2, r0, r1 +; CHECK-NEXT: vmov.u8 r0, q1[15] +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: udiv r2, r1, r0 +; CHECK-NEXT: mls lr, r2, r0, r1 +; CHECK-NEXT: vmov.u8 r0, q1[12] +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: udiv r2, r1, r0 +; CHECK-NEXT: mls r8, r2, r0, r1 +; CHECK-NEXT: vmov.u8 r0, q1[13] +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: udiv r3, r1, r0 +; CHECK-NEXT: mls r3, r3, r0, r1 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: udiv r4, r1, r0 +; CHECK-NEXT: mls r0, r4, r0, r1 +; CHECK-NEXT: vmov.u8 r1, q1[11] +; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: udiv r5, r4, r1 +; CHECK-NEXT: mls r1, r5, r1, r4 +; CHECK-NEXT: vmov.u8 r4, q1[8] +; CHECK-NEXT: vmov.u8 r5, q0[8] +; CHECK-NEXT: udiv r6, r5, r4 +; CHECK-NEXT: mls r4, r6, r4, r5 +; CHECK-NEXT: vmov.u8 r5, q1[0] +; CHECK-NEXT: vmov.u8 r6, q0[0] +; CHECK-NEXT: udiv r7, r6, r5 +; CHECK-NEXT: mls r5, r7, r5, r6 +; CHECK-NEXT: vmov.u8 r6, q1[1] +; CHECK-NEXT: vmov.u8 r7, q0[1] +; CHECK-NEXT: udiv r2, r7, r6 +; CHECK-NEXT: vmov.8 q2[0], r5 +; CHECK-NEXT: mls r2, r2, r6, r7 +; CHECK-NEXT: vmov.u8 r5, q0[2] +; CHECK-NEXT: vmov.8 q2[1], r2 +; CHECK-NEXT: vmov.u8 r2, q1[2] +; CHECK-NEXT: udiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[3] +; CHECK-NEXT: vmov.8 q2[2], r2 +; CHECK-NEXT: vmov.u8 r2, q1[3] +; CHECK-NEXT: udiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[4] +; CHECK-NEXT: vmov.8 q2[3], r2 +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: udiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[5] +; CHECK-NEXT: vmov.8 q2[4], r2 +; CHECK-NEXT: vmov.u8 r2, q1[5] +; CHECK-NEXT: udiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[6] +; CHECK-NEXT: vmov.8 q2[5], r2 +; CHECK-NEXT: vmov.u8 r2, q1[6] +; CHECK-NEXT: udiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[7] +; CHECK-NEXT: vmov.8 q2[6], r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: udiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[9] +; CHECK-NEXT: vmov.8 q2[7], r2 +; CHECK-NEXT: vmov.u8 r2, q1[9] +; CHECK-NEXT: udiv r6, r5, r2 +; CHECK-NEXT: vmov.8 q2[8], r4 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.8 q2[9], r2 +; CHECK-NEXT: vmov.8 q2[10], r0 +; CHECK-NEXT: vmov.8 q2[11], r1 +; CHECK-NEXT: vmov.8 q2[12], r8 +; CHECK-NEXT: vmov.8 q2[13], r3 +; CHECK-NEXT: vmov.8 q2[14], r12 +; CHECK-NEXT: vmov.8 q2[15], lr +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +entry: + %out = urem <16 x i8> %in1, %in2 + ret <16 x i8> %out +} + +define arm_aapcs_vfpcc <16 x i8> @srem_i8(<16 x i8> %in1, <16 x i8> %in2) { +; CHECK-LABEL: srem_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} +; CHECK-NEXT: vmov.u8 r5, q1[14] +; CHECK-NEXT: vmov.u8 r6, q0[14] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: sxtb r6, r6 +; CHECK-NEXT: sdiv r7, r6, r5 +; CHECK-NEXT: vmov.u8 r4, q1[15] +; CHECK-NEXT: mls r12, r7, r5, r6 +; CHECK-NEXT: vmov.u8 r7, q0[15] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov.u8 r2, q1[13] +; CHECK-NEXT: sxtb r7, r7 +; CHECK-NEXT: sxtb r3, r2 +; CHECK-NEXT: sdiv r6, r7, r4 +; CHECK-NEXT: vmov.u8 r2, q1[12] +; CHECK-NEXT: mls lr, r6, r4, r7 +; CHECK-NEXT: vmov.u8 r4, q0[12] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.u8 r0, q1[8] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sxtb.w r8, r0 +; CHECK-NEXT: sdiv r5, r4, r2 +; CHECK-NEXT: vmov.u8 r0, q1[11] +; CHECK-NEXT: mls r9, r5, r2, r4 +; CHECK-NEXT: vmov.u8 r4, q0[13] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: vmov.u8 r6, q0[0] +; CHECK-NEXT: sdiv r5, r4, r3 +; CHECK-NEXT: sxtb r1, r0 +; CHECK-NEXT: vmov.u8 r0, q1[10] +; CHECK-NEXT: mls r3, r5, r3, r4 +; CHECK-NEXT: vmov.u8 r4, q0[10] +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sxtb r6, r6 +; CHECK-NEXT: sdiv r5, r4, r0 +; CHECK-NEXT: mls r0, r5, r0, r4 +; CHECK-NEXT: vmov.u8 r4, q0[11] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sdiv r5, r4, r1 +; CHECK-NEXT: mls r1, r5, r1, r4 +; CHECK-NEXT: vmov.u8 r4, q0[8] +; CHECK-NEXT: sxtb r4, r4 +; CHECK-NEXT: sdiv r5, r4, r8 +; CHECK-NEXT: mls r4, r5, r8, r4 +; CHECK-NEXT: vmov.u8 r5, q1[0] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: sdiv r7, r6, r5 +; CHECK-NEXT: mls r5, r7, r5, r6 +; CHECK-NEXT: vmov.u8 r6, q1[1] +; CHECK-NEXT: vmov.u8 r7, q0[1] +; CHECK-NEXT: sxtb r6, r6 +; CHECK-NEXT: sxtb r7, r7 +; CHECK-NEXT: vmov.8 q2[0], r5 +; CHECK-NEXT: sdiv r2, r7, r6 +; CHECK-NEXT: vmov.u8 r5, q0[2] +; CHECK-NEXT: mls r2, r2, r6, r7 +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.8 q2[1], r2 +; CHECK-NEXT: vmov.u8 r2, q1[2] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sdiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[3] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.8 q2[2], r2 +; CHECK-NEXT: vmov.u8 r2, q1[3] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sdiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[4] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.8 q2[3], r2 +; CHECK-NEXT: vmov.u8 r2, q1[4] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sdiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[5] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.8 q2[4], r2 +; CHECK-NEXT: vmov.u8 r2, q1[5] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sdiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[6] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.8 q2[5], r2 +; CHECK-NEXT: vmov.u8 r2, q1[6] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sdiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[7] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.8 q2[6], r2 +; CHECK-NEXT: vmov.u8 r2, q1[7] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: sdiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.u8 r5, q0[9] +; CHECK-NEXT: sxtb r5, r5 +; CHECK-NEXT: vmov.8 q2[7], r2 +; CHECK-NEXT: vmov.u8 r2, q1[9] +; CHECK-NEXT: sxtb r2, r2 +; CHECK-NEXT: vmov.8 q2[8], r4 +; CHECK-NEXT: sdiv r6, r5, r2 +; CHECK-NEXT: mls r2, r6, r2, r5 +; CHECK-NEXT: vmov.8 q2[9], r2 +; CHECK-NEXT: vmov.8 q2[10], r0 +; CHECK-NEXT: vmov.8 q2[11], r1 +; CHECK-NEXT: vmov.8 q2[12], r9 +; CHECK-NEXT: vmov.8 q2[13], r3 +; CHECK-NEXT: vmov.8 q2[14], r12 +; CHECK-NEXT: vmov.8 q2[15], lr +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} +entry: + %out = srem <16 x i8> %in1, %in2 + ret <16 x i8> %out +} + + +define arm_aapcs_vfpcc <4 x float> @fdiv_f32(<4 x float> %in1, <4 x float> %in2) { +; CHECK-MVE-LABEL: fdiv_f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vdiv.f32 s8, s0, s4 +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vdiv.f32 s10, s1, s5 +; CHECK-MVE-NEXT: vdiv.f32 s12, s2, s6 +; CHECK-MVE-NEXT: vdiv.f32 s4, s3, s7 +; CHECK-MVE-NEXT: vdup.32 q0, r0 +; CHECK-MVE-NEXT: vmov.f32 s0, s8 +; CHECK-MVE-NEXT: vmov.f32 s1, s10 +; CHECK-MVE-NEXT: vmov.f32 s2, s12 +; CHECK-MVE-NEXT: vmov.f32 s3, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: fdiv_f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vmov q2, q0 +; CHECK-MVEFP-NEXT: vdiv.f32 s0, s8, s4 +; CHECK-MVEFP-NEXT: vdiv.f32 s1, s9, s5 +; CHECK-MVEFP-NEXT: vdiv.f32 s2, s10, s6 +; CHECK-MVEFP-NEXT: vdiv.f32 s3, s11, s7 +; CHECK-MVEFP-NEXT: bx lr +entry: + %out = fdiv <4 x float> %in1, %in2 + ret <4 x float> %out +} + +define arm_aapcs_vfpcc <4 x float> @frem_f32(<4 x float> %in1, <4 x float> %in2) { +; CHECK-MVE-LABEL: frem_f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .save {r4, r5, r6, lr} +; CHECK-MVE-NEXT: push {r4, r5, r6, lr} +; CHECK-MVE-NEXT: .vsave {d8, d9, d10} +; CHECK-MVE-NEXT: vpush {d8, d9, d10} +; CHECK-MVE-NEXT: .pad #32 +; CHECK-MVE-NEXT: sub sp, #32 +; CHECK-MVE-NEXT: vstr s3, [sp, #24] +; CHECK-MVE-NEXT: ldr r4, [sp, #24] +; CHECK-MVE-NEXT: vstr s1, [sp, #8] +; CHECK-MVE-NEXT: vstr s5, [sp, #12] +; CHECK-MVE-NEXT: vstr s0, [sp] +; CHECK-MVE-NEXT: vstr s4, [sp, #4] +; CHECK-MVE-NEXT: vstr s7, [sp, #28] +; CHECK-MVE-NEXT: vstr s2, [sp, #16] +; CHECK-MVE-NEXT: vstr s6, [sp, #20] +; CHECK-MVE-NEXT: ldrd r0, r1, [sp, #8] +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: mov r5, r0 +; CHECK-MVE-NEXT: ldrd r0, r1, [sp] +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: ldrd r2, r1, [sp, #16] +; CHECK-MVE-NEXT: vmov s16, r0 +; CHECK-MVE-NEXT: ldr r6, [sp, #28] +; CHECK-MVE-NEXT: vmov s18, r5 +; CHECK-MVE-NEXT: mov r0, r2 +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: vmov s20, r0 +; CHECK-MVE-NEXT: mov r0, r4 +; CHECK-MVE-NEXT: mov r1, r6 +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmov s4, r0 +; CHECK-MVE-NEXT: vdup.32 q0, r1 +; CHECK-MVE-NEXT: vmov.f32 s0, s16 +; CHECK-MVE-NEXT: vmov.f32 s1, s18 +; CHECK-MVE-NEXT: vmov.f32 s2, s20 +; CHECK-MVE-NEXT: vmov.f32 s3, s4 +; CHECK-MVE-NEXT: add sp, #32 +; CHECK-MVE-NEXT: vpop {d8, d9, d10} +; CHECK-MVE-NEXT: pop {r4, r5, r6, pc} +; +; CHECK-MVEFP-LABEL: frem_f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: .save {r4, r5, r6, lr} +; CHECK-MVEFP-NEXT: push {r4, r5, r6, lr} +; CHECK-MVEFP-NEXT: .vsave {d8, d9} +; CHECK-MVEFP-NEXT: vpush {d8, d9} +; CHECK-MVEFP-NEXT: .pad #32 +; CHECK-MVEFP-NEXT: sub sp, #32 +; CHECK-MVEFP-NEXT: vstr s3, [sp, #24] +; CHECK-MVEFP-NEXT: ldr r4, [sp, #24] +; CHECK-MVEFP-NEXT: vstr s1, [sp, #8] +; CHECK-MVEFP-NEXT: vstr s5, [sp, #12] +; CHECK-MVEFP-NEXT: vstr s0, [sp] +; CHECK-MVEFP-NEXT: vstr s4, [sp, #4] +; CHECK-MVEFP-NEXT: vstr s7, [sp, #28] +; CHECK-MVEFP-NEXT: vstr s2, [sp, #16] +; CHECK-MVEFP-NEXT: vstr s6, [sp, #20] +; CHECK-MVEFP-NEXT: ldrd r0, r1, [sp, #8] +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: mov r5, r0 +; CHECK-MVEFP-NEXT: ldrd r0, r1, [sp] +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: ldrd r2, r1, [sp, #16] +; CHECK-MVEFP-NEXT: vmov s16, r0 +; CHECK-MVEFP-NEXT: ldr r6, [sp, #28] +; CHECK-MVEFP-NEXT: vmov s17, r5 +; CHECK-MVEFP-NEXT: mov r0, r2 +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: vmov s18, r0 +; CHECK-MVEFP-NEXT: mov r0, r4 +; CHECK-MVEFP-NEXT: mov r1, r6 +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: vmov s19, r0 +; CHECK-MVEFP-NEXT: vmov q0, q4 +; CHECK-MVEFP-NEXT: add sp, #32 +; CHECK-MVEFP-NEXT: vpop {d8, d9} +; CHECK-MVEFP-NEXT: pop {r4, r5, r6, pc} +entry: + %out = frem <4 x float> %in1, %in2 + ret <4 x float> %out +} + + +define arm_aapcs_vfpcc <8 x half> @fdiv_f16(<8 x half> %in1, <8 x half> %in2) { +; CHECK-MVE-LABEL: fdiv_f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vmov.u16 r0, q1[0] +; CHECK-MVE-NEXT: vmov.u16 r1, q0[0] +; CHECK-MVE-NEXT: vmov s10, r1 +; CHECK-MVE-NEXT: vmov.u16 r1, q1[1] +; CHECK-MVE-NEXT: vmov s8, r0 +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: vdiv.f16 s8, s10, s8 +; CHECK-MVE-NEXT: vmov r0, s8 +; CHECK-MVE-NEXT: vmov s8, r1 +; CHECK-MVE-NEXT: vmov.u16 r1, q0[1] +; CHECK-MVE-NEXT: vmov s10, r1 +; CHECK-MVE-NEXT: vdiv.f16 s8, s10, s8 +; CHECK-MVE-NEXT: vmov r1, s8 +; CHECK-MVE-NEXT: vdup.16 q2, r2 +; CHECK-MVE-NEXT: vmov.16 q2[0], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[2] +; CHECK-MVE-NEXT: vmov s12, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[2] +; CHECK-MVE-NEXT: vmov s14, r0 +; CHECK-MVE-NEXT: vmov.16 q2[1], r1 +; CHECK-MVE-NEXT: vdiv.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmov r0, s12 +; CHECK-MVE-NEXT: vmov.16 q2[2], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[3] +; CHECK-MVE-NEXT: vmov s12, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[3] +; CHECK-MVE-NEXT: vmov s14, r0 +; CHECK-MVE-NEXT: vdiv.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmov r0, s12 +; CHECK-MVE-NEXT: vmov.16 q2[3], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[4] +; CHECK-MVE-NEXT: vmov s12, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[4] +; CHECK-MVE-NEXT: vmov s14, r0 +; CHECK-MVE-NEXT: vdiv.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmov r0, s12 +; CHECK-MVE-NEXT: vmov.16 q2[4], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[5] +; CHECK-MVE-NEXT: vmov s12, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[5] +; CHECK-MVE-NEXT: vmov s14, r0 +; CHECK-MVE-NEXT: vdiv.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmov r0, s12 +; CHECK-MVE-NEXT: vmov.16 q2[5], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[6] +; CHECK-MVE-NEXT: vmov s12, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[6] +; CHECK-MVE-NEXT: vmov s14, r0 +; CHECK-MVE-NEXT: vdiv.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmov r0, s12 +; CHECK-MVE-NEXT: vmov.16 q2[6], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[7] +; CHECK-MVE-NEXT: vmov s4, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q0[7] +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vdiv.f16 s0, s0, s4 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q2[7], r0 +; CHECK-MVE-NEXT: vmov q0, q2 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: fdiv_f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vmov.u16 r0, q1[0] +; CHECK-MVEFP-NEXT: vmov.u16 r1, q1[1] +; CHECK-MVEFP-NEXT: vmov s8, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[0] +; CHECK-MVEFP-NEXT: vmov s10, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r2, q0[1] +; CHECK-MVEFP-NEXT: vdiv.f16 s8, s10, s8 +; CHECK-MVEFP-NEXT: vmov s10, r2 +; CHECK-MVEFP-NEXT: vmov r0, s8 +; CHECK-MVEFP-NEXT: vmov s8, r1 +; CHECK-MVEFP-NEXT: vdiv.f16 s8, s10, s8 +; CHECK-MVEFP-NEXT: vmov r1, s8 +; CHECK-MVEFP-NEXT: vmov.16 q2[0], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q1[2] +; CHECK-MVEFP-NEXT: vmov.16 q2[1], r1 +; CHECK-MVEFP-NEXT: vmov s12, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[2] +; CHECK-MVEFP-NEXT: vmov s14, r0 +; CHECK-MVEFP-NEXT: vdiv.f16 s12, s14, s12 +; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: vmov.16 q2[2], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q1[3] +; CHECK-MVEFP-NEXT: vmov s12, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[3] +; CHECK-MVEFP-NEXT: vmov s14, r0 +; CHECK-MVEFP-NEXT: vdiv.f16 s12, s14, s12 +; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: vmov.16 q2[3], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q1[4] +; CHECK-MVEFP-NEXT: vmov s12, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[4] +; CHECK-MVEFP-NEXT: vmov s14, r0 +; CHECK-MVEFP-NEXT: vdiv.f16 s12, s14, s12 +; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: vmov.16 q2[4], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q1[5] +; CHECK-MVEFP-NEXT: vmov s12, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[5] +; CHECK-MVEFP-NEXT: vmov s14, r0 +; CHECK-MVEFP-NEXT: vdiv.f16 s12, s14, s12 +; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: vmov.16 q2[5], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q1[6] +; CHECK-MVEFP-NEXT: vmov s12, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[6] +; CHECK-MVEFP-NEXT: vmov s14, r0 +; CHECK-MVEFP-NEXT: vdiv.f16 s12, s14, s12 +; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: vmov.16 q2[6], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q1[7] +; CHECK-MVEFP-NEXT: vmov s4, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q0[7] +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vdiv.f16 s0, s0, s4 +; CHECK-MVEFP-NEXT: vmov r0, s0 +; CHECK-MVEFP-NEXT: vmov.16 q2[7], r0 +; CHECK-MVEFP-NEXT: vmov q0, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %out = fdiv <8 x half> %in1, %in2 + ret <8 x half> %out +} + +define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) { +; CHECK-MVE-LABEL: frem_f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .save {r4, lr} +; CHECK-MVE-NEXT: push {r4, lr} +; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-MVE-NEXT: .pad #64 +; CHECK-MVE-NEXT: sub sp, #64 +; CHECK-MVE-NEXT: vmov q5, q1 +; CHECK-MVE-NEXT: vmov q4, q0 +; CHECK-MVE-NEXT: vmov.u16 r0, q1[0] +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q4[0] +; CHECK-MVE-NEXT: vmov s2, r0 +; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVE-NEXT: vstr s2, [sp, #56] +; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVE-NEXT: vstr s0, [sp, #60] +; CHECK-MVE-NEXT: ldrd r0, r1, [sp, #56] +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q5[1] +; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vmov r4, s0 +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q4[1] +; CHECK-MVE-NEXT: vmov s2, r0 +; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVE-NEXT: vstr s2, [sp, #48] +; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVE-NEXT: vstr s0, [sp, #52] +; CHECK-MVE-NEXT: ldrd r0, r1, [sp, #48] +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vdup.16 q6, r1 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q6[0], r4 +; CHECK-MVE-NEXT: vmov.16 q6[1], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q5[2] +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q4[2] +; CHECK-MVE-NEXT: vmov s2, r0 +; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVE-NEXT: vstr s2, [sp, #40] +; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVE-NEXT: vstr s0, [sp, #44] +; CHECK-MVE-NEXT: ldrd r0, r1, [sp, #40] +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q6[2], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q5[3] +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q4[3] +; CHECK-MVE-NEXT: vmov s2, r0 +; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVE-NEXT: vstr s2, [sp, #32] +; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVE-NEXT: vstr s0, [sp, #36] +; CHECK-MVE-NEXT: ldrd r0, r1, [sp, #32] +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q6[3], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q5[4] +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q4[4] +; CHECK-MVE-NEXT: vmov s2, r0 +; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVE-NEXT: vstr s2, [sp, #24] +; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVE-NEXT: vstr s0, [sp, #28] +; CHECK-MVE-NEXT: ldrd r0, r1, [sp, #24] +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q6[4], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q5[5] +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q4[5] +; CHECK-MVE-NEXT: vmov s2, r0 +; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVE-NEXT: vstr s2, [sp, #16] +; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVE-NEXT: vstr s0, [sp, #20] +; CHECK-MVE-NEXT: ldrd r0, r1, [sp, #16] +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q6[5], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q5[6] +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q4[6] +; CHECK-MVE-NEXT: vmov s2, r0 +; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVE-NEXT: vstr s2, [sp, #8] +; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVE-NEXT: vstr s0, [sp, #12] +; CHECK-MVE-NEXT: ldrd r0, r1, [sp, #8] +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q6[6], r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q5[7] +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vmov.u16 r0, q4[7] +; CHECK-MVE-NEXT: vmov s2, r0 +; CHECK-MVE-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVE-NEXT: vstr s2, [sp] +; CHECK-MVE-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVE-NEXT: vstr s0, [sp, #4] +; CHECK-MVE-NEXT: ldrd r0, r1, [sp] +; CHECK-MVE-NEXT: bl fmodf +; CHECK-MVE-NEXT: vmov s0, r0 +; CHECK-MVE-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q6[7], r0 +; CHECK-MVE-NEXT: vmov q0, q6 +; CHECK-MVE-NEXT: add sp, #64 +; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-MVE-NEXT: pop {r4, pc} +; +; CHECK-MVEFP-LABEL: frem_f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: .save {r4, lr} +; CHECK-MVEFP-NEXT: push {r4, lr} +; CHECK-MVEFP-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-MVEFP-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-MVEFP-NEXT: .pad #64 +; CHECK-MVEFP-NEXT: sub sp, #64 +; CHECK-MVEFP-NEXT: vmov q5, q1 +; CHECK-MVEFP-NEXT: vmov q4, q0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q1[0] +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q4[0] +; CHECK-MVEFP-NEXT: vmov s2, r0 +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVEFP-NEXT: vstr s2, [sp, #56] +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVEFP-NEXT: vstr s0, [sp, #60] +; CHECK-MVEFP-NEXT: ldrd r0, r1, [sp, #56] +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q5[1] +; CHECK-MVEFP-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVEFP-NEXT: vmov r4, s0 +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q4[1] +; CHECK-MVEFP-NEXT: vmov s2, r0 +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVEFP-NEXT: vstr s2, [sp, #48] +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVEFP-NEXT: vstr s0, [sp, #52] +; CHECK-MVEFP-NEXT: ldrd r0, r1, [sp, #48] +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vmov.16 q6[0], r4 +; CHECK-MVEFP-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVEFP-NEXT: vmov r0, s0 +; CHECK-MVEFP-NEXT: vmov.16 q6[1], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q5[2] +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q4[2] +; CHECK-MVEFP-NEXT: vmov s2, r0 +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVEFP-NEXT: vstr s2, [sp, #40] +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVEFP-NEXT: vstr s0, [sp, #44] +; CHECK-MVEFP-NEXT: ldrd r0, r1, [sp, #40] +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVEFP-NEXT: vmov r0, s0 +; CHECK-MVEFP-NEXT: vmov.16 q6[2], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q5[3] +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q4[3] +; CHECK-MVEFP-NEXT: vmov s2, r0 +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVEFP-NEXT: vstr s2, [sp, #32] +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVEFP-NEXT: vstr s0, [sp, #36] +; CHECK-MVEFP-NEXT: ldrd r0, r1, [sp, #32] +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVEFP-NEXT: vmov r0, s0 +; CHECK-MVEFP-NEXT: vmov.16 q6[3], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q5[4] +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q4[4] +; CHECK-MVEFP-NEXT: vmov s2, r0 +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVEFP-NEXT: vstr s2, [sp, #24] +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVEFP-NEXT: vstr s0, [sp, #28] +; CHECK-MVEFP-NEXT: ldrd r0, r1, [sp, #24] +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVEFP-NEXT: vmov r0, s0 +; CHECK-MVEFP-NEXT: vmov.16 q6[4], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q5[5] +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q4[5] +; CHECK-MVEFP-NEXT: vmov s2, r0 +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVEFP-NEXT: vstr s2, [sp, #16] +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVEFP-NEXT: vstr s0, [sp, #20] +; CHECK-MVEFP-NEXT: ldrd r0, r1, [sp, #16] +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVEFP-NEXT: vmov r0, s0 +; CHECK-MVEFP-NEXT: vmov.16 q6[5], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q5[6] +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q4[6] +; CHECK-MVEFP-NEXT: vmov s2, r0 +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVEFP-NEXT: vstr s2, [sp, #8] +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVEFP-NEXT: vstr s0, [sp, #12] +; CHECK-MVEFP-NEXT: ldrd r0, r1, [sp, #8] +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVEFP-NEXT: vmov r0, s0 +; CHECK-MVEFP-NEXT: vmov.16 q6[6], r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q5[7] +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vmov.u16 r0, q4[7] +; CHECK-MVEFP-NEXT: vmov s2, r0 +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s2, s2 +; CHECK-MVEFP-NEXT: vstr s2, [sp] +; CHECK-MVEFP-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-MVEFP-NEXT: vstr s0, [sp, #4] +; CHECK-MVEFP-NEXT: ldrd r0, r1, [sp] +; CHECK-MVEFP-NEXT: bl fmodf +; CHECK-MVEFP-NEXT: vmov s0, r0 +; CHECK-MVEFP-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-MVEFP-NEXT: vmov r0, s0 +; CHECK-MVEFP-NEXT: vmov.16 q6[7], r0 +; CHECK-MVEFP-NEXT: vmov q0, q6 +; CHECK-MVEFP-NEXT: add sp, #64 +; CHECK-MVEFP-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-MVEFP-NEXT: pop {r4, pc} +entry: + %out = frem <8 x half> %in1, %in2 + ret <8 x half> %out +}