From a4b415a6839b515e571328ab7eeb403ea8637707 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Tue, 25 Jun 2019 16:48:46 +0000 Subject: [PATCH] [ARM] Code-generation infrastructure for MVE. This provides the low-level support to start using MVE vector types in LLVM IR, loading and storing them, passing them to __asm__ statements containing hand-written MVE vector instructions, and *if* you have the hard-float ABI turned on, using them as function parameters. (In the soft-float ABI, vector types are passed in integer registers, and combining all those 32-bit integers into a q-reg requires support for selection DAG nodes like insert_vector_elt and build_vector which aren't implemented yet for MVE. In fact I've also had to add `arm_aapcs_vfpcc` to a couple of existing tests to avoid that problem.) Specifically, this commit adds support for: * spills, reloads and register moves for MVE vector registers * ditto for the VPT predication mask that lives in VPR.P0 * make all the MVE vector types legal in ISel, and provide selection DAG patterns for BITCAST, LOAD and STORE * make loads and stores of scalar FP types conditional on `hasFPRegs()` rather than `hasVFP2Base()`. As a result a few existing tests needed their llc command lines updating to use `-mattr=-fpregs` as their method of turning off all hardware FP support. Reviewers: dmgreen, samparker, SjoerdMeijer Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D60708 llvm-svn: 364329 --- llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 116 ++++- llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 7 + llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 32 ++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 37 +- llvm/lib/Target/ARM/ARMISelLowering.h | 1 + llvm/lib/Target/ARM/ARMInstrMVE.td | 123 +++++ llvm/lib/Target/ARM/ARMRegisterInfo.td | 7 +- llvm/test/CodeGen/ARM/fast-isel-call.ll | 6 +- llvm/test/CodeGen/ARM/fp16-promote.ll | 2 +- llvm/test/CodeGen/ARM/no-fpu.ll | 4 +- llvm/test/CodeGen/Thumb2/mve-basic.ll | 31 ++ llvm/test/CodeGen/Thumb2/mve-bitcasts.ll | 449 ++++++++++++++++++ .../Transforms/HardwareLoops/ARM/calls.ll | 6 +- 13 files changed, 795 insertions(+), 26 deletions(-) create mode 100644 llvm/test/CodeGen/Thumb2/mve-basic.ll create mode 100644 llvm/test/CodeGen/Thumb2/mve-bitcasts.ll diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index 6bbb98c974a4..de8a04632f5b 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -805,6 +805,28 @@ void ARMBaseInstrInfo::copyToCPSR(MachineBasicBlock &MBB, .addReg(ARM::CPSR, RegState::Implicit | RegState::Define); } +void llvm::addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB) { + MIB.addImm(ARMVCC::None); + MIB.addReg(0); +} + +void llvm::addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, + unsigned DestReg) { + addUnpredicatedMveVpredNOp(MIB); + MIB.addReg(DestReg, RegState::Undef); +} + +void llvm::addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond) { + MIB.addImm(Cond); + MIB.addReg(ARM::VPR, RegState::Implicit); +} + +void llvm::addPredicatedMveVpredROp(MachineInstrBuilder &MIB, + unsigned Cond, unsigned Inactive) { + addPredicatedMveVpredNOp(MIB, Cond); + MIB.addReg(Inactive); +} + void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, unsigned DestReg, @@ -833,14 +855,17 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, else if (ARM::DPRRegClass.contains(DestReg, 
SrcReg) && Subtarget.hasFP64()) Opc = ARM::VMOVD; else if (ARM::QPRRegClass.contains(DestReg, SrcReg)) - Opc = ARM::VORRq; + Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR; if (Opc) { MachineInstrBuilder MIB = BuildMI(MBB, I, DL, get(Opc), DestReg); MIB.addReg(SrcReg, getKillRegState(KillSrc)); - if (Opc == ARM::VORRq) + if (Opc == ARM::VORRq || Opc == ARM::MVE_VORR) MIB.addReg(SrcReg, getKillRegState(KillSrc)); - MIB.add(predOps(ARMCC::AL)); + if (Opc == ARM::MVE_VORR) + addUnpredicatedMveVpredROp(MIB, DestReg); + else + MIB.add(predOps(ARMCC::AL)); return; } @@ -851,11 +876,11 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Use VORRq when possible. if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) { - Opc = ARM::VORRq; + Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR; BeginIdx = ARM::qsub_0; SubRegs = 2; } else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) { - Opc = ARM::VORRq; + Opc = Subtarget.hasNEON() ? ARM::VORRq : ARM::MVE_VORR; BeginIdx = ARM::qsub_0; SubRegs = 4; // Fall back to VMOVD. @@ -901,6 +926,30 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else if (DestReg == ARM::CPSR) { copyToCPSR(MBB, I, SrcReg, KillSrc, Subtarget); return; + } else if (DestReg == ARM::VPR) { + assert(ARM::GPRPairRegClass.contains(SrcReg)); + BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMSR_P0), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .add(predOps(ARMCC::AL)); + return; + } else if (SrcReg == ARM::VPR) { + assert(ARM::GPRPairRegClass.contains(DestReg)); + BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMRS_P0), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .add(predOps(ARMCC::AL)); + return; + } else if (DestReg == ARM::FPSCR_NZCV) { + assert(ARM::GPRPairRegClass.contains(SrcReg)); + BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMSR_FPSCR_NZCVQC), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .add(predOps(ARMCC::AL)); + return; + } else if (SrcReg == ARM::FPSCR_NZCV) { + assert(ARM::GPRPairRegClass.contains(DestReg)); + BuildMI(MBB, I, I->getDebugLoc(), get(ARM::VMRS_FPSCR_NZCVQC), DestReg) + .addReg(SrcReg, getKillRegState(KillSrc)) + .add(predOps(ARMCC::AL)); + return; } assert(Opc && "Impossible reg-to-reg copy"); @@ -925,10 +974,15 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB, DstRegs.insert(Dst); #endif Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst).addReg(Src); - // VORR takes two source operands. - if (Opc == ARM::VORRq) + // VORR (NEON or MVE) takes two source operands. + if (Opc == ARM::VORRq || Opc == ARM::MVE_VORR) { Mov.addReg(Src); - Mov = Mov.add(predOps(ARMCC::AL)); + } + // MVE VORR takes predicate operands in place of an ordinary condition. + if (Opc == ARM::MVE_VORR) + addUnpredicatedMveVpredROp(Mov, Dst); + else + Mov = Mov.add(predOps(ARMCC::AL)); // MOVr can set CC. 
if (Opc == ARM::MOVr) Mov = Mov.add(condCodeOp()); @@ -1010,6 +1064,13 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(0) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); + } else if (ARM::VCCRRegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DebugLoc(), get(ARM::VSTR_P0_off)) + .addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO) + .add(predOps(ARMCC::AL)); } else llvm_unreachable("Unknown reg class!"); break; @@ -1042,7 +1103,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 16: - if (ARM::DPairRegClass.hasSubClassEq(RC)) { + if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) { // Use aligned spills if the stack can be realigned. if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { BuildMI(MBB, I, DebugLoc(), get(ARM::VST1q64)) @@ -1058,6 +1119,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addMemOperand(MMO) .add(predOps(ARMCC::AL)); } + } else if (ARM::QPRRegClass.hasSubClassEq(RC) && + Subtarget.hasMVEIntegerOps()) { + auto MIB = BuildMI(MBB, I, DebugLoc(), get(ARM::MVE_VSTRWU32)); + MIB.addReg(SrcReg, getKillRegState(isKill)) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); + addUnpredicatedMveVpredNOp(MIB); } else llvm_unreachable("Unknown reg class!"); break; @@ -1155,6 +1224,13 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr &MI, return MI.getOperand(0).getReg(); } break; + case ARM::VSTR_P0_off: + if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() && + MI.getOperand(1).getImm() == 0) { + FrameIndex = MI.getOperand(0).getIndex(); + return ARM::P0; + } + break; case ARM::VST1q64: case ARM::VST1d64TPseudo: case ARM::VST1d64QPseudo: @@ -1225,6 +1301,12 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addImm(0) .addMemOperand(MMO) .add(predOps(ARMCC::AL)); + } else if (ARM::VCCRRegClass.hasSubClassEq(RC)) { + BuildMI(MBB, I, DL, get(ARM::VLDR_P0_off), DestReg) + .addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO) + .add(predOps(ARMCC::AL)); } else llvm_unreachable("Unknown reg class!"); break; @@ -1261,7 +1343,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, llvm_unreachable("Unknown reg class!"); break; case 16: - if (ARM::DPairRegClass.hasSubClassEq(RC)) { + if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) { if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) { BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg) .addFrameIndex(FI) @@ -1274,6 +1356,13 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, .addMemOperand(MMO) .add(predOps(ARMCC::AL)); } + } else if (ARM::QPRRegClass.hasSubClassEq(RC) && + Subtarget.hasMVEIntegerOps()) { + auto MIB = BuildMI(MBB, I, DL, get(ARM::MVE_VLDRWU32), DestReg); + MIB.addFrameIndex(FI) + .addImm(0) + .addMemOperand(MMO); + addUnpredicatedMveVpredNOp(MIB); } else llvm_unreachable("Unknown reg class!"); break; @@ -1370,6 +1459,13 @@ unsigned ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, return MI.getOperand(0).getReg(); } break; + case ARM::VLDR_P0_off: + if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() && + MI.getOperand(1).getImm() == 0) { + FrameIndex = MI.getOperand(0).getIndex(); + return ARM::P0; + } + break; case ARM::VLD1q64: case ARM::VLD1d8TPseudo: case ARM::VLD1d16TPseudo: diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h 
index 620a2b6f0501..465d13803fdd 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -591,6 +591,13 @@ bool registerDefinedBetween(unsigned Reg, MachineBasicBlock::iterator From, MachineInstr *findCMPToFoldIntoCBZ(MachineInstr *Br, const TargetRegisterInfo *TRI); +void addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB); +void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned DestReg); + +void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond); +void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond, + unsigned Inactive); + } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index c74459a15425..127d86c91387 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -146,6 +146,9 @@ public: SDValue &OffImm); bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, SDValue &OffImm); + template + bool SelectT2AddrModeImm7(SDValue N, SDValue &Base, + SDValue &OffImm); bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm); bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm); @@ -1268,6 +1271,35 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N, return false; } +template +bool ARMDAGToDAGISel::SelectT2AddrModeImm7(SDValue N, + SDValue &Base, SDValue &OffImm) { + if (N.getOpcode() == ISD::SUB || + CurDAG->isBaseWithConstantOffset(N)) { + if (auto RHS = dyn_cast(N.getOperand(1))) { + int RHSC = (int)RHS->getZExtValue(); + if (N.getOpcode() == ISD::SUB) + RHSC = -RHSC; + + if (isShiftedInt<7, Shift>(RHSC)) { + Base = N.getOperand(0); + if (Base.getOpcode() == ISD::FrameIndex) { + int FI = cast(Base)->getIndex(); + Base = CurDAG->getTargetFrameIndex( + FI, TLI->getPointerTy(CurDAG->getDataLayout())); + } + OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32); + return true; + } + } + } + + // Base only. + Base = N; + OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32); + return true; +} + bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N, SDValue &Base, SDValue &OffReg, SDValue &ShImm) { diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 1ae87e4854e2..8ca947a49692 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -221,6 +221,26 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } +void ARMTargetLowering::addMVEVectorTypes() { + // We 'support' these types up to bitcast/load/store level, regardless of + // MVE integer-only / float support. Only doing FP data processing on the FP + // vector types is inhibited at integer-only level. 
+ + const MVT VecTypes[] = { + MVT::v2i64, MVT::v4i32, MVT::v8i16, MVT::v16i8, + MVT::v2f64, MVT::v4f32, MVT::v8f16, + }; + + for (auto VT : VecTypes) { + addRegisterClass(VT, &ARM::QPRRegClass); + for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc) + setOperationAction(Opc, VT, Expand); + setOperationAction(ISD::BITCAST, VT, Legal); + setOperationAction(ISD::LOAD, VT, Legal); + setOperationAction(ISD::STORE, VT, Legal); + } +} + ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -510,7 +530,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, else addRegisterClass(MVT::i32, &ARM::GPRRegClass); - if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && + if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); addRegisterClass(MVT::f64, &ARM::DPRRegClass); @@ -548,6 +568,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); + if (Subtarget->hasMVEIntegerOps()) + addMVEVectorTypes(); + if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); @@ -566,11 +589,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, addQRTypeForNEON(MVT::v8f16); addDRTypeForNEON(MVT::v4f16); } + } + if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) { // v2f64 is legal so that QR subregs can be extracted as f64 elements, but - // neither Neon nor VFP support any arithmetic operations on it. - // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively - // supported for v4f32. + // none of Neon, MVE or VFP supports any arithmetic operations on it. setOperationAction(ISD::FADD, MVT::v2f64, Expand); setOperationAction(ISD::FSUB, MVT::v2f64, Expand); setOperationAction(ISD::FMUL, MVT::v2f64, Expand); @@ -604,7 +627,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); setOperationAction(ISD::FMA, MVT::v2f64, Expand); + } + if (Subtarget->hasNEON()) { + // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively + // supported for v4f32. setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); setOperationAction(ISD::FSIN, MVT::v4f32, Expand); setOperationAction(ISD::FCOS, MVT::v4f32, Expand); @@ -1040,7 +1067,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); - if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() && + if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() && !Subtarget->isThumb1Only()) { // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. 
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index fa934bf74782..fcdfd14e9dff 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -813,6 +813,7 @@ class VectorType; MachineBasicBlock *MBB) const; MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const; + void addMVEVectorTypes(); }; enum NEONModImmType { diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 2d37255bba8b..5cde34a47850 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -3998,3 +3998,126 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> { let Unpredictable{21-20} = 0b11; let Unpredictable{11-1} = 0b11111111111; } + + +//===----------------------------------------------------------------------===// +// Patterns +//===----------------------------------------------------------------------===// + +class MVE_unpred_vector_store_typed + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr)>; + +multiclass MVE_unpred_vector_store { + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; +} + +class MVE_unpred_vector_load_typed + : Pat<(Ty (LoadKind t2addrmode_imm7:$addr)), + (Ty (RegImmInst t2addrmode_imm7:$addr))>; +multiclass MVE_unpred_vector_load { + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; +} + +let Predicates = [HasMVEInt, IsLE] in { + defm : MVE_unpred_vector_store; + defm : MVE_unpred_vector_store; + defm : MVE_unpred_vector_store; + + defm : MVE_unpred_vector_load; + defm : MVE_unpred_vector_load; + defm : MVE_unpred_vector_load; + + def : Pat<(v16i1 (load t2addrmode_imm7<2>:$addr)), + (v16i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; + def : Pat<(v8i1 (load t2addrmode_imm7<2>:$addr)), + (v8i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; + def : Pat<(v4i1 (load t2addrmode_imm7<2>:$addr)), + (v4i1 (VLDR_P0_off t2addrmode_imm7<2>:$addr))>; +} + +let Predicates = [HasMVEInt, IsBE] in { + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + def : MVE_unpred_vector_store_typed; + + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; + def : MVE_unpred_vector_load_typed; +} + +// Bit convert patterns + +let Predicates = [HasMVEInt] in { + def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>; + + def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>; + + def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>; +} + +let Predicates = [IsLE,HasMVEInt] in { + def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>; + def 
: Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>; + def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>; + + def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>; + + def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>; + + def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>; + def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>; + + def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v2i64 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4f32 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>; + + def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; + def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; + + def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>; + def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>; +} diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td index fc0286e724d8..5eabc6bad219 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -468,8 +468,11 @@ def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, (interleave QPR, TuplesOE2D)> { // Allocate starting at non-VFP2 registers D16-D31 first. // Prefer even-odd pairs as they are easier to copy. - let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16))]; - let AltOrderSelect = [{ return 1; }]; + let AltOrders = [(add (rotl QPR, 8), (rotl DPair, 16)), + (add (trunc QPR, 8), (trunc DPair, 16))]; + let AltOrderSelect = [{ + return 1 + MF.getSubtarget().hasMVEIntegerOps(); + }]; } // Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP. 
diff --git a/llvm/test/CodeGen/ARM/fast-isel-call.ll b/llvm/test/CodeGen/ARM/fast-isel-call.ll index 3e5c79dc6330..9c313c727aee 100644 --- a/llvm/test/CodeGen/ARM/fast-isel-call.ll +++ b/llvm/test/CodeGen/ARM/fast-isel-call.ll @@ -4,9 +4,9 @@ ; RUN: llc -fast-isel-sink-local-values < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=+long-calls | FileCheck %s --check-prefix=ARM-LONG --check-prefix=ARM-LONG-MACHO ; RUN: llc -fast-isel-sink-local-values < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=+long-calls | FileCheck %s --check-prefix=ARM-LONG --check-prefix=ARM-LONG-ELF ; RUN: llc -fast-isel-sink-local-values < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=+long-calls | FileCheck %s --check-prefix=THUMB-LONG -; RUN: llc -fast-isel-sink-local-values < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=-vfp2d16sp | FileCheck %s --check-prefix=ARM-NOVFP -; RUN: llc -fast-isel-sink-local-values < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=-vfp2d16sp | FileCheck %s --check-prefix=ARM-NOVFP -; RUN: llc -fast-isel-sink-local-values < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=-vfp2d16sp | FileCheck %s --check-prefix=THUMB-NOVFP +; RUN: llc -fast-isel-sink-local-values < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios -mattr=-fpregs | FileCheck %s --check-prefix=ARM-NOVFP +; RUN: llc -fast-isel-sink-local-values < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=armv7-linux-gnueabi -mattr=-fpregs | FileCheck %s --check-prefix=ARM-NOVFP +; RUN: llc -fast-isel-sink-local-values < %s -O0 -verify-machineinstrs -fast-isel-abort=1 -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios -mattr=-fpregs | FileCheck %s --check-prefix=THUMB-NOVFP ; Note that some of these tests assume that relocations are either ; movw/movt or constant pool loads. 
Different platforms will select diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll index 855f8d55dcb8..f382144cf95f 100644 --- a/llvm/test/CodeGen/ARM/fp16-promote.ll +++ b/llvm/test/CodeGen/ARM/fp16-promote.ll @@ -1,6 +1,6 @@ ; RUN: llc -asm-verbose=false < %s -mattr=+vfp3,+fp16 | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=CHECK-FP16 --check-prefix=CHECK-VFP -check-prefix=CHECK-ALL ; RUN: llc -asm-verbose=false < %s | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=CHECK-LIBCALL --check-prefix=CHECK-VFP -check-prefix=CHECK-ALL --check-prefix=CHECK-LIBCALL-VFP -; RUN: llc -asm-verbose=false < %s -mattr=-vfp2d16sp | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK-LIBCALL -check-prefix=CHECK-NOVFP -check-prefix=CHECK-ALL +; RUN: llc -asm-verbose=false < %s -mattr=-fpregs | FileCheck -allow-deprecated-dag-overlap %s --check-prefix=CHECK-LIBCALL -check-prefix=CHECK-NOVFP -check-prefix=CHECK-ALL target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32" target triple = "armv7---eabihf" diff --git a/llvm/test/CodeGen/ARM/no-fpu.ll b/llvm/test/CodeGen/ARM/no-fpu.ll index 13da7190a9fd..468a8237f8fa 100644 --- a/llvm/test/CodeGen/ARM/no-fpu.ll +++ b/llvm/test/CodeGen/ARM/no-fpu.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-neon,-vfp2d16sp | FileCheck --check-prefix=NONEON-NOVFP %s +; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-neon,-fpregs | FileCheck --check-prefix=NONEON-NOVFP %s ; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-neon | FileCheck --check-prefix=NONEON %s -; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-vfp2d16sp | FileCheck --check-prefix=NOVFP %s +; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-fpregs | FileCheck --check-prefix=NOVFP %s ; RUN: llc < %s -mtriple=armv7-none-gnueabi -mattr=-neon,+vfp2 | FileCheck --check-prefix=NONEON-VFP %s ; Check no NEON instructions are selected when feature is disabled. 
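A minimal IR sketch (illustrative only, not part of the patch) that exercises the newly legalised LOAD, BITCAST and STORE in a single function; the function name and the llc invocation are assumptions that simply mirror the RUN lines of the new Thumb2 tests added below:

; Illustrative sketch: a load, bitcast and store of MVE vector types.
; One would expect something like
;   llc -mtriple=thumbv8.1m.main -mattr=+mve.fp
; to select vldrw.u32 / vstrw.32 with the bitcast folding away, as the
; new mve-basic.ll and mve-bitcasts.ll tests demonstrate separately.
define arm_aapcs_vfpcc void @load_bitcast_store(<4 x i32>* %src, <4 x float>* %dst) {
entry:
  %v = load <4 x i32>, <4 x i32>* %src, align 16
  %f = bitcast <4 x i32> %v to <4 x float>
  store <4 x float> %f, <4 x float>* %dst, align 16
  ret void
}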
diff --git a/llvm/test/CodeGen/Thumb2/mve-basic.ll b/llvm/test/CodeGen/Thumb2/mve-basic.ll new file mode 100644 index 000000000000..d37bbde23ebb --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-basic.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -o - %s | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <4 x i32> @vector_add_by_value(<4 x i32> %lhs, <4 x i32>%rhs) { +; CHECK-LABEL: vector_add_by_value: +; CHECK: @ %bb.0: +; CHECK-NEXT: @APP +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: @NO_APP +; CHECK-NEXT: bx lr + %result = tail call <4 x i32> asm "vadd.i32 $0,$1,$2", "=t,t,t"(<4 x i32> %lhs, <4 x i32> %rhs) + ret <4 x i32> %result +} + +define void @vector_add_by_reference(<4 x i32>* %resultp, <4 x i32>* %lhsp, <4 x i32>* %rhsp) { +; CHECK-LABEL: vector_add_by_reference: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: @APP +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: @NO_APP +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr + %lhs = load <4 x i32>, <4 x i32>* %lhsp, align 16 + %rhs = load <4 x i32>, <4 x i32>* %rhsp, align 16 + %result = tail call <4 x i32> asm "vadd.i32 $0,$1,$2", "=t,t,t"(<4 x i32> %lhs, <4 x i32> %rhs) + store <4 x i32> %result, <4 x i32>* %resultp, align 16 + ret void +} diff --git a/llvm/test/CodeGen/Thumb2/mve-bitcasts.ll b/llvm/test/CodeGen/Thumb2/mve-bitcasts.ll new file mode 100644 index 000000000000..fc0ba5c4a857 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-bitcasts.ll @@ -0,0 +1,449 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <2 x i64> @bitcast_i64_i64(<2 x i64> %src) { +; CHECK-LABEL: bitcast_i64_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x i64> %src to <2 x i64> + ret <2 x i64> %r +} + +define arm_aapcs_vfpcc <2 x i64> @bitcast_i64_i32(<4 x i32> %src) { +; CHECK-LABEL: bitcast_i64_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x i32> %src to <2 x i64> + ret <2 x i64> %r +} + +define arm_aapcs_vfpcc <2 x i64> @bitcast_i64_i16(<8 x i16> %src) { +; CHECK-LABEL: bitcast_i64_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x i16> %src to <2 x i64> + ret <2 x i64> %r +} + +define arm_aapcs_vfpcc <2 x i64> @bitcast_i64_i8(<16 x i8> %src) { +; CHECK-LABEL: bitcast_i64_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <16 x i8> %src to <2 x i64> + ret <2 x i64> %r +} + +define arm_aapcs_vfpcc <2 x i64> @bitcast_i64_f64(<2 x double> %src) { +; CHECK-LABEL: bitcast_i64_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x double> %src to <2 x i64> + ret <2 x i64> %r +} + +define arm_aapcs_vfpcc <2 x i64> @bitcast_i64_f32(<4 x float> %src) { +; CHECK-LABEL: bitcast_i64_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x float> %src to <2 x i64> + ret <2 x i64> %r +} + +define arm_aapcs_vfpcc <2 x i64> @bitcast_i64_f16(<8 x half> %src) { +; CHECK-LABEL: bitcast_i64_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x half> %src to <2 x i64> + ret <2 x i64> %r +} + + +define arm_aapcs_vfpcc <4 x i32> @bitcast_i32_i64(<2 x i64> %src) { +; CHECK-LABEL: bitcast_i32_i64: +; CHECK: 
@ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x i64> %src to <4 x i32> + ret <4 x i32> %r +} + +define arm_aapcs_vfpcc <4 x i32> @bitcast_i32_i32(<4 x i32> %src) { +; CHECK-LABEL: bitcast_i32_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x i32> %src to <4 x i32> + ret <4 x i32> %r +} + +define arm_aapcs_vfpcc <4 x i32> @bitcast_i32_i16(<8 x i16> %src) { +; CHECK-LABEL: bitcast_i32_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x i16> %src to <4 x i32> + ret <4 x i32> %r +} + +define arm_aapcs_vfpcc <4 x i32> @bitcast_i32_i8(<16 x i8> %src) { +; CHECK-LABEL: bitcast_i32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <16 x i8> %src to <4 x i32> + ret <4 x i32> %r +} + +define arm_aapcs_vfpcc <4 x i32> @bitcast_i32_f64(<2 x double> %src) { +; CHECK-LABEL: bitcast_i32_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x double> %src to <4 x i32> + ret <4 x i32> %r +} + +define arm_aapcs_vfpcc <4 x i32> @bitcast_i32_f32(<4 x float> %src) { +; CHECK-LABEL: bitcast_i32_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x float> %src to <4 x i32> + ret <4 x i32> %r +} + +define arm_aapcs_vfpcc <4 x i32> @bitcast_i32_f16(<8 x half> %src) { +; CHECK-LABEL: bitcast_i32_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x half> %src to <4 x i32> + ret <4 x i32> %r +} + + +define arm_aapcs_vfpcc <8 x i16> @bitcast_i16_i64(<2 x i64> %src) { +; CHECK-LABEL: bitcast_i16_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x i64> %src to <8 x i16> + ret <8 x i16> %r +} + +define arm_aapcs_vfpcc <8 x i16> @bitcast_i16_i32(<4 x i32> %src) { +; CHECK-LABEL: bitcast_i16_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x i32> %src to <8 x i16> + ret <8 x i16> %r +} + +define arm_aapcs_vfpcc <8 x i16> @bitcast_i16_i16(<8 x i16> %src) { +; CHECK-LABEL: bitcast_i16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x i16> %src to <8 x i16> + ret <8 x i16> %r +} + +define arm_aapcs_vfpcc <8 x i16> @bitcast_i16_i8(<16 x i8> %src) { +; CHECK-LABEL: bitcast_i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <16 x i8> %src to <8 x i16> + ret <8 x i16> %r +} + +define arm_aapcs_vfpcc <8 x i16> @bitcast_i16_f64(<2 x double> %src) { +; CHECK-LABEL: bitcast_i16_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x double> %src to <8 x i16> + ret <8 x i16> %r +} + +define arm_aapcs_vfpcc <8 x i16> @bitcast_i16_f32(<4 x float> %src) { +; CHECK-LABEL: bitcast_i16_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x float> %src to <8 x i16> + ret <8 x i16> %r +} + +define arm_aapcs_vfpcc <8 x i16> @bitcast_i16_f16(<8 x half> %src) { +; CHECK-LABEL: bitcast_i16_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x half> %src to <8 x i16> + ret <8 x i16> %r +} + + +define arm_aapcs_vfpcc <16 x i8> @bitcast_i8_i64(<2 x i64> %src) { +; CHECK-LABEL: bitcast_i8_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x i64> %src to <16 x i8> + ret <16 x i8> %r +} + +define arm_aapcs_vfpcc <16 x i8> @bitcast_i8_i32(<4 x i32> %src) { +; CHECK-LABEL: bitcast_i8_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x i32> %src to <16 x i8> + ret <16 x i8> %r +} + +define arm_aapcs_vfpcc <16 x 
i8> @bitcast_i8_i16(<8 x i16> %src) { +; CHECK-LABEL: bitcast_i8_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x i16> %src to <16 x i8> + ret <16 x i8> %r +} + +define arm_aapcs_vfpcc <16 x i8> @bitcast_i8_i8(<16 x i8> %src) { +; CHECK-LABEL: bitcast_i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <16 x i8> %src to <16 x i8> + ret <16 x i8> %r +} + +define arm_aapcs_vfpcc <16 x i8> @bitcast_i8_f64(<2 x double> %src) { +; CHECK-LABEL: bitcast_i8_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x double> %src to <16 x i8> + ret <16 x i8> %r +} + +define arm_aapcs_vfpcc <16 x i8> @bitcast_i8_f32(<4 x float> %src) { +; CHECK-LABEL: bitcast_i8_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x float> %src to <16 x i8> + ret <16 x i8> %r +} + +define arm_aapcs_vfpcc <16 x i8> @bitcast_i8_f16(<8 x half> %src) { +; CHECK-LABEL: bitcast_i8_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x half> %src to <16 x i8> + ret <16 x i8> %r +} + + +define arm_aapcs_vfpcc <2 x double> @bitcast_f64_i64(<2 x i64> %src) { +; CHECK-LABEL: bitcast_f64_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x i64> %src to <2 x double> + ret <2 x double> %r +} + +define arm_aapcs_vfpcc <2 x double> @bitcast_f64_i32(<4 x i32> %src) { +; CHECK-LABEL: bitcast_f64_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x i32> %src to <2 x double> + ret <2 x double> %r +} + +define arm_aapcs_vfpcc <2 x double> @bitcast_f64_i16(<8 x i16> %src) { +; CHECK-LABEL: bitcast_f64_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x i16> %src to <2 x double> + ret <2 x double> %r +} + +define arm_aapcs_vfpcc <2 x double> @bitcast_f64_i8(<16 x i8> %src) { +; CHECK-LABEL: bitcast_f64_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <16 x i8> %src to <2 x double> + ret <2 x double> %r +} + +define arm_aapcs_vfpcc <2 x double> @bitcast_f64_f64(<2 x double> %src) { +; CHECK-LABEL: bitcast_f64_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x double> %src to <2 x double> + ret <2 x double> %r +} + +define arm_aapcs_vfpcc <2 x double> @bitcast_f64_f32(<4 x float> %src) { +; CHECK-LABEL: bitcast_f64_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x float> %src to <2 x double> + ret <2 x double> %r +} + +define arm_aapcs_vfpcc <2 x double> @bitcast_f64_f16(<8 x half> %src) { +; CHECK-LABEL: bitcast_f64_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x half> %src to <2 x double> + ret <2 x double> %r +} + + +define arm_aapcs_vfpcc <4 x float> @bitcast_f32_i64(<2 x i64> %src) { +; CHECK-LABEL: bitcast_f32_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x i64> %src to <4 x float> + ret <4 x float> %r +} + +define arm_aapcs_vfpcc <4 x float> @bitcast_f32_i32(<4 x i32> %src) { +; CHECK-LABEL: bitcast_f32_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x i32> %src to <4 x float> + ret <4 x float> %r +} + +define arm_aapcs_vfpcc <4 x float> @bitcast_f32_i16(<8 x i16> %src) { +; CHECK-LABEL: bitcast_f32_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x i16> %src to <4 x float> + ret <4 x float> %r +} + +define arm_aapcs_vfpcc <4 x float> @bitcast_f32_i8(<16 x i8> %src) { +; CHECK-LABEL: 
bitcast_f32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <16 x i8> %src to <4 x float> + ret <4 x float> %r +} + +define arm_aapcs_vfpcc <4 x float> @bitcast_f32_f64(<2 x double> %src) { +; CHECK-LABEL: bitcast_f32_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x double> %src to <4 x float> + ret <4 x float> %r +} + +define arm_aapcs_vfpcc <4 x float> @bitcast_f32_f32(<4 x float> %src) { +; CHECK-LABEL: bitcast_f32_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x float> %src to <4 x float> + ret <4 x float> %r +} + +define arm_aapcs_vfpcc <4 x float> @bitcast_f32_f16(<8 x half> %src) { +; CHECK-LABEL: bitcast_f32_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x half> %src to <4 x float> + ret <4 x float> %r +} + + +define arm_aapcs_vfpcc <8 x half> @bitcast_f16_i64(<2 x i64> %src) { +; CHECK-LABEL: bitcast_f16_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x i64> %src to <8 x half> + ret <8 x half> %r +} + +define arm_aapcs_vfpcc <8 x half> @bitcast_f16_i32(<4 x i32> %src) { +; CHECK-LABEL: bitcast_f16_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x i32> %src to <8 x half> + ret <8 x half> %r +} + +define arm_aapcs_vfpcc <8 x half> @bitcast_f16_i16(<8 x i16> %src) { +; CHECK-LABEL: bitcast_f16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x i16> %src to <8 x half> + ret <8 x half> %r +} + +define arm_aapcs_vfpcc <8 x half> @bitcast_f16_i8(<16 x i8> %src) { +; CHECK-LABEL: bitcast_f16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <16 x i8> %src to <8 x half> + ret <8 x half> %r +} + +define arm_aapcs_vfpcc <8 x half> @bitcast_f16_f64(<2 x double> %src) { +; CHECK-LABEL: bitcast_f16_f64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <2 x double> %src to <8 x half> + ret <8 x half> %r +} + +define arm_aapcs_vfpcc <8 x half> @bitcast_f16_f32(<4 x float> %src) { +; CHECK-LABEL: bitcast_f16_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <4 x float> %src to <8 x half> + ret <8 x half> %r +} + +define arm_aapcs_vfpcc <8 x half> @bitcast_f16_f16(<8 x half> %src) { +; CHECK-LABEL: bitcast_f16_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bx lr +entry: + %r = bitcast <8 x half> %src to <8 x half> + ret <8 x half> %r +} diff --git a/llvm/test/Transforms/HardwareLoops/ARM/calls.ll b/llvm/test/Transforms/HardwareLoops/ARM/calls.ll index 0e1d859d88dc..d9018b8730d0 100644 --- a/llvm/test/Transforms/HardwareLoops/ARM/calls.ll +++ b/llvm/test/Transforms/HardwareLoops/ARM/calls.ll @@ -333,7 +333,7 @@ exit: ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1) ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit -define void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) { +define arm_aapcs_vfpcc void @test_masked_i32(<4 x i1> %mask, <4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, <4 x i32> %passthru) { entry: br label %loop loop: @@ -360,7 +360,7 @@ exit: ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1) ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit -define void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* 
%c, <4 x float> %passthru) { +define arm_aapcs_vfpcc void @test_masked_f32(<4 x i1> %mask, <4 x float>* %a, <4 x float>* %b, <4 x float>* %c, <4 x float> %passthru) { entry: br label %loop loop: @@ -387,7 +387,7 @@ exit: ; CHECK-MVE: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[COUNT]], i32 1) ; CHECK-MVE: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-MVE: br i1 [[CMP]], label %loop, label %exit -define void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) { +define arm_aapcs_vfpcc void @test_gather_scatter(<4 x i1> %mask, <4 x float*> %a, <4 x float*> %b, <4 x float*> %c, <4 x float> %passthru) { entry: br label %loop loop:
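The arm_aapcs_vfpcc annotations above tie back to the ABI caveat in the commit message: under the hard-float convention a by-value vector argument arrives directly in a q register, whereas the soft-float convention would need build_vector / insert_vector_elt support that this commit does not yet provide. A minimal sketch of the kind of function that therefore needs the vector calling convention (illustrative only; the function name is made up, and the expected store selection follows the new load/store patterns rather than anything asserted by the patch):

; Illustrative: <4 x i32> %v is passed in q0 under arm_aapcs_vfpcc, so no
; build_vector / insert_vector_elt lowering is required; the store is
; expected to select an MVE vstrw, as in mve-basic.ll.
define arm_aapcs_vfpcc void @store_vector_arg(<4 x i32>* %p, <4 x i32> %v) {
entry:
  store <4 x i32> %v, <4 x i32>* %p, align 16
  ret void
}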