From 72ca86fd34ecc5f7ccbaf923d2d508dad2a6a64c Mon Sep 17 00:00:00 2001 From: Anna Welker Date: Tue, 14 Jan 2020 09:48:02 +0000 Subject: [PATCH] [ARM][MVE] Masked gathers from base + vector of offsets Enables the masked gather pass to create a masked gather loading from a base and vector of offsets. This also enables v8i16 and v16i8 gather loads. Differential Revision: https://reviews.llvm.org/D72330 --- .../Target/ARM/MVEGatherScatterLowering.cpp | 200 ++++++++-- .../CodeGen/Thumb2/mve-gather-ind16-scaled.ll | 271 +++++++++++++ .../Thumb2/mve-gather-ind16-unscaled.ll | 242 ++++++++++++ .../CodeGen/Thumb2/mve-gather-ind32-scaled.ll | 60 +-- .../Thumb2/mve-gather-ind32-unscaled.ll | 75 ++-- .../Thumb2/mve-gather-ind8-unscaled.ll | 370 ++++++++++++++++++ llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll | 13 + .../CodeGen/Thumb2/mve-gather-scatter-opt.ll | 28 +- 8 files changed, 1138 insertions(+), 121 deletions(-) create mode 100644 llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll create mode 100644 llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll create mode 100644 llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp index 4657a043dba1..9f64af02e698 100644 --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -1,4 +1,4 @@ -//===- ARMGatherScatterLowering.cpp - Gather/Scatter lowering -------------===// +//===- MVEGatherScatterLowering.cpp - Gather/Scatter lowering -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -19,22 +19,22 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include @@ -69,6 +69,25 @@ public: AU.addRequired(); FunctionPass::getAnalysisUsage(AU); } + +private: + // Check this is a valid gather with correct alignment + bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize, + unsigned Alignment); + // Check whether Ptr is hidden behind a bitcast and look through it + void lookThroughBitcast(Value *&Ptr); + // Check for a getelementptr and deduce base and offsets from it, on success + // returning the base directly and the offsets indirectly using the Offsets + // argument + Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> Builder); + + bool lowerGather(IntrinsicInst *I); + // Create a gather from a base + vector of offsets + Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr, + IRBuilder<> Builder); + // Create a gather from a vector of pointers + Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr, + IRBuilder<> Builder); }; } // end anonymous namespace @@ -82,13 +101,78 @@ Pass *llvm::createMVEGatherScatterLoweringPass() { return new MVEGatherScatterLowering(); } -static bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize, - unsigned Alignment) { - // Do only allow non-extending v4i32 gathers for now - return NumElements == 4 && ElemSize == 32 && Alignment >= 4; +bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements, + unsigned ElemSize, + unsigned Alignment) { + // Do only allow non-extending gathers for now + if (((NumElements == 4 && ElemSize == 32) || + (NumElements == 8 && ElemSize == 16) || + (NumElements == 16 && ElemSize == 8)) && + ElemSize / 8 <= Alignment) + return true; + LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid " + << "alignment or vector type \n"); + return false; } -static bool LowerGather(IntrinsicInst *I) { +Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, + IRBuilder<> Builder) { + GetElementPtrInst *GEP = dyn_cast(Ptr); + if (!GEP) { + LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found\n"); + return nullptr; + } + LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading" + << " from base + vector of offsets\n"); + Value *GEPPtr = GEP->getPointerOperand(); + if (GEPPtr->getType()->isVectorTy()) { + LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers" + << " hidden behind a getelementptr currently not" + << " supported. Expanding.\n"); + return nullptr; + } + if (GEP->getNumOperands() != 2) { + LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with too many" + << " operands. Expanding.\n"); + return nullptr; + } + Offsets = GEP->getOperand(1); + // SExt offsets inside masked gathers are not permitted by the architecture; + // we therefore can't fold them + if (ZExtInst *ZextOffs = dyn_cast(Offsets)) + Offsets = ZextOffs->getOperand(0); + Type *OffsType = VectorType::getInteger(cast(Ty)); + // If the offset we found does not have the type the intrinsic expects, + // i.e., the same type as the gather itself, we need to convert it (only i + // types) or fall back to expanding the gather + if (OffsType != Offsets->getType()) { + if (OffsType->getScalarSizeInBits() > + Offsets->getType()->getScalarSizeInBits()) { + LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n"); + Offsets = Builder.CreateZExt(Offsets, OffsType, ""); + } else { + LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. Can't" + << " create masked gather\n"); + return nullptr; + } + } + // If none of the checks failed, return the gep's base pointer + return GEPPtr; +} + +void MVEGatherScatterLowering::lookThroughBitcast(Value *&Ptr) { + // Look through bitcast instruction if #elements is the same + if (auto *BitCast = dyn_cast(Ptr)) { + Type *BCTy = BitCast->getType(); + Type *BCSrcTy = BitCast->getOperand(0)->getType(); + if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) { + LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n"); + Ptr = BitCast->getOperand(0); + } + } +} + +bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) { using namespace PatternMatch; LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n"); @@ -101,42 +185,20 @@ static bool LowerGather(IntrinsicInst *I) { Value *Mask = I->getArgOperand(2); Value *PassThru = I->getArgOperand(3); - // Check this is a valid gather with correct alignment if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(), - Ty->getScalarSizeInBits(), Alignment)) { - LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid " - << "alignment or vector type \n"); + Ty->getScalarSizeInBits(), Alignment)) return false; - } + lookThroughBitcast(Ptr); + assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); IRBuilder<> Builder(I->getContext()); Builder.SetInsertPoint(I); Builder.SetCurrentDebugLocation(I->getDebugLoc()); - - Value *Load = nullptr; - // Look through bitcast instruction if #elements is the same - if (auto *BitCast = dyn_cast(Ptr)) { - Type *BCTy = BitCast->getType(); - Type *BCSrcTy = BitCast->getOperand(0)->getType(); - if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) { - LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n"); - Ptr = BitCast->getOperand(0); - } - } - assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); - - if (Ty->getVectorNumElements() != 4) - // Can't build an intrinsic for this + Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Builder); + if (!Load) + Load = tryCreateMaskedGatherBase(I, Ptr, Builder); + if (!Load) return false; - if (match(Mask, m_One())) - Load = Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base, - {Ty, Ptr->getType()}, - {Ptr, Builder.getInt32(0)}); - else - Load = - Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base_predicated, - {Ty, Ptr->getType(), Mask->getType()}, - {Ptr, Builder.getInt32(0), Mask}); if (!isa(PassThru) && !match(PassThru, m_Zero())) { LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - " @@ -150,6 +212,68 @@ static bool LowerGather(IntrinsicInst *I) { return true; } +Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase( + IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) { + using namespace PatternMatch; + LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n"); + Type *Ty = I->getType(); + if (Ty->getVectorNumElements() != 4) + // Can't build an intrinsic for this + return nullptr; + Value *Mask = I->getArgOperand(2); + if (match(Mask, m_One())) + return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base, + {Ty, Ptr->getType()}, + {Ptr, Builder.getInt32(0)}); + else + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_base_predicated, + {Ty, Ptr->getType(), Mask->getType()}, + {Ptr, Builder.getInt32(0), Mask}); +} + +Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset( + IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) { + using namespace PatternMatch; + Type *Ty = I->getType(); + Value *Offsets; + Value *BasePtr = checkGEP(Offsets, Ty, Ptr, Builder); + if (!BasePtr) + return nullptr; + + unsigned Scale; + int GEPElemSize = + BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(); + int ResultElemSize = Ty->getScalarSizeInBits(); + // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a + // 8bit, 16bit or 32bit load scaled by 1 + if (GEPElemSize == 32 && ResultElemSize == 32) { + Scale = 2; + } else if (GEPElemSize == 16 && ResultElemSize == 16) { + Scale = 1; + } else if (GEPElemSize == 8) { + Scale = 0; + } else { + LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. Can't" + << " create masked gather\n"); + return nullptr; + } + + Value *Mask = I->getArgOperand(2); + if (!match(Mask, m_One())) + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_offset_predicated, + {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()}, + {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()), + Builder.getInt32(Scale), Builder.getInt32(1), Mask}); + else + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_offset, + {Ty, BasePtr->getType(), Offsets->getType()}, + {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()), + Builder.getInt32(Scale), Builder.getInt32(1)}); +} + bool MVEGatherScatterLowering::runOnFunction(Function &F) { if (!EnableMaskedGatherScatters) return false; @@ -171,7 +295,7 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) { return false; for (IntrinsicInst *I : Gathers) - LowerGather(I); + lowerGather(I); return true; } diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll new file mode 100644 index 000000000000..948f030a84da --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -0,0 +1,271 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_i16(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8f16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*> + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + ret <8 x half> %gather +} + +define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_half(half* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8f16_half: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds half, half* %base, <8 x i32> %offs.zext + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + ret <8 x half> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrh.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.sext = sext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8f16_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vmov r2, s5 +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vldr.16 s0, [r3] +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: vmov.16 q0[1], r2 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vldr.16 s8, [r2] +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vldr.16 s4, [r2] +; CHECK-NEXT: vmov r2, s4 +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vmov.16 q0[3], r2 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vldr.16 s8, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vldr.16 s8, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vldr.16 s8, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vldr.16 s4, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.sext = sext <8 x i16> %offs to <8 x i32> + %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext + %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*> + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + ret <8 x half> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @unsigned_scaled_v8i16_i8(i16* %base, <8 x i8>* %offptr) { +; CHECK-LABEL: unsigned_scaled_v8i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x half> @unsigned_scaled_v8f16_i8(i16* %base, <8 x i8>* %offptr) { +; CHECK-LABEL: unsigned_scaled_v8f16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*> + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + ret <8 x half> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru0t(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_passthru0t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> zeroinitializer) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru1t(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_passthru1t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> ) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru1f(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_passthru1f: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movw r2, #65487 +; CHECK-NEXT: vmov.i16 q0, #0x1 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q2, [r0, q1, uxtw #1] +; CHECK-NEXT: vpsel q0, q2, q0 +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> ) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru0f(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_passthru0f: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: movw r2, #65523 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> ) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru_icmp0(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vpt.s16 gt, q1, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %mask = icmp sgt <8 x i16> %offs, zeroinitializer + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %mask, <8 x i16> ) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru_icmp1(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i16 q0, #0x1 +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vpt.s16 gt, q1, zr +; CHECK-NEXT: vldrht.u16 q2, [r0, q1, uxtw #1] +; CHECK-NEXT: vpsel q0, q2, q0 +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %mask = icmp sgt <8 x i16> %offs, zeroinitializer + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %mask, <8 x i16> ) + ret <8 x i16> %gather +} + +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1 +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1 +declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll new file mode 100644 index 000000000000..5e4e5a394096 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll @@ -0,0 +1,242 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: zext_unscaled_i8_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrh.u32 q1, [r1, #8] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.16 q0[0], r5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.16 q0[1], lr +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r12 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) + %gather.zext = zext <8 x i8> %gather to <8 x i16> + ret <8 x i16> %gather.zext +} + +define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: sext_unscaled_i8_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrh.u32 q1, [r1, #8] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.16 q0[0], r5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.16 q0[1], lr +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r12 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) + %gather.sext = sext <8 x i8> %gather to <8 x i16> + ret <8 x i16> %gather.sext +} + +define arm_aapcs_vfpcc <8 x i16> @unscaled_i16_i16(i8* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: unscaled_i16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x half> @unscaled_f16_i16(i8* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: unscaled_f16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*> + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + ret <8 x half> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @zext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) { +; CHECK-LABEL: zext_unsigned_unscaled_i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.16 q0[0], r5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.16 q0[1], lr +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r12 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) + %gather.zext = zext <8 x i8> %gather to <8 x i16> + ret <8 x i16> %gather.zext +} + +define arm_aapcs_vfpcc <8 x i16> @sext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) { +; CHECK-LABEL: sext_unsigned_unscaled_i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r5, s0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.16 q0[0], r5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.16 q0[1], lr +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.16 q0[2], r2 +; CHECK-NEXT: vmov.16 q0[3], r12 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: vmovlb.s8 q0, q0 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) + %gather.sext = sext <8 x i8> %gather to <8 x i16> + ret <8 x i16> %gather.sext +} + +define arm_aapcs_vfpcc <8 x i16> @unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr) { +; CHECK-LABEL: unsigned_unscaled_i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x half> @unsigned_unscaled_f16_i8(i8* %base, <8 x i8>* %offptr) { +; CHECK-LABEL: unsigned_unscaled_f16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*> + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + ret <8 x half> %gather +} + +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1 +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1 +declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll index f75379160d78..1acba1a95c3d 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll @@ -60,10 +60,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32(i32* %base, <4 x i32>* %offptr) { ; CHECK-LABEL: scaled_i32_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -77,10 +75,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @scaled_f32_i32(i32* %base, <4 x i32>* %offptr) { ; CHECK-LABEL: scaled_f32_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -93,10 +89,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i16(i32* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: unsigned_scaled_b_i32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -109,10 +103,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i16(i32* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: signed_scaled_i32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -125,10 +117,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: a_unsigned_scaled_f32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -142,10 +132,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: b_signed_scaled_f32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -275,10 +263,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i8(i32* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: unsigned_scaled_b_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -291,10 +277,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i8(i32* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: signed_scaled_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -307,10 +291,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: a_unsigned_scaled_f32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -324,10 +306,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: b_signed_scaled_f32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll index fcb7dd8f87d6..c862aa9656cb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll @@ -116,9 +116,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr) { ; CHECK-LABEL: unscaled_i32_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -131,9 +130,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @unscaled_f32_i32(i8* %base, <4 x i32>* %offptr) { ; CHECK-LABEL: unscaled_f32_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -146,9 +144,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @unsigned_unscaled_b_i32_i16(i8* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: unsigned_unscaled_b_i32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -162,9 +159,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @signed_unscaled_i32_i16(i8* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: signed_unscaled_i32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -178,9 +174,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @a_unsigned_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: a_unsigned_unscaled_f32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -194,9 +189,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @b_signed_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: b_signed_unscaled_f32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -442,9 +436,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @unsigned_unscaled_b_i32_i8(i8* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: unsigned_unscaled_b_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -458,9 +451,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @signed_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: signed_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -474,9 +466,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @a_unsigned_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: a_unsigned_unscaled_f32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -490,9 +481,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @b_signed_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: b_signed_unscaled_f32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -735,6 +725,31 @@ entry: ret <4 x i32> %gather.sext } +; VLDRW.u32 Qd, [P, 4] +define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) { +; CHECK-LABEL: qi4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q1, #0x10 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: ldr r0, [r0] +; CHECK-NEXT: ldr r3, [r3] +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: ldr r1, [r1] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: ldr r2, [r2] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: bx lr +entry: + %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4 + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %g, i32 1, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %gather +} + declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll new file mode 100644 index 000000000000..deb3be28ddb6 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %offs.zext = zext <16 x i8> %offs to <16 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + +define arm_aapcs_vfpcc <8 x i8> @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v8i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) + ret <8 x i8> %gather +} + +define arm_aapcs_vfpcc <2 x i8> @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v2i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrb r2, [r1] +; CHECK-NEXT: vmov.i32 q0, #0xff +; CHECK-NEXT: ldrb r1, [r1, #1] +; CHECK-NEXT: vmov.32 q1[0], r2 +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: ldrb r1, [r0, r1] +; CHECK-NEXT: ldrb r0, [r0, r2] +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: bx lr +entry: + %offs = load <2 x i8>, <2 x i8>* %offptr, align 1 + %offs.zext = zext <2 x i8> %offs to <2 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext + %gather = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> , <2 x i8> undef) + ret <2 x i8> %gather +} + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrb.s32 q0, [r1, #12] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %offs.sext = sext <16 x i8> %offs to <16 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrh.s32 q0, [r1, #16] +; CHECK-NEXT: vldrh.s32 q2, [r1, #8] +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrh.s32 q0, [r1, #24] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrh.s32 q0, [r1] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %offs = load <16 x i16>, <16 x i16>* %offptr, align 2 + %offs.sext = sext <16 x i16> %offs to <16 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_scaled: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.u32 q0, [r1, #8] +; CHECK-NEXT: vldrb.u32 q2, [r1, #4] +; CHECK-NEXT: vshl.i32 q0, q0, #2 +; CHECK-NEXT: vshl.i32 q2, q2, #2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrb.u32 q0, [r1, #12] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vshl.i32 q0, q0, #2 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vshl.i32 q0, q0, #2 +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 4 + %offs.zext = zext <16 x i8> %offs to <16 x i32> + %ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext + %ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_next: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrw.u32 q0, [r1, #32] +; CHECK-NEXT: vldrw.u32 q2, [r1, #16] +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r1, #48] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %offs = load <16 x i32>, <16 x i32>* %offptr, align 4 + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) +declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll index b595a20bf497..75f18ea8f081 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -829,6 +829,19 @@ for.end: ; preds = %vector.body, %entry ret void } +define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) { +; CHECK-LABEL: qi4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q1, #0x10 +; CHECK-NEXT: vadd.i32 q1, q0, q1 +; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: bx lr +entry: + %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4 + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %g, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %gather +} + declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll index 70ad30aa60c8..a50bd2cc94a0 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll @@ -1,11 +1,12 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck --check-prefix NOGATSCAT %s ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-mve -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck --check-prefix NOMVE %s define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr) { ; NOGATSCAT-LABEL: unscaled_i32_i32: ; NOGATSCAT: @ %bb.0: @ %entry -; NOGATSCAT-NEXT: vldrw.u32 q0, [r1] -; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0 +; NOGATSCAT-NEXT: vldrw.u32 q0, [r1] +; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0 ; NOGATSCAT-NEXT: vmov r0, s0 ; NOGATSCAT-NEXT: vmov r3, s1 ; NOGATSCAT-NEXT: vmov r1, s2 @@ -19,19 +20,20 @@ define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr ; NOGATSCAT-NEXT: vmov.32 q0[2], r1 ; NOGATSCAT-NEXT: vmov.32 q0[3], r2 ; NOGATSCAT-NEXT: bx lr - +; ; NOMVE-LABEL: unscaled_i32_i32: ; NOMVE: @ %bb.0: @ %entry -; NOMVE-NEXT: .save {r4, lr} -; NOMVE-NEXT: push {r4, lr} -; NOMVE-NEXT: ldm.w r1, {r2, r3, lr} -; NOMVE-NEXT: ldr r4, [r1, #12] -; NOMVE-NEXT: ldr.w r12, [r0, r2] -; NOMVE-NEXT: ldr r1, [r0, r3] -; NOMVE-NEXT: ldr.w r2, [r0, lr] -; NOMVE-NEXT: ldr r3, [r0, r4] -; NOMVE-NEXT: mov r0, r12 -; NOMVE-NEXT: pop {r4, pc} +; NOMVE-NEXT: .save {r4, lr} +; NOMVE-NEXT: push {r4, lr} +; NOMVE-NEXT: ldm.w r1, {r2, r3, lr} +; NOMVE-NEXT: ldr r4, [r1, #12] +; NOMVE-NEXT: ldr.w r12, [r0, r2] +; NOMVE-NEXT: ldr r1, [r0, r3] +; NOMVE-NEXT: ldr.w r2, [r0, lr] +; NOMVE-NEXT: ldr r3, [r0, r4] +; NOMVE-NEXT: mov r0, r12 +; NOMVE-NEXT: pop {r4, pc} + entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4