forked from OSchip/llvm-project
[ARM][MVE] Masked gathers from base + vector of offsets
Enables the masked gather pass to create a masked gather loading from a base and vector of offsets. This also enables v8i16 and v16i8 gather loads. Differential Revision: https://reviews.llvm.org/D72330
This commit is contained in:
parent
41b5201888
commit
72ca86fd34
|
@ -1,4 +1,4 @@
|
|||
//===- ARMGatherScatterLowering.cpp - Gather/Scatter lowering -------------===//
|
||||
//===- MVEGatherScatterLowering.cpp - Gather/Scatter lowering -------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
|
@ -19,22 +19,22 @@
|
|||
#include "llvm/CodeGen/TargetLowering.h"
|
||||
#include "llvm/CodeGen/TargetPassConfig.h"
|
||||
#include "llvm/CodeGen/TargetSubtargetInfo.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/IR/BasicBlock.h"
|
||||
#include "llvm/IR/Constant.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/DerivedTypes.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/InstrTypes.h"
|
||||
#include "llvm/IR/Instruction.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/IntrinsicInst.h"
|
||||
#include "llvm/IR/Intrinsics.h"
|
||||
#include "llvm/IR/IntrinsicsARM.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/PatternMatch.h"
|
||||
#include "llvm/IR/Type.h"
|
||||
#include "llvm/IR/Value.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Support/Casting.h"
|
||||
#include <algorithm>
|
||||
|
@ -69,6 +69,25 @@ public:
|
|||
AU.addRequired<TargetPassConfig>();
|
||||
FunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
|
||||
private:
|
||||
// Check this is a valid gather with correct alignment
|
||||
bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize,
|
||||
unsigned Alignment);
|
||||
// Check whether Ptr is hidden behind a bitcast and look through it
|
||||
void lookThroughBitcast(Value *&Ptr);
|
||||
// Check for a getelementptr and deduce base and offsets from it, on success
|
||||
// returning the base directly and the offsets indirectly using the Offsets
|
||||
// argument
|
||||
Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> Builder);
|
||||
|
||||
bool lowerGather(IntrinsicInst *I);
|
||||
// Create a gather from a base + vector of offsets
|
||||
Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
|
||||
IRBuilder<> Builder);
|
||||
// Create a gather from a vector of pointers
|
||||
Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
|
||||
IRBuilder<> Builder);
|
||||
};
|
||||
|
||||
} // end anonymous namespace
|
||||
|
@ -82,13 +101,78 @@ Pass *llvm::createMVEGatherScatterLoweringPass() {
|
|||
return new MVEGatherScatterLowering();
|
||||
}
|
||||
|
||||
static bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize,
|
||||
unsigned Alignment) {
|
||||
// Do only allow non-extending v4i32 gathers for now
|
||||
return NumElements == 4 && ElemSize == 32 && Alignment >= 4;
|
||||
// Decide whether a gather producing NumElements lanes of ElemSize bits each,
// with the given Alignment (in bytes), can be lowered by this pass.
// Returns true if the shape and alignment are acceptable, false otherwise
// (after emitting a debug message).
bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements,
                                                       unsigned ElemSize,
                                                       unsigned Alignment) {
  // Only non-extending gathers are handled for now, i.e. the lanes must fill
  // a 128-bit vector exactly: 4 x 32, 8 x 16 or 16 x 8 bits.
  const bool IsSupportedShape = (NumElements == 4 && ElemSize == 32) ||
                                (NumElements == 8 && ElemSize == 16) ||
                                (NumElements == 16 && ElemSize == 8);
  // The access has to be aligned to at least the element size in bytes.
  const bool IsSufficientlyAligned = Alignment >= ElemSize / 8;

  if (IsSupportedShape && IsSufficientlyAligned)
    return true;

  LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid "
                    << "alignment or vector type \n");
  return false;
}
|
||||
|
||||
static bool LowerGather(IntrinsicInst *I) {
|
||||
// Try to split the address operand of a gather into a scalar base pointer and
// a vector of offsets.
//
// Offsets  [out] On success, the offset vector to hand to the gather
//          intrinsic; it has the integer vector type matching Ty. Its value
//          is unspecified when nullptr is returned.
// Ty       The result type of the gather being lowered.
// Ptr      The (vector of pointers) address operand of the gather.
// Builder  Used to emit a zext when the offsets found are narrower than the
//          lanes of the gather.
//
// Returns the base pointer of the getelementptr on success, or nullptr if Ptr
// is not a getelementptr of a form that can be folded into the gather.
Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
                                          IRBuilder<> Builder) {
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  if (!GEP) {
    LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found\n");
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading"
                    << " from base + vector of offsets\n");
  Value *GEPPtr = GEP->getPointerOperand();
  if (GEPPtr->getType()->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers"
                      << " hidden behind a getelementptr currently not"
                      << " supported. Expanding.\n");
    return nullptr;
  }
  if (GEP->getNumOperands() != 2) {
    LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with too many"
                      << " operands. Expanding.\n");
    return nullptr;
  }
  Offsets = GEP->getOperand(1);
  // A getelementptr sign-extends (or truncates) indices whose width differs
  // from the pointer index width, whereas the gather can only zero-extend its
  // offsets. Folding an index vector that is not already 32 bits wide (the
  // pointer width on MVE targets) would therefore change the addresses
  // computed for negative or oversized offsets, so refuse to fold it.
  if (Offsets->getType()->getScalarSizeInBits() != 32) {
    LLVM_DEBUG(dbgs() << "masked gathers: getelementptr offsets are not 32"
                      << " bits wide. Expanding.\n");
    return nullptr;
  }
  // SExt offsets inside masked gathers are not permitted by the architecture;
  // we therefore can't fold them. A zext, however, can be looked through
  // because the gather zero-extends the narrow offsets itself.
  if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
    Offsets = ZextOffs->getOperand(0);
  Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
  // If the offset we found does not have the type the intrinsic expects,
  // i.e., the same type as the gather itself, we need to convert it (only
  // integer types) or fall back to expanding the gather
  if (OffsType != Offsets->getType()) {
    if (OffsType->getScalarSizeInBits() >
        Offsets->getType()->getScalarSizeInBits()) {
      // Only reachable for offsets that were the source of a zext above, so
      // widening them with another zext preserves their value.
      LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n");
      Offsets = Builder.CreateZExt(Offsets, OffsType, "");
    } else {
      LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. Can't"
                        << " create masked gather\n");
      return nullptr;
    }
  }
  // If none of the checks failed, return the gep's base pointer
  return GEPPtr;
}
|
||||
|
||||
// If Ptr is produced by a bitcast whose source and result vectors have the
// same number of elements, replace Ptr (in place) with the bitcast's source
// operand. Otherwise Ptr is left untouched.
void MVEGatherScatterLowering::lookThroughBitcast(Value *&Ptr) {
  auto *Cast = dyn_cast<BitCastInst>(Ptr);
  if (!Cast)
    return;

  Value *Source = Cast->getOperand(0);
  // Only a lane-count preserving cast can be skipped
  if (Cast->getType()->getVectorNumElements() !=
      Source->getType()->getVectorNumElements())
    return;

  LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n");
  Ptr = Source;
}
|
||||
|
||||
bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
|
||||
using namespace PatternMatch;
|
||||
LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
|
||||
|
||||
|
@ -101,42 +185,20 @@ static bool LowerGather(IntrinsicInst *I) {
|
|||
Value *Mask = I->getArgOperand(2);
|
||||
Value *PassThru = I->getArgOperand(3);
|
||||
|
||||
// Check this is a valid gather with correct alignment
|
||||
if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
|
||||
Ty->getScalarSizeInBits(), Alignment)) {
|
||||
LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid "
|
||||
<< "alignment or vector type \n");
|
||||
Ty->getScalarSizeInBits(), Alignment))
|
||||
return false;
|
||||
}
|
||||
lookThroughBitcast(Ptr);
|
||||
assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
|
||||
|
||||
IRBuilder<> Builder(I->getContext());
|
||||
Builder.SetInsertPoint(I);
|
||||
Builder.SetCurrentDebugLocation(I->getDebugLoc());
|
||||
|
||||
Value *Load = nullptr;
|
||||
// Look through bitcast instruction if #elements is the same
|
||||
if (auto *BitCast = dyn_cast<BitCastInst>(Ptr)) {
|
||||
Type *BCTy = BitCast->getType();
|
||||
Type *BCSrcTy = BitCast->getOperand(0)->getType();
|
||||
if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) {
|
||||
LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n");
|
||||
Ptr = BitCast->getOperand(0);
|
||||
}
|
||||
}
|
||||
assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
|
||||
|
||||
if (Ty->getVectorNumElements() != 4)
|
||||
// Can't build an intrinsic for this
|
||||
Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Builder);
|
||||
if (!Load)
|
||||
Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
|
||||
if (!Load)
|
||||
return false;
|
||||
if (match(Mask, m_One()))
|
||||
Load = Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base,
|
||||
{Ty, Ptr->getType()},
|
||||
{Ptr, Builder.getInt32(0)});
|
||||
else
|
||||
Load =
|
||||
Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base_predicated,
|
||||
{Ty, Ptr->getType(), Mask->getType()},
|
||||
{Ptr, Builder.getInt32(0), Mask});
|
||||
|
||||
if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) {
|
||||
LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - "
|
||||
|
@ -150,6 +212,68 @@ static bool LowerGather(IntrinsicInst *I) {
|
|||
return true;
|
||||
}
|
||||
|
||||
// Emit a gather that loads from a vector of pointers (Ptr), using the result
// type and mask of the masked gather intrinsic call I. Returns the new
// intrinsic call, or nullptr if the result does not have exactly four lanes.
Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
    IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
  using namespace PatternMatch;
  LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");

  Type *ResultTy = I->getType();
  // Can't build an intrinsic for anything but four lanes
  if (ResultTy->getVectorNumElements() != 4)
    return nullptr;

  Value *Predicate = I->getArgOperand(2);
  // An all-true mask needs no predication
  if (match(Predicate, m_One()))
    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base,
                                   {ResultTy, Ptr->getType()},
                                   {Ptr, Builder.getInt32(0)});

  return Builder.CreateIntrinsic(
      Intrinsic::arm_mve_vldr_gather_base_predicated,
      {ResultTy, Ptr->getType(), Predicate->getType()},
      {Ptr, Builder.getInt32(0), Predicate});
}
|
||||
|
||||
// Emit a gather that loads from a scalar base plus a vector of offsets, if
// Ptr can be decomposed that way by checkGEP. Uses the result type and mask
// of the masked gather intrinsic call I. Returns the new intrinsic call, or
// nullptr if no base + offsets form was found or the element sizes do not
// correspond to a supported scale.
Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
    IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
  using namespace PatternMatch;
  Type *ResultTy = I->getType();

  Value *Offsets = nullptr;
  Value *Base = checkGEP(Offsets, ResultTy, Ptr, Builder);
  if (!Base)
    return nullptr;

  const int BaseElemBits =
      Base->getType()->getPointerElementType()->getPrimitiveSizeInBits();
  const int LaneBits = ResultTy->getScalarSizeInBits();

  // The scale operand is a shift amount: this can be a 32bit load scaled by 4
  // (shift 2), a 16bit load scaled by 2 (shift 1), or an 8bit, 16bit or 32bit
  // load scaled by 1 (shift 0) when the base points to bytes.
  unsigned Scale;
  if (BaseElemBits == 32 && LaneBits == 32)
    Scale = 2;
  else if (BaseElemBits == 16 && LaneBits == 16)
    Scale = 1;
  else if (BaseElemBits == 8)
    Scale = 0;
  else {
    LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. Can't"
                      << " create masked gather\n");
    return nullptr;
  }

  Value *Predicate = I->getArgOperand(2);
  // An all-true mask needs no predication
  if (match(Predicate, m_One()))
    return Builder.CreateIntrinsic(
        Intrinsic::arm_mve_vldr_gather_offset,
        {ResultTy, Base->getType(), Offsets->getType()},
        {Base, Offsets, Builder.getInt32(LaneBits), Builder.getInt32(Scale),
         Builder.getInt32(1)});

  return Builder.CreateIntrinsic(
      Intrinsic::arm_mve_vldr_gather_offset_predicated,
      {ResultTy, Base->getType(), Offsets->getType(), Predicate->getType()},
      {Base, Offsets, Builder.getInt32(LaneBits), Builder.getInt32(Scale),
       Builder.getInt32(1), Predicate});
}
|
||||
|
||||
bool MVEGatherScatterLowering::runOnFunction(Function &F) {
|
||||
if (!EnableMaskedGatherScatters)
|
||||
return false;
|
||||
|
@ -171,7 +295,7 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
|
|||
return false;
|
||||
|
||||
for (IntrinsicInst *I : Gathers)
|
||||
LowerGather(I);
|
||||
lowerGather(I);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,271 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16(i16* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8i16_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_i16(i16* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8f16_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
|
||||
%ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*>
|
||||
%gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
|
||||
ret <8 x half> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_half(half* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8f16_half:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds half, half* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
|
||||
ret <8 x half> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8i16_sext:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r1]
|
||||
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #1
|
||||
; CHECK-NEXT: vshl.i32 q1, q1, #1
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
||||
; CHECK-NEXT: vmov r2, s2
|
||||
; CHECK-NEXT: vmov r3, s3
|
||||
; CHECK-NEXT: vmov r5, s1
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov r1, s5
|
||||
; CHECK-NEXT: vmov r4, s7
|
||||
; CHECK-NEXT: ldrh.w r12, [r2]
|
||||
; CHECK-NEXT: vmov r2, s0
|
||||
; CHECK-NEXT: ldrh.w lr, [r3]
|
||||
; CHECK-NEXT: vmov r3, s6
|
||||
; CHECK-NEXT: ldrh r5, [r5]
|
||||
; CHECK-NEXT: ldrh r0, [r0]
|
||||
; CHECK-NEXT: ldrh r1, [r1]
|
||||
; CHECK-NEXT: ldrh r4, [r4]
|
||||
; CHECK-NEXT: ldrh r2, [r2]
|
||||
; CHECK-NEXT: ldrh r3, [r3]
|
||||
; CHECK-NEXT: vmov.16 q0[0], r2
|
||||
; CHECK-NEXT: vmov.16 q0[1], r5
|
||||
; CHECK-NEXT: vmov.16 q0[2], r12
|
||||
; CHECK-NEXT: vmov.16 q0[3], lr
|
||||
; CHECK-NEXT: vmov.16 q0[4], r0
|
||||
; CHECK-NEXT: vmov.16 q0[5], r1
|
||||
; CHECK-NEXT: vmov.16 q0[6], r3
|
||||
; CHECK-NEXT: vmov.16 q0[7], r4
|
||||
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.sext = sext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8f16_sext:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #1
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vmov r2, s5
|
||||
; CHECK-NEXT: vldr.16 s0, [r2]
|
||||
; CHECK-NEXT: vmov r3, s4
|
||||
; CHECK-NEXT: vmov r2, s0
|
||||
; CHECK-NEXT: vldr.16 s0, [r3]
|
||||
; CHECK-NEXT: vmov r3, s0
|
||||
; CHECK-NEXT: vmov.16 q0[0], r3
|
||||
; CHECK-NEXT: vmov.16 q0[1], r2
|
||||
; CHECK-NEXT: vmov r2, s6
|
||||
; CHECK-NEXT: vldr.16 s8, [r2]
|
||||
; CHECK-NEXT: vmov r2, s8
|
||||
; CHECK-NEXT: vmov.16 q0[2], r2
|
||||
; CHECK-NEXT: vmov r2, s7
|
||||
; CHECK-NEXT: vldr.16 s4, [r2]
|
||||
; CHECK-NEXT: vmov r2, s4
|
||||
; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
|
||||
; CHECK-NEXT: vmov.16 q0[3], r2
|
||||
; CHECK-NEXT: vshl.i32 q1, q1, #1
|
||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vldr.16 s8, [r0]
|
||||
; CHECK-NEXT: vmov r0, s8
|
||||
; CHECK-NEXT: vmov.16 q0[4], r0
|
||||
; CHECK-NEXT: vmov r0, s5
|
||||
; CHECK-NEXT: vldr.16 s8, [r0]
|
||||
; CHECK-NEXT: vmov r0, s8
|
||||
; CHECK-NEXT: vmov.16 q0[5], r0
|
||||
; CHECK-NEXT: vmov r0, s6
|
||||
; CHECK-NEXT: vldr.16 s8, [r0]
|
||||
; CHECK-NEXT: vmov r0, s8
|
||||
; CHECK-NEXT: vmov.16 q0[6], r0
|
||||
; CHECK-NEXT: vmov r0, s7
|
||||
; CHECK-NEXT: vldr.16 s4, [r0]
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov.16 q0[7], r0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.sext = sext <8 x i16> %offs to <8 x i32>
|
||||
%i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext
|
||||
%ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*>
|
||||
%gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
|
||||
ret <8 x half> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @unsigned_scaled_v8i16_i8(i16* %base, <8 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unsigned_scaled_v8i16_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
|
||||
%offs.zext = zext <8 x i8> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x half> @unsigned_scaled_v8f16_i8(i16* %base, <8 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unsigned_scaled_v8f16_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
|
||||
%offs.zext = zext <8 x i8> %offs to <8 x i32>
|
||||
%i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
|
||||
%ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*>
|
||||
%gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
|
||||
ret <8 x half> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru0t(i16* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8i16_i16_passthru0t:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> zeroinitializer)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru1t(i16* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8i16_i16_passthru1t:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru1f(i16* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8i16_i16_passthru1f:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: movw r2, #65487
|
||||
; CHECK-NEXT: vmov.i16 q0, #0x1
|
||||
; CHECK-NEXT: vmsr p0, r2
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vpst
|
||||
; CHECK-NEXT: vldrht.u16 q2, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: vpsel q0, q2, q0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru0f(i16* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8i16_i16_passthru0f:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: movw r2, #65523
|
||||
; CHECK-NEXT: vmsr p0, r2
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vpst
|
||||
; CHECK-NEXT: vldrht.u16 q0, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru_icmp0(i16* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp0:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vpt.s16 gt, q1, zr
|
||||
; CHECK-NEXT: vldrht.u16 q0, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
|
||||
%mask = icmp sgt <8 x i16> %offs, zeroinitializer
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %mask, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru_icmp1(i16* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp1:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmov.i16 q0, #0x1
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vpt.s16 gt, q1, zr
|
||||
; CHECK-NEXT: vldrht.u16 q2, [r0, q1, uxtw #1]
|
||||
; CHECK-NEXT: vpsel q0, q2, q0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext
|
||||
%mask = icmp sgt <8 x i16> %offs, zeroinitializer
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %mask, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1
|
||||
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1
|
||||
declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1
|
|
@ -0,0 +1,242 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: zext_unscaled_i8_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: vldrh.u32 q0, [r1]
|
||||
; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
||||
; CHECK-NEXT: vmov r2, s3
|
||||
; CHECK-NEXT: vmov r3, s1
|
||||
; CHECK-NEXT: vmov r5, s0
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov r1, s5
|
||||
; CHECK-NEXT: vmov r4, s7
|
||||
; CHECK-NEXT: ldrb.w r12, [r2]
|
||||
; CHECK-NEXT: vmov r2, s2
|
||||
; CHECK-NEXT: ldrb.w lr, [r3]
|
||||
; CHECK-NEXT: vmov r3, s6
|
||||
; CHECK-NEXT: ldrb r5, [r5]
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.16 q0[0], r5
|
||||
; CHECK-NEXT: ldrb r1, [r1]
|
||||
; CHECK-NEXT: vmov.16 q0[1], lr
|
||||
; CHECK-NEXT: ldrb r4, [r4]
|
||||
; CHECK-NEXT: ldrb r2, [r2]
|
||||
; CHECK-NEXT: ldrb r3, [r3]
|
||||
; CHECK-NEXT: vmov.16 q0[2], r2
|
||||
; CHECK-NEXT: vmov.16 q0[3], r12
|
||||
; CHECK-NEXT: vmov.16 q0[4], r0
|
||||
; CHECK-NEXT: vmov.16 q0[5], r1
|
||||
; CHECK-NEXT: vmov.16 q0[6], r3
|
||||
; CHECK-NEXT: vmov.16 q0[7], r4
|
||||
; CHECK-NEXT: vmovlb.u8 q0, q0
|
||||
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
|
||||
%gather.zext = zext <8 x i8> %gather to <8 x i16>
|
||||
ret <8 x i16> %gather.zext
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: sext_unscaled_i8_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: vldrh.u32 q0, [r1]
|
||||
; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
||||
; CHECK-NEXT: vmov r2, s3
|
||||
; CHECK-NEXT: vmov r3, s1
|
||||
; CHECK-NEXT: vmov r5, s0
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov r1, s5
|
||||
; CHECK-NEXT: vmov r4, s7
|
||||
; CHECK-NEXT: ldrb.w r12, [r2]
|
||||
; CHECK-NEXT: vmov r2, s2
|
||||
; CHECK-NEXT: ldrb.w lr, [r3]
|
||||
; CHECK-NEXT: vmov r3, s6
|
||||
; CHECK-NEXT: ldrb r5, [r5]
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.16 q0[0], r5
|
||||
; CHECK-NEXT: ldrb r1, [r1]
|
||||
; CHECK-NEXT: vmov.16 q0[1], lr
|
||||
; CHECK-NEXT: ldrb r4, [r4]
|
||||
; CHECK-NEXT: ldrb r2, [r2]
|
||||
; CHECK-NEXT: ldrb r3, [r3]
|
||||
; CHECK-NEXT: vmov.16 q0[2], r2
|
||||
; CHECK-NEXT: vmov.16 q0[3], r12
|
||||
; CHECK-NEXT: vmov.16 q0[4], r0
|
||||
; CHECK-NEXT: vmov.16 q0[5], r1
|
||||
; CHECK-NEXT: vmov.16 q0[6], r3
|
||||
; CHECK-NEXT: vmov.16 q0[7], r4
|
||||
; CHECK-NEXT: vmovlb.s8 q0, q0
|
||||
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
|
||||
%gather.sext = sext <8 x i8> %gather to <8 x i16>
|
||||
ret <8 x i16> %gather.sext
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x i16> @unscaled_i16_i16(i8* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_i16_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
|
||||
%ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <8 x half> @unscaled_f16_i16(i8* %base, <8 x i16>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_f16_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i16>, <8 x i16>* %offptr, align 2
|
||||
%offs.zext = zext <8 x i16> %offs to <8 x i32>
|
||||
%byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
|
||||
%ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
|
||||
%gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
|
||||
ret <8 x half> %gather
|
||||
}
|
||||
|
||||
; Gather of eight i8 elements whose result is zero-extended to <8 x i16>.
; The eight i8 offsets are loaded from %offptr, zero-extended to i32 and used
; as byte offsets from %base (the GEP is over i8, so no scaling is applied).
; The gather uses alignment 1 and an all-true mask.
; The expected assembly below contains no vector gather: each lane address is
; moved to a core register, loaded with a scalar ldrb, inserted with vmov.16,
; and the result is widened with vmovlb.u8.
define arm_aapcs_vfpcc <8 x i16> @zext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) {
|
||||
; CHECK-LABEL: zext_unsigned_unscaled_i8_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r1]
|
||||
; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
||||
; CHECK-NEXT: vmov r2, s3
|
||||
; CHECK-NEXT: vmov r3, s1
|
||||
; CHECK-NEXT: vmov r5, s0
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov r1, s5
|
||||
; CHECK-NEXT: vmov r4, s7
|
||||
; CHECK-NEXT: ldrb.w r12, [r2]
|
||||
; CHECK-NEXT: vmov r2, s2
|
||||
; CHECK-NEXT: ldrb.w lr, [r3]
|
||||
; CHECK-NEXT: vmov r3, s6
|
||||
; CHECK-NEXT: ldrb r5, [r5]
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.16 q0[0], r5
|
||||
; CHECK-NEXT: ldrb r1, [r1]
|
||||
; CHECK-NEXT: vmov.16 q0[1], lr
|
||||
; CHECK-NEXT: ldrb r4, [r4]
|
||||
; CHECK-NEXT: ldrb r2, [r2]
|
||||
; CHECK-NEXT: ldrb r3, [r3]
|
||||
; CHECK-NEXT: vmov.16 q0[2], r2
|
||||
; CHECK-NEXT: vmov.16 q0[3], r12
|
||||
; CHECK-NEXT: vmov.16 q0[4], r0
|
||||
; CHECK-NEXT: vmov.16 q0[5], r1
|
||||
; CHECK-NEXT: vmov.16 q0[6], r3
|
||||
; CHECK-NEXT: vmov.16 q0[7], r4
|
||||
; CHECK-NEXT: vmovlb.u8 q0, q0
|
||||
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
||||
entry:
|
||||
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
|
||||
%offs.zext = zext <8 x i8> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
|
||||
%gather.zext = zext <8 x i8> %gather to <8 x i16>
|
||||
ret <8 x i16> %gather.zext
|
||||
}
|
||||
|
||||
; Gather of eight i8 elements whose result is sign-extended to <8 x i16>.
; The offsets are still zero-extended (unsigned) i8 values used as byte
; offsets from %base; only the extension of the gathered data is signed.
; The gather uses alignment 1 and an all-true mask.
; The expected assembly below contains no vector gather: eight scalar ldrb
; loads are inserted lane by lane with vmov.16 and the result is widened with
; vmovlb.s8.
define arm_aapcs_vfpcc <8 x i16> @sext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) {
|
||||
; CHECK-LABEL: sext_unsigned_unscaled_i8_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r1]
|
||||
; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
||||
; CHECK-NEXT: vmov r2, s3
|
||||
; CHECK-NEXT: vmov r3, s1
|
||||
; CHECK-NEXT: vmov r5, s0
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov r1, s5
|
||||
; CHECK-NEXT: vmov r4, s7
|
||||
; CHECK-NEXT: ldrb.w r12, [r2]
|
||||
; CHECK-NEXT: vmov r2, s2
|
||||
; CHECK-NEXT: ldrb.w lr, [r3]
|
||||
; CHECK-NEXT: vmov r3, s6
|
||||
; CHECK-NEXT: ldrb r5, [r5]
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.16 q0[0], r5
|
||||
; CHECK-NEXT: ldrb r1, [r1]
|
||||
; CHECK-NEXT: vmov.16 q0[1], lr
|
||||
; CHECK-NEXT: ldrb r4, [r4]
|
||||
; CHECK-NEXT: ldrb r2, [r2]
|
||||
; CHECK-NEXT: ldrb r3, [r3]
|
||||
; CHECK-NEXT: vmov.16 q0[2], r2
|
||||
; CHECK-NEXT: vmov.16 q0[3], r12
|
||||
; CHECK-NEXT: vmov.16 q0[4], r0
|
||||
; CHECK-NEXT: vmov.16 q0[5], r1
|
||||
; CHECK-NEXT: vmov.16 q0[6], r3
|
||||
; CHECK-NEXT: vmov.16 q0[7], r4
|
||||
; CHECK-NEXT: vmovlb.s8 q0, q0
|
||||
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
||||
entry:
|
||||
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
|
||||
%offs.zext = zext <8 x i8> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
|
||||
%gather.sext = sext <8 x i8> %gather to <8 x i16>
|
||||
ret <8 x i16> %gather.sext
|
||||
}
|
||||
|
||||
; Gather of eight i16 elements addressed by zero-extended i8 byte offsets.
; The GEP is over i8 (unscaled offsets) and the resulting pointers are bitcast
; to i16* before the gather, which uses alignment 2 and an all-true mask.
; The expected assembly is a widening load of the offsets (vldrb.u16) followed
; by a single vldrh.u16 gather from the base register plus the offset vector.
define arm_aapcs_vfpcc <8 x i16> @unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unsigned_unscaled_i16_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
|
||||
%offs.zext = zext <8 x i8> %offs to <8 x i32>
|
||||
%byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
|
||||
%ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*>
|
||||
%gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
|
||||
ret <8 x i16> %gather
|
||||
}
|
||||
|
||||
; Gather of eight half elements addressed by zero-extended i8 byte offsets.
; The GEP is over i8 (unscaled offsets) and the resulting pointers are bitcast
; to half* before the gather, which uses alignment 2 and an all-true mask.
; The expected assembly is a widening load of the offsets (vldrb.u16) followed
; by a single vldrh.u16 gather from the base register plus the offset vector.
define arm_aapcs_vfpcc <8 x half> @unsigned_unscaled_f16_i8(i8* %base, <8 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unsigned_unscaled_f16_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.u16 q1, [r1]
|
||||
; CHECK-NEXT: vldrh.u16 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
|
||||
%offs.zext = zext <8 x i8> %offs to <8 x i32>
|
||||
%byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
|
||||
%ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*>
|
||||
%gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
|
||||
ret <8 x half> %gather
|
||||
}
|
||||
|
||||
; Declarations of the eight-lane masked gather intrinsics (i8, i16 and half
; element types) called by the tests above. Per the LLVM Language Reference the
; operands are: vector of pointers, alignment, mask, pass-through value.
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1
|
||||
declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1
|
||||
declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1
|
|
@ -60,10 +60,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32(i32* %base, <4 x i32>* %offptr) {
|
||||
; CHECK-LABEL: scaled_i32_i32:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
|
||||
|
@ -77,10 +75,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @scaled_f32_i32(i32* %base, <4 x i32>* %offptr) {
|
||||
; CHECK-LABEL: scaled_f32_i32:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
|
||||
|
@ -93,10 +89,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i16(i32* %base, <4 x i16>* %offptr) {
|
||||
; CHECK-LABEL: unsigned_scaled_b_i32_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrh.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
|
||||
|
@ -109,10 +103,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i16(i32* %base, <4 x i16>* %offptr) {
|
||||
; CHECK-LABEL: signed_scaled_i32_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrh.s32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
|
||||
|
@ -125,10 +117,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) {
|
||||
; CHECK-LABEL: a_unsigned_scaled_f32_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrh.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
|
||||
|
@ -142,10 +132,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) {
|
||||
; CHECK-LABEL: b_signed_scaled_f32_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrh.s32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
|
||||
|
@ -275,10 +263,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i8(i32* %base, <4 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unsigned_scaled_b_i32_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrb.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
||||
|
@ -291,10 +277,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i8(i32* %base, <4 x i8>* %offptr) {
|
||||
; CHECK-LABEL: signed_scaled_i32_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.s32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrb.s32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
||||
|
@ -307,10 +291,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) {
|
||||
; CHECK-LABEL: a_unsigned_scaled_f32_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrb.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
||||
|
@ -324,10 +306,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) {
|
||||
; CHECK-LABEL: b_signed_scaled_f32_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.s32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrb.s32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
||||
|
|
|
@ -116,9 +116,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_i32_i32:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
|
||||
|
@ -131,9 +130,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @unscaled_f32_i32(i8* %base, <4 x i32>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_f32_i32:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
|
||||
|
@ -146,9 +144,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @unsigned_unscaled_b_i32_i16(i8* %base, <4 x i16>* %offptr) {
|
||||
; CHECK-LABEL: unsigned_unscaled_b_i32_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrh.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
|
||||
|
@ -162,9 +159,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @signed_unscaled_i32_i16(i8* %base, <4 x i16>* %offptr) {
|
||||
; CHECK-LABEL: signed_unscaled_i32_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrh.s32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
|
||||
|
@ -178,9 +174,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @a_unsigned_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr) {
|
||||
; CHECK-LABEL: a_unsigned_unscaled_f32_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.u32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrh.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
|
||||
|
@ -194,9 +189,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @b_signed_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr) {
|
||||
; CHECK-LABEL: b_signed_unscaled_f32_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrh.s32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i16>, <4 x i16>* %offptr, align 2
|
||||
|
@ -442,9 +436,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @unsigned_unscaled_b_i32_i8(i8* %base, <4 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unsigned_unscaled_b_i32_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrb.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
||||
|
@ -458,9 +451,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x i32> @signed_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr) {
|
||||
; CHECK-LABEL: signed_unscaled_i32_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.s32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrb.s32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
||||
|
@ -474,9 +466,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @a_unsigned_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr) {
|
||||
; CHECK-LABEL: a_unsigned_unscaled_f32_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrb.u32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
||||
|
@ -490,9 +481,8 @@ entry:
|
|||
define arm_aapcs_vfpcc <4 x float> @b_signed_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr) {
|
||||
; CHECK-LABEL: b_signed_unscaled_f32_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.s32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: vldrb.s32 q1, [r1]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <4 x i8>, <4 x i8>* %offptr, align 1
|
||||
|
@ -735,6 +725,31 @@ entry:
|
|||
ret <4 x i32> %gather.sext
|
||||
}
|
||||
|
||||
; Gather of four i32 elements from a vector of pointers, each advanced by a
; constant index of 4 elements (16 bytes, materialised as #0x10 below).
; The gather is called with alignment 1 and an all-true mask.
; The expected assembly below does not use a vector gather: every lane address
; is moved to a core register and loaded with a scalar ldr.
; NOTE(review): presumably the alignment of 1 is what prevents the vector
; form named in the next line from being selected - confirm against the pass.
; VLDRW.u32 Qd, [P, 4]
|
||||
define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) {
|
||||
; CHECK-LABEL: qi4:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmov.i32 q1, #0x10
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, q1
|
||||
; CHECK-NEXT: vmov r0, s0
|
||||
; CHECK-NEXT: vmov r3, s1
|
||||
; CHECK-NEXT: vmov r1, s2
|
||||
; CHECK-NEXT: vmov r2, s3
|
||||
; CHECK-NEXT: ldr r0, [r0]
|
||||
; CHECK-NEXT: ldr r3, [r3]
|
||||
; CHECK-NEXT: vmov.32 q0[0], r0
|
||||
; CHECK-NEXT: ldr r1, [r1]
|
||||
; CHECK-NEXT: vmov.32 q0[1], r3
|
||||
; CHECK-NEXT: ldr r2, [r2]
|
||||
; CHECK-NEXT: vmov.32 q0[2], r1
|
||||
; CHECK-NEXT: vmov.32 q0[3], r2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%g = getelementptr inbounds i32, <4 x i32*> %p, i32 4
|
||||
%gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %g, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
|
||||
ret <4 x i32> %gather
|
||||
}
|
||||
|
||||
; Declarations of the four-lane masked gather intrinsics for i8, i16 and i32
; element types. Per the LLVM Language Reference the operands are: vector of
; pointers, alignment, mask, pass-through value.
declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>)
|
||||
declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>)
|
||||
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
|
||||
|
|
|
@ -0,0 +1,370 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s
|
||||
|
||||
; Gather of sixteen i8 elements addressed by zero-extended i8 byte offsets
; from %base, with alignment 1 and an all-true mask.
; The expected assembly loads the offset vector with vldrb.u8 and performs a
; single vldrb.u8 gather from the base register plus the offset vector.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_v16i8_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vldrb.u8 q1, [r1]
|
||||
; CHECK-NEXT: vldrb.u8 q0, [r0, q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
|
||||
%offs.zext = zext <16 x i8> %offs to <16 x i32>
|
||||
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
|
||||
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
|
||||
ret <16 x i8> %gather
|
||||
}
|
||||
|
||||
; Gather of eight i8 elements (returned as <8 x i8>, without extension)
; addressed by zero-extended i8 byte offsets from %base, with alignment 1 and
; an all-true mask.
; The expected assembly below contains no vector gather: the lane addresses
; are formed with vadd.i32, each element is loaded with a scalar ldrb and
; inserted with vmov.16.
define arm_aapcs_vfpcc <8 x i8> @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_v8i8_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r7, lr}
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r1]
|
||||
; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vadd.i32 q1, q1, r0
|
||||
; CHECK-NEXT: vmov r2, s2
|
||||
; CHECK-NEXT: vmov r3, s3
|
||||
; CHECK-NEXT: vmov r5, s1
|
||||
; CHECK-NEXT: vmov r0, s4
|
||||
; CHECK-NEXT: vmov r1, s5
|
||||
; CHECK-NEXT: vmov r4, s7
|
||||
; CHECK-NEXT: ldrb.w r12, [r2]
|
||||
; CHECK-NEXT: vmov r2, s0
|
||||
; CHECK-NEXT: ldrb.w lr, [r3]
|
||||
; CHECK-NEXT: vmov r3, s6
|
||||
; CHECK-NEXT: ldrb r5, [r5]
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: ldrb r1, [r1]
|
||||
; CHECK-NEXT: ldrb r4, [r4]
|
||||
; CHECK-NEXT: ldrb r2, [r2]
|
||||
; CHECK-NEXT: ldrb r3, [r3]
|
||||
; CHECK-NEXT: vmov.16 q0[0], r2
|
||||
; CHECK-NEXT: vmov.16 q0[1], r5
|
||||
; CHECK-NEXT: vmov.16 q0[2], r12
|
||||
; CHECK-NEXT: vmov.16 q0[3], lr
|
||||
; CHECK-NEXT: vmov.16 q0[4], r0
|
||||
; CHECK-NEXT: vmov.16 q0[5], r1
|
||||
; CHECK-NEXT: vmov.16 q0[6], r3
|
||||
; CHECK-NEXT: vmov.16 q0[7], r4
|
||||
; CHECK-NEXT: pop {r4, r5, r7, pc}
|
||||
entry:
|
||||
%offs = load <8 x i8>, <8 x i8>* %offptr, align 1
|
||||
%offs.zext = zext <8 x i8> %offs to <8 x i32>
|
||||
%ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
|
||||
%gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
|
||||
ret <8 x i8> %gather
|
||||
}
|
||||
|
||||
; Gather of two i8 elements addressed by zero-extended i8 byte offsets from
; %base, with alignment 1 and an all-true mask.
; The expected assembly below contains no vector gather: the two offsets are
; loaded and masked with #0xff, then each element is loaded with a scalar
; ldrb using a register offset and inserted into 32-bit lanes 0 and 2.
define arm_aapcs_vfpcc <2 x i8> @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_v2i8_i8:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: ldrb r2, [r1]
|
||||
; CHECK-NEXT: vmov.i32 q0, #0xff
|
||||
; CHECK-NEXT: ldrb r1, [r1, #1]
|
||||
; CHECK-NEXT: vmov.32 q1[0], r2
|
||||
; CHECK-NEXT: vmov.32 q1[2], r1
|
||||
; CHECK-NEXT: vand q0, q1, q0
|
||||
; CHECK-NEXT: vmov r1, s0
|
||||
; CHECK-NEXT: vmov r2, s2
|
||||
; CHECK-NEXT: ldrb r1, [r0, r1]
|
||||
; CHECK-NEXT: ldrb r0, [r0, r2]
|
||||
; CHECK-NEXT: vmov.32 q0[0], r1
|
||||
; CHECK-NEXT: vmov.32 q0[2], r0
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%offs = load <2 x i8>, <2 x i8>* %offptr, align 1
|
||||
%offs.zext = zext <2 x i8> %offs to <2 x i32>
|
||||
%ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext
|
||||
%gather = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i8> undef)
|
||||
ret <2 x i8> %gather
|
||||
}
|
||||
|
||||
; Gather of sixteen i8 elements addressed by SIGN-extended i8 byte offsets
; from %base, with alignment 1 and an all-true mask.
; The expected assembly below contains no vector gather: the offsets are
; loaded four at a time with vldrb.s32, added to the base, and each of the
; sixteen elements is loaded with a scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably the signed offsets are what rule out the vector
; gather form here - confirm against the lowering pass.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_v16i8_sext:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r6, lr}
|
||||
; CHECK-NEXT: vldrb.s32 q0, [r1, #8]
|
||||
; CHECK-NEXT: vldrb.s32 q2, [r1, #4]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrb.s32 q0, [r1, #12]
|
||||
; CHECK-NEXT: vmov r2, s6
|
||||
; CHECK-NEXT: vadd.i32 q2, q2, r0
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vmov r6, s4
|
||||
; CHECK-NEXT: vmov r3, s2
|
||||
; CHECK-NEXT: vmov r4, s3
|
||||
; CHECK-NEXT: vmov r5, s7
|
||||
; CHECK-NEXT: ldrb.w r12, [r2]
|
||||
; CHECK-NEXT: vmov r2, s0
|
||||
; CHECK-NEXT: ldrb r6, [r6]
|
||||
; CHECK-NEXT: ldrb r3, [r3]
|
||||
; CHECK-NEXT: ldrb r4, [r4]
|
||||
; CHECK-NEXT: ldrb r5, [r5]
|
||||
; CHECK-NEXT: ldrb.w lr, [r2]
|
||||
; CHECK-NEXT: vmov r2, s1
|
||||
; CHECK-NEXT: vldrb.s32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q3, q0, r0
|
||||
; CHECK-NEXT: vmov r0, s12
|
||||
; CHECK-NEXT: ldrb r2, [r2]
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[0], r0
|
||||
; CHECK-NEXT: vmov r0, s13
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[1], r0
|
||||
; CHECK-NEXT: vmov r0, s14
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[2], r0
|
||||
; CHECK-NEXT: vmov r0, s15
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[3], r0
|
||||
; CHECK-NEXT: vmov r0, s8
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[4], r0
|
||||
; CHECK-NEXT: vmov r0, s9
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[5], r0
|
||||
; CHECK-NEXT: vmov r0, s10
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[6], r0
|
||||
; CHECK-NEXT: vmov r0, s11
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[7], r0
|
||||
; CHECK-NEXT: vmov r0, s5
|
||||
; CHECK-NEXT: vmov.8 q0[8], r6
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[9], r0
|
||||
; CHECK-NEXT: vmov.8 q0[10], r12
|
||||
; CHECK-NEXT: vmov.8 q0[11], r5
|
||||
; CHECK-NEXT: vmov.8 q0[12], lr
|
||||
; CHECK-NEXT: vmov.8 q0[13], r2
|
||||
; CHECK-NEXT: vmov.8 q0[14], r3
|
||||
; CHECK-NEXT: vmov.8 q0[15], r4
|
||||
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
||||
entry:
|
||||
%offs = load <16 x i8>, <16 x i8>* %offptr, align 1
|
||||
%offs.sext = sext <16 x i8> %offs to <16 x i32>
|
||||
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
|
||||
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
|
||||
ret <16 x i8> %gather
|
||||
}
|
||||
|
||||
; Gather of sixteen i8 elements addressed by sign-extended i16 byte offsets
; from %base, with alignment 1 and an all-true mask.
; The expected assembly below contains no vector gather: the offsets are
; loaded four at a time with vldrh.s32, added to the base, and each of the
; sixteen elements is loaded with a scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably the 16-bit signed offsets are what rule out the
; vector gather form here - confirm against the lowering pass.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_v16i8_i16:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r6, lr}
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r1, #16]
|
||||
; CHECK-NEXT: vldrh.s32 q2, [r1, #8]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r1, #24]
|
||||
; CHECK-NEXT: vmov r2, s6
|
||||
; CHECK-NEXT: vadd.i32 q2, q2, r0
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vmov r6, s4
|
||||
; CHECK-NEXT: vmov r3, s2
|
||||
; CHECK-NEXT: vmov r4, s3
|
||||
; CHECK-NEXT: vmov r5, s7
|
||||
; CHECK-NEXT: ldrb.w r12, [r2]
|
||||
; CHECK-NEXT: vmov r2, s0
|
||||
; CHECK-NEXT: ldrb r6, [r6]
|
||||
; CHECK-NEXT: ldrb r3, [r3]
|
||||
; CHECK-NEXT: ldrb r4, [r4]
|
||||
; CHECK-NEXT: ldrb r5, [r5]
|
||||
; CHECK-NEXT: ldrb.w lr, [r2]
|
||||
; CHECK-NEXT: vmov r2, s1
|
||||
; CHECK-NEXT: vldrh.s32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q3, q0, r0
|
||||
; CHECK-NEXT: vmov r0, s12
|
||||
; CHECK-NEXT: ldrb r2, [r2]
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[0], r0
|
||||
; CHECK-NEXT: vmov r0, s13
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[1], r0
|
||||
; CHECK-NEXT: vmov r0, s14
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[2], r0
|
||||
; CHECK-NEXT: vmov r0, s15
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[3], r0
|
||||
; CHECK-NEXT: vmov r0, s8
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[4], r0
|
||||
; CHECK-NEXT: vmov r0, s9
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[5], r0
|
||||
; CHECK-NEXT: vmov r0, s10
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[6], r0
|
||||
; CHECK-NEXT: vmov r0, s11
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[7], r0
|
||||
; CHECK-NEXT: vmov r0, s5
|
||||
; CHECK-NEXT: vmov.8 q0[8], r6
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[9], r0
|
||||
; CHECK-NEXT: vmov.8 q0[10], r12
|
||||
; CHECK-NEXT: vmov.8 q0[11], r5
|
||||
; CHECK-NEXT: vmov.8 q0[12], lr
|
||||
; CHECK-NEXT: vmov.8 q0[13], r2
|
||||
; CHECK-NEXT: vmov.8 q0[14], r3
|
||||
; CHECK-NEXT: vmov.8 q0[15], r4
|
||||
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
||||
entry:
|
||||
%offs = load <16 x i16>, <16 x i16>* %offptr, align 2
|
||||
%offs.sext = sext <16 x i16> %offs to <16 x i32>
|
||||
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
|
||||
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
|
||||
ret <16 x i8> %gather
|
||||
}
|
||||
|
||||
; Gather of sixteen i8 elements where the zero-extended i8 offsets index an
; i32 base pointer, so the GEP scales each offset by 4 bytes (the vshl.i32 #2
; in the expected assembly); the pointers are then bitcast to i8*.
; The gather uses alignment 1 and an all-true mask.
; The expected assembly below contains no vector gather: each of the sixteen
; elements is loaded with a scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably the scaling by 4 does not match any byte gather
; addressing form - confirm against the lowering pass.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_v16i8_scaled:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r6, lr}
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r1, #8]
|
||||
; CHECK-NEXT: vldrb.u32 q2, [r1, #4]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vshl.i32 q2, q2, #2
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r1, #12]
|
||||
; CHECK-NEXT: vmov r2, s6
|
||||
; CHECK-NEXT: vadd.i32 q2, q2, r0
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vmov r6, s4
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vmov r5, s7
|
||||
; CHECK-NEXT: vmov r3, s2
|
||||
; CHECK-NEXT: vmov r4, s3
|
||||
; CHECK-NEXT: ldrb.w r12, [r2]
|
||||
; CHECK-NEXT: vmov r2, s0
|
||||
; CHECK-NEXT: ldrb r6, [r6]
|
||||
; CHECK-NEXT: ldrb r5, [r5]
|
||||
; CHECK-NEXT: ldrb r3, [r3]
|
||||
; CHECK-NEXT: ldrb r4, [r4]
|
||||
; CHECK-NEXT: ldrb.w lr, [r2]
|
||||
; CHECK-NEXT: vmov r2, s1
|
||||
; CHECK-NEXT: vldrb.u32 q0, [r1]
|
||||
; CHECK-NEXT: vshl.i32 q0, q0, #2
|
||||
; CHECK-NEXT: vadd.i32 q3, q0, r0
|
||||
; CHECK-NEXT: vmov r0, s12
|
||||
; CHECK-NEXT: ldrb r2, [r2]
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[0], r0
|
||||
; CHECK-NEXT: vmov r0, s13
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[1], r0
|
||||
; CHECK-NEXT: vmov r0, s14
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[2], r0
|
||||
; CHECK-NEXT: vmov r0, s15
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[3], r0
|
||||
; CHECK-NEXT: vmov r0, s8
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[4], r0
|
||||
; CHECK-NEXT: vmov r0, s9
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[5], r0
|
||||
; CHECK-NEXT: vmov r0, s10
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[6], r0
|
||||
; CHECK-NEXT: vmov r0, s11
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[7], r0
|
||||
; CHECK-NEXT: vmov r0, s5
|
||||
; CHECK-NEXT: vmov.8 q0[8], r6
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[9], r0
|
||||
; CHECK-NEXT: vmov.8 q0[10], r12
|
||||
; CHECK-NEXT: vmov.8 q0[11], r5
|
||||
; CHECK-NEXT: vmov.8 q0[12], lr
|
||||
; CHECK-NEXT: vmov.8 q0[13], r2
|
||||
; CHECK-NEXT: vmov.8 q0[14], r3
|
||||
; CHECK-NEXT: vmov.8 q0[15], r4
|
||||
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
||||
entry:
|
||||
%offs = load <16 x i8>, <16 x i8>* %offptr, align 4
|
||||
%offs.zext = zext <16 x i8> %offs to <16 x i32>
|
||||
%ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext
|
||||
%ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*>
|
||||
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
|
||||
ret <16 x i8> %gather
|
||||
}
|
||||
|
||||
; Gather of sixteen i8 elements addressed by full-width i32 byte offsets that
; are loaded directly as <16 x i32> (no zext/sext of a narrower type).
; The gather uses alignment 1 and an all-true mask.
; The expected assembly below contains no vector gather: the offsets are
; loaded four at a time with vldrw.u32, added to the base, and each of the
; sixteen elements is loaded with a scalar ldrb and inserted with vmov.8.
; NOTE(review): presumably sixteen 32-bit offsets cannot be held in a single
; offset vector for the byte gather - confirm against the lowering pass.
define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr) {
|
||||
; CHECK-LABEL: unscaled_v16i8_i8_next:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, r5, r6, lr}
|
||||
; CHECK-NEXT: push {r4, r5, r6, lr}
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r1, #32]
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r1, #16]
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, r0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r1, #48]
|
||||
; CHECK-NEXT: vmov r2, s6
|
||||
; CHECK-NEXT: vadd.i32 q2, q2, r0
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r0
|
||||
; CHECK-NEXT: vmov r6, s4
|
||||
; CHECK-NEXT: vmov r3, s2
|
||||
; CHECK-NEXT: vmov r4, s3
|
||||
; CHECK-NEXT: vmov r5, s7
|
||||
; CHECK-NEXT: ldrb.w r12, [r2]
|
||||
; CHECK-NEXT: vmov r2, s0
|
||||
; CHECK-NEXT: ldrb r6, [r6]
|
||||
; CHECK-NEXT: ldrb r3, [r3]
|
||||
; CHECK-NEXT: ldrb r4, [r4]
|
||||
; CHECK-NEXT: ldrb r5, [r5]
|
||||
; CHECK-NEXT: ldrb.w lr, [r2]
|
||||
; CHECK-NEXT: vmov r2, s1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
||||
; CHECK-NEXT: vadd.i32 q3, q0, r0
|
||||
; CHECK-NEXT: vmov r0, s12
|
||||
; CHECK-NEXT: ldrb r2, [r2]
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[0], r0
|
||||
; CHECK-NEXT: vmov r0, s13
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[1], r0
|
||||
; CHECK-NEXT: vmov r0, s14
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[2], r0
|
||||
; CHECK-NEXT: vmov r0, s15
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[3], r0
|
||||
; CHECK-NEXT: vmov r0, s8
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[4], r0
|
||||
; CHECK-NEXT: vmov r0, s9
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[5], r0
|
||||
; CHECK-NEXT: vmov r0, s10
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[6], r0
|
||||
; CHECK-NEXT: vmov r0, s11
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[7], r0
|
||||
; CHECK-NEXT: vmov r0, s5
|
||||
; CHECK-NEXT: vmov.8 q0[8], r6
|
||||
; CHECK-NEXT: ldrb r0, [r0]
|
||||
; CHECK-NEXT: vmov.8 q0[9], r0
|
||||
; CHECK-NEXT: vmov.8 q0[10], r12
|
||||
; CHECK-NEXT: vmov.8 q0[11], r5
|
||||
; CHECK-NEXT: vmov.8 q0[12], lr
|
||||
; CHECK-NEXT: vmov.8 q0[13], r2
|
||||
; CHECK-NEXT: vmov.8 q0[14], r3
|
||||
; CHECK-NEXT: vmov.8 q0[15], r4
|
||||
; CHECK-NEXT: pop {r4, r5, r6, pc}
|
||||
entry:
|
||||
%offs = load <16 x i32>, <16 x i32>* %offptr, align 4
|
||||
%ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs
|
||||
%gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
|
||||
ret <16 x i8> %gather
|
||||
}
|
||||
|
||||
; Declarations of the i8 masked gather intrinsics (16, 8 and 2 lanes) called
; by the tests above. Per the LLVM Language Reference the operands are: vector
; of pointers, alignment, mask, pass-through value.
declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>)
|
||||
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)
|
||||
declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>)
|
|
@ -829,6 +829,19 @@ for.end: ; preds = %vector.body, %entry
|
|||
ret void
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x i32*> %p) {
|
||||
; CHECK-LABEL: qi4:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: vmov.i32 q1, #0x10
|
||||
; CHECK-NEXT: vadd.i32 q1, q0, q1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [q1]
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%g = getelementptr inbounds i32, <4 x i32*> %p, i32 4
|
||||
%gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %g, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
|
||||
ret <4 x i32> %gather
|
||||
}
|
||||
|
||||
declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
|
||||
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
|
||||
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck --check-prefix NOGATSCAT %s
|
||||
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-mve -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck --check-prefix NOMVE %s
|
||||
|
||||
define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr) {
|
||||
; NOGATSCAT-LABEL: unscaled_i32_i32:
|
||||
; NOGATSCAT: @ %bb.0: @ %entry
|
||||
; NOGATSCAT-NEXT: vldrw.u32 q0, [r1]
|
||||
; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0
|
||||
; NOGATSCAT-NEXT: vldrw.u32 q0, [r1]
|
||||
; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0
|
||||
; NOGATSCAT-NEXT: vmov r0, s0
|
||||
; NOGATSCAT-NEXT: vmov r3, s1
|
||||
; NOGATSCAT-NEXT: vmov r1, s2
|
||||
|
@ -19,19 +20,20 @@ define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr
|
|||
; NOGATSCAT-NEXT: vmov.32 q0[2], r1
|
||||
; NOGATSCAT-NEXT: vmov.32 q0[3], r2
|
||||
; NOGATSCAT-NEXT: bx lr
|
||||
|
||||
;
|
||||
; NOMVE-LABEL: unscaled_i32_i32:
|
||||
; NOMVE: @ %bb.0: @ %entry
|
||||
; NOMVE-NEXT: .save {r4, lr}
|
||||
; NOMVE-NEXT: push {r4, lr}
|
||||
; NOMVE-NEXT: ldm.w r1, {r2, r3, lr}
|
||||
; NOMVE-NEXT: ldr r4, [r1, #12]
|
||||
; NOMVE-NEXT: ldr.w r12, [r0, r2]
|
||||
; NOMVE-NEXT: ldr r1, [r0, r3]
|
||||
; NOMVE-NEXT: ldr.w r2, [r0, lr]
|
||||
; NOMVE-NEXT: ldr r3, [r0, r4]
|
||||
; NOMVE-NEXT: mov r0, r12
|
||||
; NOMVE-NEXT: pop {r4, pc}
|
||||
; NOMVE-NEXT: .save {r4, lr}
|
||||
; NOMVE-NEXT: push {r4, lr}
|
||||
; NOMVE-NEXT: ldm.w r1, {r2, r3, lr}
|
||||
; NOMVE-NEXT: ldr r4, [r1, #12]
|
||||
; NOMVE-NEXT: ldr.w r12, [r0, r2]
|
||||
; NOMVE-NEXT: ldr r1, [r0, r3]
|
||||
; NOMVE-NEXT: ldr.w r2, [r0, lr]
|
||||
; NOMVE-NEXT: ldr r3, [r0, r4]
|
||||
; NOMVE-NEXT: mov r0, r12
|
||||
; NOMVE-NEXT: pop {r4, pc}
|
||||
|
||||
|
||||
entry:
|
||||
%offs = load <4 x i32>, <4 x i32>* %offptr, align 4
|
||||
|
|
Loading…
Reference in New Issue