[VP] Implementing expansion pass for VP load and store.

Added a function to the ExpandVectorPredication pass to handle VP loads
and stores; a short before/after sketch of the expansion is included below.

Reviewed By: simoll

Differential Revision: https://reviews.llvm.org/D109584
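In short, a VP load or store whose mask is all-true and whose vector length covers the whole vector is expanded to a plain load/store; otherwise it is expanded to the corresponding llvm.masked.load/llvm.masked.store intrinsic. A minimal before/after sketch for a fixed-width load, mirroring the new tests (value names are illustrative):

  ; before
  %x = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> <i1 true, i1 true>, i32 2)
  ; after (all-true mask, full vector length: plain load)
  %x = load <2 x i64>, <2 x i64>* %ptr, align 16

  ; before
  %y = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 2)
  ; after (non-trivial mask: masked load)
  %y = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %ptr, i32 1, <2 x i1> %m, <2 x i64> undef)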
Author: Lorenzo Albano
Date: 2022-07-14 16:15:48 +02:00 (committed by Simon Moll)
Commit: f390781cec (parent 7fa1c32634)
2 changed files with 270 additions and 2 deletions

llvm/lib/CodeGen/ExpandVectorPredication.cpp

@@ -15,6 +15,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -82,8 +83,11 @@ STATISTIC(NumLoweredVPOps, "Number of folded vector predication operations");
/// \returns Whether the vector mask \p MaskVal has all lane bits set.
static bool isAllTrueMask(Value *MaskVal) {
-  auto *ConstVec = dyn_cast<ConstantVector>(MaskVal);
-  return ConstVec && ConstVec->isAllOnesValue();
+  if (Value *SplattedVal = getSplatValue(MaskVal))
+    if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
+      return ConstValue->isAllOnesValue();
+  return false;
}
/// \returns A non-excepting divisor constant for this type.
@@ -171,6 +175,10 @@ struct CachingVPExpander {
  Value *expandPredicationInReduction(IRBuilder<> &Builder,
                                      VPReductionIntrinsic &PI);

  /// \brief Lower this VP memory operation to a non-VP intrinsic.
  Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
                                            VPIntrinsic &VPI);

  /// \brief Query TTI and expand the vector predication in \p P accordingly.
  Value *expandPredication(VPIntrinsic &PI);
@@ -389,6 +397,53 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
  return Reduction;
}

Value *
CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
                                                      VPIntrinsic &VPI) {
  assert(VPI.canIgnoreVectorLengthParam());

  Value *MaskParam = VPI.getMaskParam();
  Value *PtrParam = VPI.getMemoryPointerParam();
  Value *DataParam = VPI.getMemoryDataParam();
  bool IsUnmasked = isAllTrueMask(MaskParam);

  MaybeAlign AlignOpt = VPI.getPointerAlignment();

  Value *NewMemoryInst = nullptr;
  switch (VPI.getIntrinsicID()) {
  default:
    llvm_unreachable("Not a VP memory intrinsic");
  case Intrinsic::vp_store:
    if (IsUnmasked) {
      StoreInst *NewStore =
          Builder.CreateStore(DataParam, PtrParam, /*IsVolatile*/ false);
      if (AlignOpt.hasValue())
        NewStore->setAlignment(AlignOpt.getValue());
      NewMemoryInst = NewStore;
    } else
      NewMemoryInst = Builder.CreateMaskedStore(
          DataParam, PtrParam, AlignOpt.valueOrOne(), MaskParam);
    break;
  case Intrinsic::vp_load:
    if (IsUnmasked) {
      LoadInst *NewLoad =
          Builder.CreateLoad(VPI.getType(), PtrParam, /*IsVolatile*/ false);
      if (AlignOpt.hasValue())
        NewLoad->setAlignment(AlignOpt.getValue());
      NewMemoryInst = NewLoad;
    } else
      NewMemoryInst = Builder.CreateMaskedLoad(
          VPI.getType(), PtrParam, AlignOpt.valueOrOne(), MaskParam);
    break;
  }

  assert(NewMemoryInst);
  replaceOperation(*NewMemoryInst, VPI);
  return NewMemoryInst;
}

void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
  LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n");
@@ -465,6 +520,14 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
  if (auto *VPRI = dyn_cast<VPReductionIntrinsic>(&VPI))
    return expandPredicationInReduction(Builder, *VPRI);

  switch (VPI.getIntrinsicID()) {
  default:
    break;
  case Intrinsic::vp_load:
  case Intrinsic::vp_store:
    return expandPredicationInMemoryIntrinsic(Builder, VPI);
  }

  return &VPI;
}
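The store side follows the same pattern; a sketch mirroring the new tests below (value names are illustrative):

  ; before
  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> <i1 true, i1 true>, i32 2)
  ; after (all-true mask, full vector length: plain store)
  store <2 x i64> %val, <2 x i64>* %ptr, align 16

  ; before
  call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 2)
  ; after (non-trivial mask: masked store)
  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, i32 1, <2 x i1> %m)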

New test: expansion of llvm.vp.load and llvm.vp.store for fixed-width and scalable vectors.

@@ -0,0 +1,205 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt --expandvp -S < %s | FileCheck %s
; RUN: opt --expandvp --expandvp-override-evl-transform=Legal --expandvp-override-mask-transform=Convert -S < %s | FileCheck %s
; Fixed vectors
define <2 x i64> @vpload_v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: @vpload_v2i64(
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> undef)
; CHECK-NEXT: ret <2 x i64> [[TMP3]]
;
%load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 %evl)
ret <2 x i64> %load
}
define <2 x i64> @vpload_v2i64_vlmax(<2 x i64>* %ptr, <2 x i1> %m) {
; CHECK-LABEL: @vpload_v2i64_vlmax(
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]], <2 x i64> undef)
; CHECK-NEXT: ret <2 x i64> [[TMP1]]
;
%load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> %m, i32 2)
ret <2 x i64> %load
}
define <2 x i64> @vpload_v2i64_allones_mask(<2 x i64>* %ptr, i32 zeroext %evl) {
; CHECK-LABEL: @vpload_v2i64_allones_mask(
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], <i1 true, i1 true>
; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]], <2 x i64> undef)
; CHECK-NEXT: ret <2 x i64> [[TMP3]]
;
%load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> <i1 1, i1 1>, i32 %evl)
ret <2 x i64> %load
}
define <2 x i64> @vpload_v2i64_allones_mask_vlmax(<2 x i64>* %ptr) {
; CHECK-LABEL: @vpload_v2i64_allones_mask_vlmax(
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[PTR:%.*]], align 16
; CHECK-NEXT: ret <2 x i64> [[TMP1]]
;
%load = call <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>* %ptr, <2 x i1> <i1 1, i1 1>, i32 2)
ret <2 x i64> %load
}
define void @vpstore_v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: @vpstore_v2i64(
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], [[M:%.*]]
; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]])
; CHECK-NEXT: ret void
;
call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 %evl)
ret void
}
define void @vpstore_v2i64_vlmax(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m) {
; CHECK-LABEL: @vpstore_v2i64_vlmax(
; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[M:%.*]])
; CHECK-NEXT: ret void
;
call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> %m, i32 2)
ret void
}
define void @vpstore_v2i64_allones_mask(<2 x i64> %val, <2 x i64>* %ptr, i32 zeroext %evl) {
; CHECK-LABEL: @vpstore_v2i64_allones_mask(
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EVL:%.*]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i32> <i32 0, i32 1>, [[DOTSPLAT]]
; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i1> [[TMP1]], <i1 true, i1 true>
; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], i32 1, <2 x i1> [[TMP2]])
; CHECK-NEXT: ret void
;
call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> <i1 1, i1 1>, i32 %evl)
ret void
}
define void @vpstore_v2i64_allones_mask_vlmax(<2 x i64> %val, <2 x i64>* %ptr) {
; CHECK-LABEL: @vpstore_v2i64_allones_mask_vlmax(
; CHECK-NEXT: store <2 x i64> [[VAL:%.*]], <2 x i64>* [[PTR:%.*]], align 16
; CHECK-NEXT: ret void
;
call void @llvm.vp.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %ptr, <2 x i1> <i1 1, i1 1>, i32 2)
ret void
}
; Scalable vectors
define <vscale x 1 x i64> @vpload_nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: @vpload_nxv1i64(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> undef)
; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
%load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %evl)
ret <vscale x 1 x i64> %load
}
define <vscale x 1 x i64> @vpload_nxv1i64_vscale(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m) {
; CHECK-LABEL: @vpload_nxv1i64_vscale(
; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]], <vscale x 1 x i64> undef)
; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP1]]
;
%vscale = call i32 @llvm.vscale.i32()
%vlmax = mul nuw i32 %vscale, 1
%load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
ret <vscale x 1 x i64> %load
}
define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask(<vscale x 1 x i64>* %ptr, i32 zeroext %evl) {
; CHECK-LABEL: @vpload_nxv1i64_allones_mask(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 1 x i64> @llvm.masked.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]], <vscale x 1 x i64> undef)
; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP3]]
;
%load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %evl)
ret <vscale x 1 x i64> %load
}
define <vscale x 1 x i64> @vpload_nxv1i64_allones_mask_vscale(<vscale x 1 x i64>* %ptr) {
; CHECK-LABEL: @vpload_nxv1i64_allones_mask_vscale(
; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
; CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 1 x i64>, <vscale x 1 x i64>* [[PTR:%.*]], align 8
; CHECK-NEXT: ret <vscale x 1 x i64> [[TMP1]]
;
%vscale = call i32 @llvm.vscale.i32()
%vlmax = mul nuw i32 %vscale, 1
%load = call <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %vlmax)
ret <vscale x 1 x i64> %load
}
define void @vpstore_nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: @vpstore_nxv1i64(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], [[M:%.*]]
; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]])
; CHECK-NEXT: ret void
;
call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %evl)
ret void
}
define void @vpstore_nxv1i64_vscale(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: @vpstore_nxv1i64_vscale(
; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[M:%.*]])
; CHECK-NEXT: ret void
;
%vscale = call i32 @llvm.vscale.i32()
%vlmax = mul nuw i32 %vscale, 1
call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> %m, i32 %vlmax)
ret void
}
define void @vpstore_nxv1i64_allones_mask(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, i32 zeroext %evl) {
; CHECK-LABEL: @vpstore_nxv1i64_allones_mask(
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 0, i32 [[EVL:%.*]])
; CHECK-NEXT: [[TMP2:%.*]] = and <vscale x 1 x i1> [[TMP1]], shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer)
; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[SCALABLE_SIZE:%.*]] = mul nuw i32 [[VSCALE]], 1
; CHECK-NEXT: call void @llvm.masked.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], i32 1, <vscale x 1 x i1> [[TMP2]])
; CHECK-NEXT: ret void
;
call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %evl)
ret void
}
define void @vpstore_nxv1i64_allones_mask_vscale(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr) {
; CHECK-LABEL: @vpstore_nxv1i64_allones_mask_vscale(
; CHECK-NEXT: [[VSCALE:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[VLMAX:%.*]] = mul nuw i32 [[VSCALE]], 1
; CHECK-NEXT: store <vscale x 1 x i64> [[VAL:%.*]], <vscale x 1 x i64>* [[PTR:%.*]], align 8
; CHECK-NEXT: ret void
;
%vscale = call i32 @llvm.vscale.i32()
%vlmax = mul nuw i32 %vscale, 1
call void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64> %val, <vscale x 1 x i64>* %ptr, <vscale x 1 x i1> shufflevector (<vscale x 1 x i1> insertelement (<vscale x 1 x i1> poison, i1 true, i32 0), <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer), i32 %vlmax)
ret void
}
declare i32 @llvm.vscale.i32()
declare <2 x i64> @llvm.vp.load.v2i64.p0v2i64(<2 x i64>*, <2 x i1>, i32)
declare void @llvm.vp.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, <2 x i1>, i32)
declare <vscale x 1 x i64> @llvm.vp.load.nxv1i64.p0nxv1i64(<vscale x 1 x i64>*, <vscale x 1 x i1>, i32)
declare void @llvm.vp.store.nxv1i64.p0nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>*, <vscale x 1 x i1>, i32)