[clang][AArch64][SVE] Avoid going through memory for fixed/scalable predicate casts

For fixed SVE types, predicates are represented using vectors of i8,
whereas for scalable types they are represented using vectors of i1. We
can avoid going through memory for casts between these by bitcasting the
i1 scalable vectors to/from a scalable i8 vector of matching size, which
can then use the existing vector insert/extract logic.

Differential Revision: https://reviews.llvm.org/D106860
Bradley Smith 2021-07-26 10:34:08 +00:00
parent 66d4430492
commit e57e1e4e00
7 changed files with 113 additions and 149 deletions
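
As context for the predicate casts described in the commit message, here is a minimal C sketch of the source-level conversions involved; it assumes arm_sve.h and 512-bit SVE vectors (e.g. -msve-vector-bits=512), and simply mirrors the to_svbool_t/from_svbool_t tests updated below rather than adding anything new:

#include <arm_sve.h>

/* Fixed-length counterpart of svbool_t; with arm_sve_vector_bits(512) the
   predicate is laid out as an <8 x i8> fixed vector in IR. */
typedef svbool_t fixed_bool_t __attribute__((arm_sve_vector_bits(512)));

/* Fixed (VLST) predicate to scalable (VLAT) predicate. */
svbool_t to_svbool_t(fixed_bool_t type) { return type; }

/* Scalable (VLAT) predicate to fixed (VLST) predicate. */
fixed_bool_t from_svbool_t(svbool_t type) { return type; }

With this change, such casts lower to llvm.experimental.vector.insert/extract on a <vscale x 2 x i8> value plus a bitcast to or from <vscale x 16 x i1>, instead of a store and load through a stack slot, as the updated CHECK lines below show.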

@@ -1271,12 +1271,26 @@ static llvm::Value *CreateCoercedLoad(Address Src, llvm::Type *Ty,
// perform the conversion.
if (auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(Ty)) {
if (auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
// If we are casting a fixed i8 vector to a scalable 16 x i1 predicate
// vector, use a vector insert and bitcast the result.
bool NeedsBitcast = false;
auto PredType =
llvm::ScalableVectorType::get(CGF.Builder.getInt1Ty(), 16);
llvm::Type *OrigType = Ty;
if (ScalableDst == PredType &&
FixedSrc->getElementType() == CGF.Builder.getInt8Ty()) {
ScalableDst = llvm::ScalableVectorType::get(CGF.Builder.getInt8Ty(), 2);
NeedsBitcast = true;
}
if (ScalableDst->getElementType() == FixedSrc->getElementType()) {
auto *Load = CGF.Builder.CreateLoad(Src);
auto *UndefVec = llvm::UndefValue::get(ScalableDst);
auto *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
return CGF.Builder.CreateInsertVector(ScalableDst, UndefVec, Load, Zero,
"castScalableSve");
llvm::Value *Result = CGF.Builder.CreateInsertVector(
ScalableDst, UndefVec, Load, Zero, "castScalableSve");
if (NeedsBitcast)
Result = CGF.Builder.CreateBitCast(Result, OrigType);
return Result;
}
}
}
@@ -2857,9 +2871,18 @@ void CodeGenFunction::EmitFunctionProlog(const CGFunctionInfo &FI,
// llvm.experimental.vector.extract to convert back to the original
// VLST.
if (auto *VecTyTo = dyn_cast<llvm::FixedVectorType>(ConvertType(Ty))) {
auto *Coerced = Fn->getArg(FirstIRArg);
llvm::Value *Coerced = Fn->getArg(FirstIRArg);
if (auto *VecTyFrom =
dyn_cast<llvm::ScalableVectorType>(Coerced->getType())) {
// If we are casting a scalable 16 x i1 predicate vector to a fixed i8
// vector, bitcast the source and use a vector extract.
auto PredType =
llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
if (VecTyFrom == PredType &&
VecTyTo->getElementType() == Builder.getInt8Ty()) {
VecTyFrom = llvm::ScalableVectorType::get(Builder.getInt8Ty(), 2);
Coerced = Builder.CreateBitCast(Coerced, VecTyFrom);
}
if (VecTyFrom->getElementType() == VecTyTo->getElementType()) {
llvm::Value *Zero = llvm::Constant::getNullValue(CGM.Int64Ty);

@@ -2063,11 +2063,25 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
// perform the bitcast.
if (const auto *FixedSrc = dyn_cast<llvm::FixedVectorType>(SrcTy)) {
if (const auto *ScalableDst = dyn_cast<llvm::ScalableVectorType>(DstTy)) {
// If we are casting a fixed i8 vector to a scalable 16 x i1 predicate
// vector, use a vector insert and bitcast the result.
bool NeedsBitCast = false;
auto PredType = llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
llvm::Type *OrigType = DstTy;
if (ScalableDst == PredType &&
FixedSrc->getElementType() == Builder.getInt8Ty()) {
DstTy = llvm::ScalableVectorType::get(Builder.getInt8Ty(), 2);
ScalableDst = dyn_cast<llvm::ScalableVectorType>(DstTy);
NeedsBitCast = true;
}
if (FixedSrc->getElementType() == ScalableDst->getElementType()) {
llvm::Value *UndefVec = llvm::UndefValue::get(DstTy);
llvm::Value *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
return Builder.CreateInsertVector(DstTy, UndefVec, Src, Zero,
"castScalableSve");
llvm::Value *Result = Builder.CreateInsertVector(
DstTy, UndefVec, Src, Zero, "castScalableSve");
if (NeedsBitCast)
Result = Builder.CreateBitCast(Result, OrigType);
return Result;
}
}
}
@@ -2077,6 +2091,15 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
// perform the bitcast.
if (const auto *ScalableSrc = dyn_cast<llvm::ScalableVectorType>(SrcTy)) {
if (const auto *FixedDst = dyn_cast<llvm::FixedVectorType>(DstTy)) {
// If we are casting a scalable 16 x i1 predicate vector to a fixed i8
// vector, bitcast the source and use a vector extract.
auto PredType = llvm::ScalableVectorType::get(Builder.getInt1Ty(), 16);
if (ScalableSrc == PredType &&
FixedDst->getElementType() == Builder.getInt8Ty()) {
SrcTy = llvm::ScalableVectorType::get(Builder.getInt8Ty(), 2);
ScalableSrc = dyn_cast<llvm::ScalableVectorType>(SrcTy);
Src = Builder.CreateBitCast(Src, SrcTy);
}
if (ScalableSrc->getElementType() == FixedDst->getElementType()) {
llvm::Value *Zero = llvm::Constant::getNullValue(CGF.CGM.Int64Ty);
return Builder.CreateExtractVector(DstTy, Src, Zero, "castFixedSve");
@@ -2087,10 +2110,9 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) {
// Perform VLAT <-> VLST bitcast through memory.
// TODO: since the llvm.experimental.vector.{insert,extract} intrinsics
// require the element types of the vectors to be the same, we
// need to keep this around for casting between predicates, or more
// generally for bitcasts between VLAT <-> VLST where the element
// types of the vectors are not the same, until we figure out a better
// way of doing these casts.
// need to keep this around for bitcasts between VLAT <-> VLST where
// the element types of the vectors are not the same, until we figure
// out a better way of doing these casts.
if ((isa<llvm::FixedVectorType>(SrcTy) &&
isa<llvm::ScalableVectorType>(DstTy)) ||
(isa<llvm::ScalableVectorType>(SrcTy) &&

@@ -191,32 +191,26 @@ void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) {
// CHECK-128-LABEL: @read_bool(
// CHECK-128-NEXT: entry:
// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <2 x i8>, align 2
// CHECK-128-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0
// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* [[ARRAYIDX]], align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: store <2 x i8> [[TMP0]], <2 x i8>* [[SAVED_VALUE]], align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <2 x i8>* [[SAVED_VALUE]] to <vscale x 16 x i1>*
// CHECK-128-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE]], align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> [[TMP0]], i64 0)
// CHECK-128-NEXT: [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CASTFIXEDSVE]] to <vscale x 16 x i1>
// CHECK-128-NEXT: ret <vscale x 16 x i1> [[TMP1]]
//
// CHECK-256-LABEL: @read_bool(
// CHECK-256-NEXT: entry:
// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <4 x i8>, align 4
// CHECK-256-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0
// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* [[ARRAYIDX]], align 2, !tbaa [[TBAA6]]
// CHECK-256-NEXT: store <4 x i8> [[TMP0]], <4 x i8>* [[SAVED_VALUE]], align 4, !tbaa [[TBAA6]]
// CHECK-256-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <4 x i8>* [[SAVED_VALUE]] to <vscale x 16 x i1>*
// CHECK-256-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE]], align 4, !tbaa [[TBAA6]]
// CHECK-256-NEXT: [[CASTFIXEDSVE:%.*]] = call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> [[TMP0]], i64 0)
// CHECK-256-NEXT: [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CASTFIXEDSVE]] to <vscale x 16 x i1>
// CHECK-256-NEXT: ret <vscale x 16 x i1> [[TMP1]]
//
// CHECK-512-LABEL: @read_bool(
// CHECK-512-NEXT: entry:
// CHECK-512-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
// CHECK-512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0
// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 2, !tbaa [[TBAA6]]
// CHECK-512-NEXT: store <8 x i8> [[TMP0]], <8 x i8>* [[SAVED_VALUE]], align 8, !tbaa [[TBAA6]]
// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <8 x i8>* [[SAVED_VALUE]] to <vscale x 16 x i1>*
// CHECK-512-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE]], align 8, !tbaa [[TBAA6]]
// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[TMP0]], i64 0)
// CHECK-512-NEXT: [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CASTFIXEDSVE]] to <vscale x 16 x i1>
// CHECK-512-NEXT: ret <vscale x 16 x i1> [[TMP1]]
//
svbool_t read_bool(struct struct_bool *s) {
@@ -225,32 +219,26 @@ svbool_t read_bool(struct struct_bool *s) {
// CHECK-128-LABEL: @write_bool(
// CHECK-128-NEXT: entry:
// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 16 x i1>, align 2
// CHECK-128-NEXT: store <vscale x 16 x i1> [[X:%.*]], <vscale x 16 x i1>* [[SAVED_VALUE]], align 2, !tbaa [[TBAA9:![0-9]+]]
// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_VALUE]] to <2 x i8>*
// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* [[CASTFIXEDSVE]], align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i1> %x to <vscale x 2 x i8>
// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = call <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> [[TMP0]], i64 0)
// CHECK-128-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0
// CHECK-128-NEXT: store <2 x i8> [[TMP0]], <2 x i8>* [[ARRAYIDX]], align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: store <2 x i8> [[CASTFIXEDSVE]], <2 x i8>* [[ARRAYIDX]], align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: ret void
//
// CHECK-256-LABEL: @write_bool(
// CHECK-256-NEXT: entry:
// CHECK-256-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 16 x i1>, align 4
// CHECK-256-NEXT: store <vscale x 16 x i1> [[X:%.*]], <vscale x 16 x i1>* [[SAVED_VALUE]], align 4, !tbaa [[TBAA9:![0-9]+]]
// CHECK-256-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_VALUE]] to <4 x i8>*
// CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* [[CASTFIXEDSVE]], align 4, !tbaa [[TBAA6]]
// CHECK-256-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i1> %x to <vscale x 2 x i8>
// CHECK-256-NEXT: [[CASTFIXEDSVE:%.*]] = call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> [[TMP0]], i64 0)
// CHECK-256-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0
// CHECK-256-NEXT: store <4 x i8> [[TMP0]], <4 x i8>* [[ARRAYIDX]], align 2, !tbaa [[TBAA6]]
// CHECK-256-NEXT: store <4 x i8> [[CASTFIXEDSVE]], <4 x i8>* [[ARRAYIDX]], align 2, !tbaa [[TBAA6]]
// CHECK-256-NEXT: ret void
//
// CHECK-512-LABEL: @write_bool(
// CHECK-512-NEXT: entry:
// CHECK-512-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 16 x i1>, align 8
// CHECK-512-NEXT: store <vscale x 16 x i1> [[X:%.*]], <vscale x 16 x i1>* [[SAVED_VALUE]], align 8, !tbaa [[TBAA9:![0-9]+]]
// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_VALUE]] to <8 x i8>*
// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* [[CASTFIXEDSVE]], align 8, !tbaa [[TBAA6]]
// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i1> %x to <vscale x 2 x i8>
// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> [[TMP0]], i64 0)
// CHECK-512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], %struct.struct_bool* [[S:%.*]], i64 0, i32 1, i64 0
// CHECK-512-NEXT: store <8 x i8> [[TMP0]], <8 x i8>* [[ARRAYIDX]], align 2, !tbaa [[TBAA6]]
// CHECK-512-NEXT: store <8 x i8> [[CASTFIXEDSVE]], <8 x i8>* [[ARRAYIDX]], align 2, !tbaa [[TBAA6]]
// CHECK-512-NEXT: ret void
//
void write_bool(struct struct_bool *s, svbool_t x) {

@@ -77,32 +77,8 @@ fixed_float64_t call_float64_ff(svbool_t pg, fixed_float64_t op1, fixed_float64_
// CHECK-LABEL: @call_bool_ff(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OP1:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[OP2:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[SAVED_VALUE3:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[SAVED_VALUE5:%.*]] = alloca <vscale x 16 x i1>, align 8
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 16 x i1>, align 8
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8>* [[OP1]] to <vscale x 16 x i1>*
// CHECK-NEXT: store <vscale x 16 x i1> [[OP1_COERCE:%.*]], <vscale x 16 x i1>* [[TMP0]], align 8
// CHECK-NEXT: [[OP11:%.*]] = load <8 x i8>, <8 x i8>* [[OP1]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8>* [[OP2]] to <vscale x 16 x i1>*
// CHECK-NEXT: store <vscale x 16 x i1> [[OP2_COERCE:%.*]], <vscale x 16 x i1>* [[TMP1]], align 8
// CHECK-NEXT: [[OP22:%.*]] = load <8 x i8>, <8 x i8>* [[OP2]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: store <8 x i8> [[OP11]], <8 x i8>* [[SAVED_VALUE]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <8 x i8>* [[SAVED_VALUE]] to <vscale x 16 x i1>*
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: store <8 x i8> [[OP22]], <8 x i8>* [[SAVED_VALUE3]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[CASTFIXEDSVE4:%.*]] = bitcast <8 x i8>* [[SAVED_VALUE3]] to <vscale x 16 x i1>*
// CHECK-NEXT: [[TMP3:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE4]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.sel.nxv16i1(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP3]])
// CHECK-NEXT: store <vscale x 16 x i1> [[TMP4]], <vscale x 16 x i1>* [[SAVED_VALUE5]], align 8, !tbaa [[TBAA9:![0-9]+]]
// CHECK-NEXT: [[CASTFIXEDSVE6:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_VALUE5]] to <8 x i8>*
// CHECK-NEXT: [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[CASTFIXEDSVE6]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 16 x i1>* [[RETVAL_COERCE]] to <8 x i8>*
// CHECK-NEXT: store <8 x i8> [[TMP5]], <8 x i8>* [[RETVAL_0__SROA_CAST]], align 8
// CHECK-NEXT: [[TMP6:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[RETVAL_COERCE]], align 8
// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP6]]
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.sel.nxv16i1(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i1> [[OP1_COERCE:%.*]], <vscale x 16 x i1> [[OP2_COERCE:%.*]])
// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
//
fixed_bool_t call_bool_ff(svbool_t pg, fixed_bool_t op1, fixed_bool_t op2) {
return svsel(pg, op1, op2);
@@ -134,24 +110,8 @@ fixed_float64_t call_float64_fs(svbool_t pg, fixed_float64_t op1, svfloat64_t op
// CHECK-LABEL: @call_bool_fs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OP1:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[SAVED_VALUE2:%.*]] = alloca <vscale x 16 x i1>, align 8
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 16 x i1>, align 8
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8>* [[OP1]] to <vscale x 16 x i1>*
// CHECK-NEXT: store <vscale x 16 x i1> [[OP1_COERCE:%.*]], <vscale x 16 x i1>* [[TMP0]], align 8
// CHECK-NEXT: [[OP11:%.*]] = load <8 x i8>, <8 x i8>* [[OP1]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: store <8 x i8> [[OP11]], <8 x i8>* [[SAVED_VALUE]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <8 x i8>* [[SAVED_VALUE]] to <vscale x 16 x i1>*
// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.sel.nxv16i1(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i1> [[TMP1]], <vscale x 16 x i1> [[OP2:%.*]])
// CHECK-NEXT: store <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1>* [[SAVED_VALUE2]], align 8, !tbaa [[TBAA9]]
// CHECK-NEXT: [[CASTFIXEDSVE3:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_VALUE2]] to <8 x i8>*
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[CASTFIXEDSVE3]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 16 x i1>* [[RETVAL_COERCE]] to <8 x i8>*
// CHECK-NEXT: store <8 x i8> [[TMP3]], <8 x i8>* [[RETVAL_0__SROA_CAST]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[RETVAL_COERCE]], align 8
// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP4]]
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.sel.nxv16i1(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i1> [[OP1_COERCE:%.*]], <vscale x 16 x i1> [[OP2_COERCE:%.*]])
// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
//
fixed_bool_t call_bool_fs(svbool_t pg, fixed_bool_t op1, svbool_t op2) {
return svsel(pg, op1, op2);
@@ -183,16 +143,8 @@ fixed_float64_t call_float64_ss(svbool_t pg, svfloat64_t op1, svfloat64_t op2) {
// CHECK-LABEL: @call_bool_ss(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 16 x i1>, align 8
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 16 x i1>, align 8
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.sel.nxv16i1(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i1> [[OP1:%.*]], <vscale x 16 x i1> [[OP2:%.*]])
// CHECK-NEXT: store <vscale x 16 x i1> [[TMP0]], <vscale x 16 x i1>* [[SAVED_VALUE]], align 8, !tbaa [[TBAA9]]
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_VALUE]] to <8 x i8>*
// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[CASTFIXEDSVE]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 16 x i1>* [[RETVAL_COERCE]] to <8 x i8>*
// CHECK-NEXT: store <8 x i8> [[TMP1]], <8 x i8>* [[RETVAL_0__SROA_CAST]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[RETVAL_COERCE]], align 8
// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP2]]
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.sel.nxv16i1(<vscale x 16 x i1> [[PG:%.*]], <vscale x 16 x i1> [[OP1_COERCE:%.*]], <vscale x 16 x i1> [[OP2_COERCE:%.*]])
// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP0]]
//
fixed_bool_t call_bool_ss(svbool_t pg, svbool_t op1, svbool_t op2) {
return svsel(pg, op1, op2);

@@ -45,15 +45,7 @@ fixed_float64_t from_svfloat64_t(svfloat64_t type) {
// CHECK-LABEL: @to_svbool_t(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TYPE:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8>* [[TYPE]] to <vscale x 16 x i1>*
// CHECK-NEXT: store <vscale x 16 x i1> [[TYPE_COERCE:%.*]], <vscale x 16 x i1>* [[TMP0]], align 8
// CHECK-NEXT: [[TYPE1:%.*]] = load <8 x i8>, <8 x i8>* [[TYPE]], align 8, !tbaa [[TBAA6:![0-9]+]]
// CHECK-NEXT: store <8 x i8> [[TYPE1]], <8 x i8>* [[SAVED_VALUE]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <8 x i8>* [[SAVED_VALUE]] to <vscale x 16 x i1>*
// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP1]]
// CHECK-NEXT: ret <vscale x 16 x i1> [[TYPE:%.*]]
//
svbool_t to_svbool_t(fixed_bool_t type) {
return type;
@@ -61,23 +53,28 @@ svbool_t to_svbool_t(fixed_bool_t type) {
// CHECK-LABEL: @from_svbool_t(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 16 x i1>, align 8
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 16 x i1>, align 8
// CHECK-NEXT: store <vscale x 16 x i1> [[TYPE:%.*]], <vscale x 16 x i1>* [[SAVED_VALUE]], align 8, !tbaa [[TBAA9:![0-9]+]]
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_VALUE]] to <8 x i8>*
// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* [[CASTFIXEDSVE]], align 8, !tbaa [[TBAA6]]
// CHECK-NEXT: [[RETVAL_0__SROA_CAST:%.*]] = bitcast <vscale x 16 x i1>* [[RETVAL_COERCE]] to <8 x i8>*
// CHECK-NEXT: store <8 x i8> [[TMP0]], <8 x i8>* [[RETVAL_0__SROA_CAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[RETVAL_COERCE]], align 8
// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP1]]
// CHECK-NEXT: ret <vscale x 16 x i1> [[TYPE:%.*]]
//
fixed_bool_t from_svbool_t(svbool_t type) {
return type;
}
// CHECK-LABEL: @lax_cast(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = alloca <16 x i32>, align 64
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <16 x i32> @llvm.experimental.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> [[TYPE_COERCE:%.*]], i64 0)
// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], <16 x i32>* [[TMP0:%.*]], align 64, !tbaa [[TBAA6:![0-9]+]]
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32>* [[TMP0]] to <vscale x 2 x i64>*
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 2 x i64>, <vscale x 2 x i64>* [[TMP1]], align 64, !tbaa [[TBAA6]]
// CHECK-NEXT: ret <vscale x 2 x i64> [[TMP2]]
//
svint64_t lax_cast(fixed_int32_t type) {
return type;
}
// CHECK-LABEL: @to_svint32_t__from_gnu_int32_t(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0:%.*]], align 16, !tbaa [[TBAA6]]
// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, <16 x i32>* [[TMP0:%.*]], align 16, !tbaa [[TBAA6:![0-9]+]]
// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> undef, <16 x i32> [[TYPE]], i64 0)
// CHECK-NEXT: ret <vscale x 4 x i32> [[CASTSCALABLESVE]]
//

@@ -18,19 +18,15 @@ fixed_int32_t global_vec;
// CHECK-NEXT: [[PRED_ADDR:%.*]] = alloca <vscale x 16 x i1>, align 2
// CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca <vscale x 4 x i32>, align 16
// CHECK-NEXT: [[PG:%.*]] = alloca <vscale x 16 x i1>, align 2
// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[SAVED_VALUE1:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: store <vscale x 16 x i1> [[PRED:%.*]], <vscale x 16 x i1>* [[PRED_ADDR]], align 2
// CHECK-NEXT: store <vscale x 4 x i32> [[VEC:%.*]], <vscale x 4 x i32>* [[VEC_ADDR]], align 16
// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[PRED_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* @global_pred, align 2
// CHECK-NEXT: store <8 x i8> [[TMP1]], <8 x i8>* [[SAVED_VALUE]], align 8
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <8 x i8>* [[SAVED_VALUE]] to <vscale x 16 x i1>*
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE]], align 8
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[TMP1]], i64 0)
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i8> [[CASTFIXEDSVE]] to <vscale x 16 x i1>
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* @global_pred, align 2
// CHECK-NEXT: store <8 x i8> [[TMP3]], <8 x i8>* [[SAVED_VALUE1]], align 8
// CHECK-NEXT: [[CASTFIXEDSVE2:%.*]] = bitcast <8 x i8>* [[SAVED_VALUE1]] to <vscale x 16 x i1>*
// CHECK-NEXT: [[TMP4:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE2]], align 8
// CHECK-NEXT: [[CASTFIXEDSVE2:%.*]] = call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[TMP3]], i64 0)
// CHECK-NEXT: [[TMP4:%.*]] = bitcast <vscale x 2 x i8> [[CASTFIXEDSVE2]] to <vscale x 16 x i1>
// CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> [[TMP0]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP4]])
// CHECK-NEXT: store <vscale x 16 x i1> [[TMP5]], <vscale x 16 x i1>* [[PG]], align 2
// CHECK-NEXT: [[TMP6:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[PG]], align 2
@@ -92,17 +88,15 @@ fixed_int32_t array_arg(fixed_int32_t arr[]) {
// CHECK-NEXT: [[RETVAL:%.*]] = alloca <8 x i8>, align 2
// CHECK-NEXT: [[ARR:%.*]] = alloca [3 x <8 x i8>], align 2
// CHECK-NEXT: [[PARR:%.*]] = alloca <8 x i8>*, align 8
// CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca <vscale x 16 x i1>, align 2
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[ARR]], i64 0, i64 0
// CHECK-NEXT: store <8 x i8>* [[ARRAYIDX]], <8 x i8>** [[PARR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>*, <8 x i8>** [[PARR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 2
// CHECK-NEXT: store <8 x i8> [[TMP1]], <8 x i8>* [[RETVAL]], align 2
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 16 x i1>* [[RETVAL_COERCE]] to i8*
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i8>* [[RETVAL]] to i8*
// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 2 [[TMP2]], i8* align 2 [[TMP3]], i64 8, i1 false)
// CHECK-NEXT: [[TMP4:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[RETVAL_COERCE]], align 2
// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP4]]
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[RETVAL]], align 2
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[TMP2]], i64 0)
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <vscale x 2 x i8> [[CASTFIXEDSVE]] to <vscale x 16 x i1>
// CHECK-NEXT: ret <vscale x 16 x i1> [[TMP3]]
//
fixed_bool_t address_of_array_idx() {
fixed_bool_t arr[3];
@@ -119,23 +113,19 @@ fixed_bool_t address_of_array_idx() {
// CHECK-NEXT: [[XX:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[YY:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[PG:%.*]] = alloca <vscale x 16 x i1>, align 2
// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: [[SAVED_VALUE1:%.*]] = alloca <8 x i8>, align 8
// CHECK-NEXT: store <vscale x 16 x i1> [[PRED:%.*]], <vscale x 16 x i1>* [[PRED_ADDR]], align 2
// CHECK-NEXT: store <vscale x 4 x i32> [[VEC:%.*]], <vscale x 4 x i32>* [[VEC_ADDR]], align 16
// CHECK-NEXT: store <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 0, i8 0, i8 0, i8 0>, <8 x i8>* [[XX]], align 8
// CHECK-NEXT: store <8 x i8> <i8 2, i8 5, i8 4, i8 6, i8 0, i8 0, i8 0, i8 0>, <8 x i8>* [[YY]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[PRED_ADDR]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* @global_pred, align 2
// CHECK-NEXT: store <8 x i8> [[TMP1]], <8 x i8>* [[SAVED_VALUE]], align 8
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <8 x i8>* [[SAVED_VALUE]] to <vscale x 16 x i1>*
// CHECK-NEXT: [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE]], align 8
// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[TMP1]], i64 0)
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <vscale x 2 x i8> [[CASTFIXEDSVE]] to <vscale x 16 x i1>
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[XX]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[YY]], align 8
// CHECK-NEXT: [[ADD:%.*]] = add <8 x i8> [[TMP3]], [[TMP4]]
// CHECK-NEXT: store <8 x i8> [[ADD]], <8 x i8>* [[SAVED_VALUE1]], align 8
// CHECK-NEXT: [[CASTFIXEDSVE2:%.*]] = bitcast <8 x i8>* [[SAVED_VALUE1]] to <vscale x 16 x i1>*
// CHECK-NEXT: [[TMP5:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE2]], align 8
// CHECK-NEXT: [[CASTFIXEDSVE2:%.*]] = call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[ADD]], i64 0)
// CHECK-NEXT: [[TMP5:%.*]] = bitcast <vscale x 2 x i8> [[CASTFIXEDSVE2]] to <vscale x 16 x i1>
// CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> [[TMP0]], <vscale x 16 x i1> [[TMP2]], <vscale x 16 x i1> [[TMP5]])
// CHECK-NEXT: store <vscale x 16 x i1> [[TMP6]], <vscale x 16 x i1>* [[PG]], align 2
// CHECK-NEXT: [[TMP7:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[PG]], align 2

@@ -49,20 +49,16 @@ void write_global_bf16(svbfloat16_t v) { global_bf16 = v; }
// CHECK-128-LABEL: @write_global_bool(
// CHECK-128-NEXT: entry:
// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 16 x i1>, align 2
// CHECK-128-NEXT: store <vscale x 16 x i1> [[V:%.*]], <vscale x 16 x i1>* [[SAVED_VALUE]], align 2, !tbaa [[TBAA9:![0-9]+]]
// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_VALUE]] to <2 x i8>*
// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* [[CASTFIXEDSVE]], align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: store <2 x i8> [[TMP0]], <2 x i8>* @global_bool, align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i1> [[V:%.*]] to <vscale x 2 x i8>
// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = call <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> [[TMP0]], i64 0)
// CHECK-128-NEXT: store <2 x i8> [[CASTFIXEDSVE]], <2 x i8>* @global_bool, align 2, !tbaa [[TBAA6:![0-9]+]]
// CHECK-128-NEXT: ret void
//
// CHECK-512-LABEL: @write_global_bool(
// CHECK-512-NEXT: entry:
// CHECK-512-NEXT: [[SAVED_VALUE:%.*]] = alloca <vscale x 16 x i1>, align 8
// CHECK-512-NEXT: store <vscale x 16 x i1> [[V:%.*]], <vscale x 16 x i1>* [[SAVED_VALUE]], align 8, !tbaa [[TBAA9:![0-9]+]]
// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <vscale x 16 x i1>* [[SAVED_VALUE]] to <8 x i8>*
// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* [[CASTFIXEDSVE]], align 8, !tbaa [[TBAA6]]
// CHECK-512-NEXT: store <8 x i8> [[TMP0]], <8 x i8>* @global_bool, align 2, !tbaa [[TBAA6]]
// CHECK-512-NEXT: [[TMP0:%.*]] = bitcast <vscale x 16 x i1> [[V:%.*]] to <vscale x 2 x i8>
// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = call <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> [[TMP0]], i64 0)
// CHECK-512-NEXT: store <8 x i8> [[CASTFIXEDSVE]], <8 x i8>* @global_bool, align 2, !tbaa [[TBAA6]]
// CHECK-512-NEXT: ret void
//
void write_global_bool(svbool_t v) { global_bool = v; }
@@ -101,20 +97,16 @@ svbfloat16_t read_global_bf16() { return global_bf16; }
// CHECK-128-LABEL: @read_global_bool(
// CHECK-128-NEXT: entry:
// CHECK-128-NEXT: [[SAVED_VALUE:%.*]] = alloca <2 x i8>, align 2
// CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* @global_bool, align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: store <2 x i8> [[TMP0]], <2 x i8>* [[SAVED_VALUE]], align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <2 x i8>* [[SAVED_VALUE]] to <vscale x 16 x i1>*
// CHECK-128-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE]], align 2, !tbaa [[TBAA6]]
// CHECK-128-NEXT: [[CASTFIXEDSVE:%.*]] = call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> [[TMP0]], i64 0)
// CHECK-128-NEXT: [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CASTFIXEDSVE]] to <vscale x 16 x i1>
// CHECK-128-NEXT: ret <vscale x 16 x i1> [[TMP1]]
//
// CHECK-512-LABEL: @read_global_bool(
// CHECK-512-NEXT: entry:
// CHECK-512-NEXT: [[SAVED_VALUE:%.*]] = alloca <8 x i8>, align 8
// CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* @global_bool, align 2, !tbaa [[TBAA6]]
// CHECK-512-NEXT: store <8 x i8> [[TMP0]], <8 x i8>* [[SAVED_VALUE]], align 8, !tbaa [[TBAA6]]
// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = bitcast <8 x i8>* [[SAVED_VALUE]] to <vscale x 16 x i1>*
// CHECK-512-NEXT: [[TMP1:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[CASTFIXEDSVE]], align 8, !tbaa [[TBAA6]]
// CHECK-512-NEXT: [[CASTFIXEDSVE:%.*]] = call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> [[TMP0]], i64 0)
// CHECK-512-NEXT: [[TMP1:%.*]] = bitcast <vscale x 2 x i8> [[CASTFIXEDSVE]] to <vscale x 16 x i1>
// CHECK-512-NEXT: ret <vscale x 16 x i1> [[TMP1]]
//
svbool_t read_global_bool() { return global_bool; }