[ARM] Tune getCastInstrCost for extending masked loads and truncating masked stores

This patch uses the feature added in D79162 to fix the cost of a
sext/zext of a masked load, or a trunc for a masked store.
Previously, those were considered cheap or even free, but that is not
the case: we currently cannot split masked loads/stores in the same way
we would for normal loads/stores.

This updates the costs to better reflect reality, and adds a test for it
in test/Analysis/CostModel/ARM/cast.ll.

It also adds a vectorizer test that showcases the improvement: in some
cases, the vectorizer will now choose a smaller VF when
tail-predication is enabled, which results in better codegen. (If it
were to use a higher VF in those cases, the extending masked loads
would not be split and would likely be expanded element by element; the
resulting vmovs would block tail-predication later in the process,
resulting in very poor codegen overall.)

Original Patch by Pierre van Houtryve

Differential Revision: https://reviews.llvm.org/D79163
This commit is contained in:
David Green 2020-07-29 13:41:34 +01:00
parent 60280e9818
commit 9ddb28964c
3 changed files with 207 additions and 85 deletions

View File

@ -313,6 +313,12 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Cost == 0 ? 0 : 1;
return Cost;
};
// Predicate on a value type: looks only at the scalar (element) type of VT and
// returns true when it is f32, f64 or f16 and the subtarget reports the
// matching feature (hasVFP2Base, hasFP64 or hasFullFP16 respectively). Any
// other element type yields false.
auto IsLegalFPType = [this](EVT VT) {
EVT EltVT = VT.getScalarType();
return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
(EltVT == MVT::f64 && ST->hasFP64()) ||
(EltVT == MVT::f16 && ST->hasFullFP16());
};
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
@ -321,8 +327,21 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return AdjustCost(
BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
// The extend of a load is free
if (I && isa<LoadInst>(I->getOperand(0))) {
// Extending masked load/Truncating masked stores is expensive because we
// currently don't split them. This means that we'll likely end up
// loading/storing each element individually (hence the high cost).
if ((ST->hasMVEIntegerOps() &&
(Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
Opcode == Instruction::SExt)) ||
(ST->hasMVEFloatOps() &&
(Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor();
// The extend of other kinds of load is free
if (CCH == TTI::CastContextHint::Normal ||
CCH == TTI::CastContextHint::Masked) {
static const TypeConversionCostTblEntry LoadConversionTbl[] = {
{ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
{ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
@ -376,11 +395,9 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
}
// The truncate of a store is free. This is the mirror of extends above.
if (I && I->hasOneUse() && isa<StoreInst>(*I->user_begin())) {
static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
{ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
{ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
{ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
@ -390,18 +407,18 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
};
if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
if (const auto *Entry =
ConvertCostTableLookup(MVELoadConversionTbl, ISD,
ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
{ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
{ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
};
if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
if (const auto *Entry =
ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
@ -638,14 +655,8 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
int Lanes = 1;
if (SrcTy.isFixedLengthVector())
Lanes = SrcTy.getVectorNumElements();
auto IsLegal = [this](EVT VT) {
EVT EltVT = VT.getScalarType();
return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
(EltVT == MVT::f64 && ST->hasFP64()) ||
(EltVT == MVT::f16 && ST->hasFullFP16());
};
if (IsLegal(SrcTy) && IsLegal(DstTy))
if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
return Lanes;
else
return Lanes * CallCost;

View File

@ -1852,38 +1852,38 @@ define i32 @maskedload_extends() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2864u = zext <2 x i8> %loadv2i8 to <2 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4816s = sext <4 x i8> %loadv4i8 to <4 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4816u = zext <4 x i8> %loadv4i8 to <4 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 298 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-V8M-MAIN-RECIP-LABEL: 'maskedload_extends'
@ -1922,15 +1922,15 @@ define i32 @maskedload_extends() {
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64>
; CHECK-V8M-MAIN-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64>
@ -1974,15 +1974,15 @@ define i32 @maskedload_extends() {
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64>
; CHECK-V8M-BASE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64>
@ -2060,38 +2060,38 @@ define i32 @maskedload_extends() {
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2864u = zext <2 x i8> %loadv2i8 to <2 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4816s = sext <4 x i8> %loadv4i8 to <4 x i16>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4816u = zext <4 x i8> %loadv4i8 to <4 x i16>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
;
; CHECK-V8M-MAIN-SIZE-LABEL: 'maskedload_extends'
@ -2361,7 +2361,7 @@ define i32 @maskedstore_trunc() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8832 = trunc <8 x i32> undef to <8 x i8>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v8864 = trunc <8 x i64> undef to <8 x i8>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16816 = trunc <16 x i16> undef to <16 x i8>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v16832 = trunc <16 x i32> undef to <16 x i8>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16832 = trunc <16 x i32> undef to <16 x i8>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %v16864 = trunc <16 x i64> undef to <16 x i8>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v21632 = trunc <2 x i32> undef to <2 x i16>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v21664 = trunc <2 x i64> undef to <2 x i16>
@ -2774,9 +2774,9 @@ define i32 @maskedload_fpextends() {
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 4, <4 x i1> undef, <4 x float> undef)
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 4, <8 x i1> undef, <8 x float> undef)
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v7 = fpext <8 x half> %loadv8f16 to <8 x double>
@ -2866,9 +2866,9 @@ define i32 @maskedload_fpextends() {
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 4, <4 x i1> undef, <4 x float> undef)
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 4, <8 x i1> undef, <8 x float> undef)
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v7 = fpext <8 x half> %loadv8f16 to <8 x double>
@ -2998,9 +2998,9 @@ define i32 @maskedload_fptrunc() {
; CHECK-MVE-RECIP-LABEL: 'maskedload_fptrunc'
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float>
; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float>
@ -3074,9 +3074,9 @@ define i32 @maskedload_fptrunc() {
; CHECK-MVE-SIZE-LABEL: 'maskedload_fptrunc'
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float>
; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float>

View File

@ -0,0 +1,111 @@
; This test runs the loop vectorizer twice over the same input for an MVE
; target: once with default options and once with
; -prefer-predicate-over-epilog. Each run is verified with its own FileCheck
; prefix so the two results can be checked independently.
; RUN: opt < %s -mattr=+mve,+mve.fp -loop-vectorize -S | FileCheck %s --check-prefixes=DEFAULT
; RUN: opt < %s -mattr=+mve,+mve.fp -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=TAILPRED
target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-none-eabi"
; When tail-predication (TP) is disabled, this test can vectorize with a VF of 16.
; When tail-predication is enabled, this test should vectorize with a VF of 8.
;
; DEFAULT: load <16 x i8>, <16 x i8>*
; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16>
; DEFAULT: add <16 x i16>
; DEFAULT-NOT: llvm.masked.load
; DEFAULT-NOT: llvm.masked.store
;
; TAILPRED: llvm.masked.load.v8i8.p0v8i8
; TAILPRED: sext <8 x i8> %{{.*}} to <8 x i16>
; TAILPRED: add <8 x i16>
; TAILPRED: call void @llvm.masked.store.v8i8.p0v8i8
; TAILPRED-NOT: load <16 x i8>, <16 x i8>*
; Input loop nest for the vectorizer runs of this test.
;
; Arguments:
;   %0     - i8* used as the base of the stores in the innermost loop.
;   %1     - i32 trip count of the outermost loop (nothing runs when it is 0).
;   %input - i8**; the i8* loaded from it is the base of all loads in the
;            innermost loop.
; Returns: always i32 0.
;
; Structure: three nested loops. The innermost loop (block 7) reads nine i8
; values around index %8, combines them in i32 arithmetic and stores a single
; i8 result. NOTE(review): presumably this innermost loop is the one the loop
; vectorizer is expected to transform - confirm against the check lines.
define i32 @tp_reduces_vf(i8* nocapture %0, i32 %1, i8** %input) {
; Load the source base pointer once, before any loop.
%3 = load i8*, i8** %input, align 8
; Sign-extended copy of %1, used as the bound of the i64 outer counter.
%4 = sext i32 %1 to i64
; Skip the whole loop nest when %1 is zero.
%5 = icmp eq i32 %1, 0
br i1 %5, label %._crit_edge, label %.preheader47.preheader
.preheader47.preheader:
br label %.preheader47
; Outermost loop header: i64 counter starting at 0, advanced in block 53.
.preheader47:
%.050 = phi i64 [ %54, %53 ], [ 0, %.preheader47.preheader ]
br label %.preheader
._crit_edge.loopexit:
br label %._crit_edge
; Common exit block.
._crit_edge:
ret i32 0
; Middle loop header: counter starts at 1 and is advanced in block 52.
.preheader:
%indvars.iv51 = phi i32 [ 1, %.preheader47 ], [ %indvars.iv.next52, %52 ]
; Base index contributed by the middle loop counter (counter * 320).
%6 = mul nuw nsw i32 %indvars.iv51, 320
br label %7
; Innermost loop: counter starts at 1 and is advanced at the end of this block.
7:
%indvars.iv = phi i32 [ 1, %.preheader ], [ %indvars.iv.next, %7 ]
; %8 is the central index: middle counter * 320 + inner counter.
%8 = add nuw nsw i32 %6, %indvars.iv
; Nine i8 loads from the source pointer at %8 plus the offsets
; -321, -320, -319, -1, 0, +1, +319, +320 and +321, each sign-extended to i32.
; (These offsets have the shape of a 3x3 neighbourhood with a stride of 320.)
%9 = add nsw i32 %8, -320
%10 = add nsw i32 %8, -321
%11 = getelementptr inbounds i8, i8* %3, i32 %10
%12 = load i8, i8* %11, align 1
%13 = sext i8 %12 to i32
%14 = getelementptr inbounds i8, i8* %3, i32 %9
%15 = load i8, i8* %14, align 1
%16 = sext i8 %15 to i32
%17 = add nsw i32 %8, -319
%18 = getelementptr inbounds i8, i8* %3, i32 %17
%19 = load i8, i8* %18, align 1
%20 = sext i8 %19 to i32
%21 = add nsw i32 %8, -1
%22 = getelementptr inbounds i8, i8* %3, i32 %21
%23 = load i8, i8* %22, align 1
%24 = sext i8 %23 to i32
; Value at the central index itself, weighted by 255.
%25 = getelementptr inbounds i8, i8* %3, i32 %8
%26 = load i8, i8* %25, align 1
%27 = sext i8 %26 to i32
%28 = mul nsw i32 %27, 255
%29 = add nuw nsw i32 %8, 1
%30 = getelementptr inbounds i8, i8* %3, i32 %29
%31 = load i8, i8* %30, align 1
%32 = sext i8 %31 to i32
%33 = add nuw nsw i32 %8, 320
%34 = add nuw nsw i32 %8, 319
%35 = getelementptr inbounds i8, i8* %3, i32 %34
%36 = load i8, i8* %35, align 1
%37 = sext i8 %36 to i32
%38 = getelementptr inbounds i8, i8* %3, i32 %33
%39 = load i8, i8* %38, align 1
%40 = sext i8 %39 to i32
%41 = add nuw nsw i32 %8, 321
%42 = getelementptr inbounds i8, i8* %3, i32 %41
%43 = load i8, i8* %42, align 1
%44 = sext i8 %43 to i32
; Sum the eight non-central values, ...
%reass.add = add nsw i32 %16, %13
%reass.add44 = add nsw i32 %reass.add, %20
%reass.add45 = add nsw i32 %reass.add44, %24
%45 = add nsw i32 %reass.add45, %32
%46 = add nsw i32 %45, %37
%47 = add nsw i32 %46, %40
%reass.add46 = add nsw i32 %47, %44
; ... weight that sum by -28 and add the weighted central value.
%reass.mul = mul nsw i32 %reass.add46, -28
%48 = add nsw i32 %reass.mul, %28
; Shift right by 8, truncate to i8 and store to the destination at index %8.
%49 = lshr i32 %48, 8
%50 = trunc i32 %49 to i8
%51 = getelementptr inbounds i8, i8* %0, i32 %8
store i8 %50, i8* %51, align 1
; Innermost latch: leave the loop once the incremented counter equals 319.
%indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
%exitcond = icmp eq i32 %indvars.iv.next, 319
br i1 %exitcond, label %52, label %7
; Middle latch: leave the loop once the incremented counter equals 239.
52:
%indvars.iv.next52 = add nuw nsw i32 %indvars.iv51, 1
%exitcond53 = icmp eq i32 %indvars.iv.next52, 239
br i1 %exitcond53, label %53, label %.preheader
; Outermost latch: continue while the incremented i64 counter is unsigned-less
; than the sign-extended %1.
53:
%54 = add nuw i64 %.050, 1
%55 = icmp ult i64 %54, %4
br i1 %55, label %.preheader47, label %._crit_edge.loopexit
}