[ARM] Tune getCastInstrCost for extending masked loads and truncating masked stores

This patch uses the feature added in D79162 to fix the cost of a sext/zext of a masked load, or a trunc for a masked store. Previously, those were considered cheap or even free, but it's not the case as we cannot split the load in the same way we would for normal loads. This updates the costs to better reflect reality, and adds a test for it in test/Analysis/CostModel/ARM/cast.ll. It also adds a vectorizer test that showcases the improvement: in some cases, the vectorizer will now choose a smaller VF when tail-predication is enabled, which results in better codegen. (Because if it were to use a higher VF in those cases, the code we see above would be generated, and the vmovs would block tail-predication later in the process, resulting in very poor codegen overall) Original Patch by Pierre van Houtryve Differential Revision: https://reviews.llvm.org/D79163
2020-07-29 13:41:34 +01:00 · 2020-07-29 13:41:34 +01:00 · 9ddb28964c
parent 60280e9818
commit 9ddb28964c
3 changed files with 207 additions and 85 deletions
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@ -313,6 +313,12 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
      return Cost == 0 ? 0 : 1;
    return Cost;
  };
+  auto IsLegalFPType = [this](EVT VT) {
+    EVT EltVT = VT.getScalarType();
+    return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
+            (EltVT == MVT::f64 && ST->hasFP64()) ||
+            (EltVT == MVT::f16 && ST->hasFullFP16());
+  };

  EVT SrcTy = TLI->getValueType(DL, Src);
  EVT DstTy = TLI->getValueType(DL, Dst);
@ -321,8 +327,21 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
    return AdjustCost(
        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));

-  // The extend of a load is free
-  if (I && isa<LoadInst>(I->getOperand(0))) {
+  // Extending masked load/Truncating masked stores is expensive because we
+  // currently don't split them. This means that we'll likely end up
+  // loading/storing each element individually (hence the high cost).
+  if ((ST->hasMVEIntegerOps() &&
+       (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
+        Opcode == Instruction::SExt)) ||
+      (ST->hasMVEFloatOps() &&
+       (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
+       IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
+    if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
+      return 2 * DstTy.getVectorNumElements() * ST->getMVEVectorCostFactor();
+
+  // The extend of other kinds of load is free
+  if (CCH == TTI::CastContextHint::Normal ||
+      CCH == TTI::CastContextHint::Masked) {
    static const TypeConversionCostTblEntry LoadConversionTbl[] = {
        {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
        {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
@ -376,11 +395,9 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
        return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
    }
-  }

    // The truncate of a store is free. This is the mirror of extends above.
-  if (I && I->hasOneUse() && isa<StoreInst>(*I->user_begin())) {
-    static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
+    static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
        {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
        {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
@ -390,18 +407,18 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
    };
    if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
      if (const auto *Entry =
-              ConvertCostTableLookup(MVELoadConversionTbl, ISD,
+              ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
    }

-    static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
+    static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
        {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
        {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
    };
    if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
      if (const auto *Entry =
-              ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
+              ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
                                     SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
        return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
    }
@ -638,14 +655,8 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
    int Lanes = 1;
    if (SrcTy.isFixedLengthVector())
      Lanes = SrcTy.getVectorNumElements();
-    auto IsLegal = [this](EVT VT) {
-      EVT EltVT = VT.getScalarType();
-      return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
-             (EltVT == MVT::f64 && ST->hasFP64()) ||
-             (EltVT == MVT::f16 && ST->hasFullFP16());
-    };

-    if (IsLegal(SrcTy) && IsLegal(DstTy))
+    if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
      return Lanes;
    else
      return Lanes * CallCost;
--- a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
@ -1852,38 +1852,38 @@ define i32 @maskedload_extends() {
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2864u = zext <2 x i8> %loadv2i8 to <2 x i64>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4816s = sext <4 x i8> %loadv4i8 to <4 x i16>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4816u = zext <4 x i8> %loadv4i8 to <4 x i16>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1322 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 298 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 82 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 330 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 74 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-MAIN-RECIP-LABEL: 'maskedload_extends'
@ -1922,15 +1922,15 @@ define i32 @maskedload_extends() {
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32>
-; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
-; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
+; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64>
 ; CHECK-V8M-MAIN-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64>
@ -1974,15 +1974,15 @@ define i32 @maskedload_extends() {
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32>
-; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
-; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
+; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64>
 ; CHECK-V8M-BASE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64>
@ -2060,38 +2060,38 @@ define i32 @maskedload_extends() {
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2864u = zext <2 x i8> %loadv2i8 to <2 x i64>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4816s = sext <4 x i8> %loadv4i8 to <4 x i16>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4816u = zext <4 x i8> %loadv4i8 to <4 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4832s = sext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4832u = zext <4 x i8> %loadv4i8 to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4864s = sext <4 x i8> %loadv4i8 to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4864u = zext <4 x i8> %loadv4i8 to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8816s = sext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8816u = zext <8 x i8> %loadv8i8 to <8 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8832s = sext <8 x i8> %loadv8i8 to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8832u = zext <8 x i8> %loadv8i8 to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8864s = sext <8 x i8> %loadv8i8 to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v8864u = zext <8 x i8> %loadv8i8 to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16816s = sext <16 x i8> %loadv16i8 to <16 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16816u = zext <16 x i8> %loadv16i8 to <16 x i16>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16832s = sext <16 x i8> %loadv16i8 to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16832u = zext <16 x i8> %loadv16i8 to <16 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16864s = sext <16 x i8> %loadv16i8 to <16 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v16864u = zext <16 x i8> %loadv16i8 to <16 x i64>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21632s = sext <2 x i16> %loadv2i16 to <2 x i32>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21632u = zext <2 x i16> %loadv2i16 to <2 x i32>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21664s = sext <2 x i16> %loadv2i16 to <2 x i64>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v21664u = zext <2 x i16> %loadv2i16 to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v41632s = sext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v41632u = zext <4 x i16> %loadv4i16 to <4 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v41664s = sext <4 x i16> %loadv4i16 to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v41664u = zext <4 x i16> %loadv4i16 to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v81632s = sext <8 x i16> %loadv8i16 to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v81632u = zext <8 x i16> %loadv8i16 to <8 x i32>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v81664s = sext <8 x i16> %loadv8i16 to <8 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v81664u = zext <8 x i16> %loadv8i16 to <8 x i64>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v23264s = sext <2 x i32> %loadv2i32 to <2 x i64>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v23264u = zext <2 x i32> %loadv2i32 to <2 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v43264s = sext <4 x i32> %loadv4i32 to <4 x i64>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v43264u = zext <4 x i32> %loadv4i32 to <4 x i64>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-MAIN-SIZE-LABEL: 'maskedload_extends'
@ -2361,7 +2361,7 @@ define i32 @maskedstore_trunc() {
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8832 = trunc <8 x i32> undef to <8 x i8>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v8864 = trunc <8 x i64> undef to <8 x i8>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16816 = trunc <16 x i16> undef to <16 x i8>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %v16832 = trunc <16 x i32> undef to <16 x i8>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16832 = trunc <16 x i32> undef to <16 x i8>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %v16864 = trunc <16 x i64> undef to <16 x i8>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v21632 = trunc <2 x i32> undef to <2 x i16>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v21664 = trunc <2 x i64> undef to <2 x i16>
@ -2774,9 +2774,9 @@ define i32 @maskedload_fpextends() {
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 4, <4 x i1> undef, <4 x float> undef)
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 4, <8 x i1> undef, <8 x float> undef)
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v7 = fpext <8 x half> %loadv8f16 to <8 x double>
@ -2866,9 +2866,9 @@ define i32 @maskedload_fpextends() {
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv4f32 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* undef, i32 4, <4 x i1> undef, <4 x float> undef)
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %loadv8f32 = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* undef, i32 4, <8 x i1> undef, <8 x float> undef)
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v1 = fpext <2 x half> %loadv2f16 to <2 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2 = fpext <4 x half> %loadv4f16 to <4 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v3 = fpext <8 x half> %loadv8f16 to <8 x float>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v4 = fpext <16 x half> %loadv16f16 to <16 x float>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v5 = fpext <2 x half> %loadv2f16 to <2 x double>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v6 = fpext <4 x half> %loadv4f16 to <4 x double>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v7 = fpext <8 x half> %loadv8f16 to <8 x double>
@ -2998,9 +2998,9 @@ define i32 @maskedload_fptrunc() {
 ; CHECK-MVE-RECIP-LABEL: 'maskedload_fptrunc'
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half>
-; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
+; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float>
 ; CHECK-MVE-RECIP-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float>
@ -3074,9 +3074,9 @@ define i32 @maskedload_fptrunc() {
 ; CHECK-MVE-SIZE-LABEL: 'maskedload_fptrunc'
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v21632 = fptrunc <2 x float> undef to <2 x half>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v21664 = fptrunc <2 x double> undef to <2 x half>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v41632 = fptrunc <4 x float> undef to <4 x half>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v41664 = fptrunc <4 x double> undef to <4 x half>
-; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
+; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v81632 = fptrunc <8 x float> undef to <8 x half>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 80 for instruction: %v81664 = fptrunc <8 x double> undef to <8 x half>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %v23264 = fptrunc <2 x double> undef to <2 x float>
 ; CHECK-MVE-SIZE-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %v43264 = fptrunc <4 x double> undef to <4 x float>
--- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll
@ -0,0 +1,111 @@
+; RUN: opt < %s -mattr=+mve,+mve.fp -loop-vectorize -S | FileCheck %s --check-prefixes=DEFAULT
+; RUN: opt < %s -mattr=+mve,+mve.fp -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=TAILPRED
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-none-eabi"
+
+; When TP is disabled, this test can vectorize with a VF of 16.
+; When TP is enabled, this test should vectorize with a VF of 8.
+;
+; DEFAULT: load <16 x i8>, <16 x i8>*
+; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16>
+; DEFAULT: add <16 x i16>
+; DEFAULT-NOT: llvm.masked.load
+; DEFAULT-NOT: llvm.masked.store
+;
+; TAILPRED: llvm.masked.load.v8i8.p0v8i8
+; TAILPRED: sext <8 x i8> %{{.*}} to <8 x i16>
+; TAILPRED: add <8 x i16>
+; TAILPRED: call void @llvm.masked.store.v8i8.p0v8i8
+; TAILPRED-NOT: load <16 x i8>, <16 x i8>*
+
+define i32 @tp_reduces_vf(i8* nocapture %0, i32 %1, i8** %input) {
+  %3 = load i8*, i8** %input, align 8
+  %4 = sext i32 %1 to i64
+  %5 = icmp eq i32 %1, 0
+  br i1 %5, label %._crit_edge, label %.preheader47.preheader
+
+.preheader47.preheader:
+  br label %.preheader47
+
+.preheader47:
+  %.050 = phi i64 [ %54, %53 ], [ 0, %.preheader47.preheader ]
+  br label %.preheader
+
+._crit_edge.loopexit:
+  br label %._crit_edge
+
+._crit_edge:
+  ret i32 0
+
+.preheader:
+  %indvars.iv51 = phi i32 [ 1, %.preheader47 ], [ %indvars.iv.next52, %52 ]
+  %6 = mul nuw nsw i32 %indvars.iv51, 320
+  br label %7
+
+7:
+  %indvars.iv = phi i32 [ 1, %.preheader ], [ %indvars.iv.next, %7 ]
+  %8 = add nuw nsw i32 %6, %indvars.iv
+  %9 = add nsw i32 %8, -320
+  %10 = add nsw i32 %8, -321
+  %11 = getelementptr inbounds i8, i8* %3, i32 %10
+  %12 = load i8, i8* %11, align 1
+  %13 = sext i8 %12 to i32
+  %14 = getelementptr inbounds i8, i8* %3, i32 %9
+  %15 = load i8, i8* %14, align 1
+  %16 = sext i8 %15 to i32
+  %17 = add nsw i32 %8, -319
+  %18 = getelementptr inbounds i8, i8* %3, i32 %17
+  %19 = load i8, i8* %18, align 1
+  %20 = sext i8 %19 to i32
+  %21 = add nsw i32 %8, -1
+  %22 = getelementptr inbounds i8, i8* %3, i32 %21
+  %23 = load i8, i8* %22, align 1
+  %24 = sext i8 %23 to i32
+  %25 = getelementptr inbounds i8, i8* %3, i32 %8
+  %26 = load i8, i8* %25, align 1
+  %27 = sext i8 %26 to i32
+  %28 = mul nsw i32 %27, 255
+  %29 = add nuw nsw i32 %8, 1
+  %30 = getelementptr inbounds i8, i8* %3, i32 %29
+  %31 = load i8, i8* %30, align 1
+  %32 = sext i8 %31 to i32
+  %33 = add nuw nsw i32 %8, 320
+  %34 = add nuw nsw i32 %8, 319
+  %35 = getelementptr inbounds i8, i8* %3, i32 %34
+  %36 = load i8, i8* %35, align 1
+  %37 = sext i8 %36 to i32
+  %38 = getelementptr inbounds i8, i8* %3, i32 %33
+  %39 = load i8, i8* %38, align 1
+  %40 = sext i8 %39 to i32
+  %41 = add nuw nsw i32 %8, 321
+  %42 = getelementptr inbounds i8, i8* %3, i32 %41
+  %43 = load i8, i8* %42, align 1
+  %44 = sext i8 %43 to i32
+  %reass.add = add nsw i32 %16, %13
+  %reass.add44 = add nsw i32 %reass.add, %20
+  %reass.add45 = add nsw i32 %reass.add44, %24
+  %45 = add nsw i32 %reass.add45, %32
+  %46 = add nsw i32 %45, %37
+  %47 = add nsw i32 %46, %40
+  %reass.add46 = add nsw i32 %47, %44
+  %reass.mul = mul nsw i32 %reass.add46, -28
+  %48 = add nsw i32 %reass.mul, %28
+  %49 = lshr i32 %48, 8
+  %50 = trunc i32 %49 to i8
+  %51 = getelementptr inbounds i8, i8* %0, i32 %8
+  store i8 %50, i8* %51, align 1
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 319
+  br i1 %exitcond, label %52, label %7
+
+52:
+  %indvars.iv.next52 = add nuw nsw i32 %indvars.iv51, 1
+  %exitcond53 = icmp eq i32 %indvars.iv.next52, 239
+  br i1 %exitcond53, label %53, label %.preheader
+
+53:
+  %54 = add nuw i64 %.050, 1
+  %55 = icmp ult i64 %54, %4
+  br i1 %55, label %.preheader47, label %._crit_edge.loopexit
+}