Prepare for inlining of SUM intrinsic

Find calls to FortranASum{Real8,Integer4}, check for dim and mask arguments being absent - then produce an inlineable simple version of the sum function. (No longer a prototype, please review for push to llvm/main - not sure how to make Phabricator update the review with actual commit message) Reviewed By: peixin, awarzynski Differential Revision: https://reviews.llvm.org/D125407
2022-04-29 15:24:37 +01:00 · 2022-04-29 15:24:37 +01:00 · 6e193b5cbb
parent 6ff873ac86
commit 6e193b5cbb
5 changed files with 573 additions and 0 deletions
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@ -35,6 +35,8 @@ std::unique_ptr<mlir::Pass> createExternalNameConversionPass();
 std::unique_ptr<mlir::Pass> createMemDataFlowOptPass();
 std::unique_ptr<mlir::Pass> createPromoteToAffinePass();
 std::unique_ptr<mlir::Pass> createMemoryAllocationPass();
+std::unique_ptr<mlir::Pass> createSimplifyIntrinsicsPass();
+
 std::unique_ptr<mlir::Pass>
 createMemoryAllocationPass(bool dynOnHeap, std::size_t maxStackSize);
 std::unique_ptr<mlir::Pass> createAnnotateConstantOperandsPass();
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@ -174,6 +174,20 @@ def MemRefDataFlowOpt : Pass<"fir-memref-dataflow-opt", "::mlir::func::FuncOp">
  ];
 }

+// This needs to be a "mlir::ModuleOp" pass, because it inserts simplified
+// functions into the module, which is invalid if a finer grain mlir::Operation
+// is used, as the pass specification says to not touch things outside the
+// scope of the operation being processed.
+def SimplifyIntrinsics : Pass<"simplify-intrinsics", "mlir::ModuleOp"> {
+  let summary = "Intrinsics simplification";
+  let description = [{
+    Qualifying intrinsics calls are replaced with calls to a specialized and
+    simplified function. The simplified function is added to the current module.
+    This function can be inlined by a general purpose inlining pass.
+  }];
+  let constructor = "::fir::createSimplifyIntrinsicsPass()";
+}
+
 def MemoryAllocationOpt : Pass<"memory-allocation-opt", "mlir::func::FuncOp"> {
  let summary = "Convert stack to heap allocations and vice versa.";
  let description = [{
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@ -11,6 +11,7 @@ add_flang_library(FIRTransforms
  RewriteLoop.cpp
  SimplifyRegionLite.cpp
  AlgebraicSimplification.cpp
+  SimplifyIntrinsics.cpp

  DEPENDS
  FIRBuilder
--- a/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
+++ b/flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp
@ -0,0 +1,237 @@
+//===- SimplifyIntrinsics.cpp -- replace intrinsics with simpler form -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass looks for calls to runtime library intrinsics that can be
+/// simplified/specialized and replaces them with a specialized function.
+///
+/// For example, SUM(arr) can be specialized as a simple function with one loop,
+/// compared to the three arguments (plus file & line info) that the runtime
+/// call has - when the argument is a 1D-array (multiple loops may be needed
+/// for higher dimension arrays, of course).
+///
+/// The general idea is that besides making the call simpler, it can also be
+/// inlined by other passes that run after this pass, which further improves
+/// performance, particularly when the work done in the function is trivial
+/// and small in size.
+//===----------------------------------------------------------------------===//
+
+#include "PassDetail.h"
+#include "flang/Optimizer/Builder/BoxValue.h"
+#include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "flang/Optimizer/Support/FIRContext.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+namespace {
+
+/// Module-level pass that replaces qualifying calls to Fortran runtime
+/// intrinsics (currently only SUM) with calls to a simplified function that
+/// is generated into the same module.
+class SimplifyIntrinsicsPass
+    : public fir::SimplifyIntrinsicsBase<SimplifyIntrinsicsPass> {
+public:
+  /// Return the simplified replacement for the runtime function \p baseName,
+  /// creating it in the module on first use.
+  ///
+  /// \param loc location attached to the generated operations.
+  /// \param builder builder used to look up and emit the function; its
+  ///        insertion point is the same on return as it was on entry.
+  /// \param type element type of the array, which is also the result type.
+  /// \param baseName name of the runtime function being replaced; the
+  ///        generated function is named `<baseName>_simplified`.
+  mlir::func::FuncOp getOrCreateFunction(const mlir::Location &loc,
+                                         fir::FirOpBuilder &builder,
+                                         const mlir::Type &type,
+                                         const mlir::StringRef &baseName);
+
+  /// Walk the module and rewrite the qualifying runtime calls.
+  void runOnOperation() override;
+};
+
+} // namespace
+
+/// Look up `<baseName>_simplified` in the module that \p builder works on and
+/// return it; if it does not exist yet, generate it first. The generated
+/// function takes a single `!fir.box<none>` argument, treats it as a rank-1
+/// array of \p type and returns the sum of its elements. The builder's
+/// insertion point is restored before returning.
+mlir::func::FuncOp SimplifyIntrinsicsPass::getOrCreateFunction(
+    const mlir::Location &loc, fir::FirOpBuilder &builder,
+    const mlir::Type &type, const mlir::StringRef &baseName) {
+  // Eventually this function should only create the function shell and
+  // delegate the body to a callback (e.g. a lambda). For now only the SUM
+  // runtime calls are handled, so check that nothing else gets here.
+  assert(baseName.startswith("_FortranASum"));
+
+  // Reuse the simplified function if it was already generated.
+  std::string simplifiedName = mlir::Twine{baseName, "_simplified"}.str();
+  mlir::ModuleOp parentModule = builder.getModule();
+  if (mlir::func::FuncOp existing =
+          fir::FirOpBuilder::getNamedFunction(parentModule, simplifiedName))
+    return existing;
+
+  // Otherwise build it. The generated function is equivalent to:
+  //   function FortranASum<T>_simplified(arr)
+  //     T, dimension(:) :: arr
+  //     T sum = 0
+  //     integer iter
+  //     do iter = 0, extent(arr) - 1
+  //       sum = sum + arr[iter]
+  //     end do
+  //     FortranASum<T>_simplified = sum
+  //   end function FortranASum<T>_simplified
+  mlir::Type noneBoxTy = fir::BoxType::get(builder.getNoneType());
+  mlir::FunctionType funcTy =
+      mlir::FunctionType::get(builder.getContext(), {noneBoxTy}, {type});
+  mlir::func::FuncOp simplified = fir::FirOpBuilder::createFunction(
+      loc, parentModule, simplifiedName, funcTy);
+  auto odrLinkage = mlir::LLVM::linkage::Linkage::LinkonceODR;
+  simplified->setAttr(
+      "llvm.linkage",
+      mlir::LLVM::LinkageAttr::get(builder.getContext(), odrLinkage));
+
+  // Emit the body into the new function's entry block; the guard puts the
+  // builder back at the original call site when this function returns.
+  mlir::OpBuilder::InsertionGuard callSiteGuard(builder);
+  builder.setInsertionPointToEnd(simplified.addEntryBlock());
+
+  mlir::IndexType indexTy = builder.getIndexType();
+
+  // Accumulator: a stack slot initialised to zero of the element type.
+  mlir::Value initVal;
+  if (type.isa<mlir::FloatType>())
+    initVal = builder.createRealConstant(loc, type, 0.0);
+  else
+    initVal = builder.createIntegerConstant(loc, type, 0);
+  mlir::Value accum = builder.create<fir::AllocaOp>(loc, type);
+  builder.create<fir::StoreOp>(loc, initVal, accum);
+
+  mlir::Block::BlockArgListType funcArgs = simplified.front().getArguments();
+  mlir::Value boxArg = funcArgs[0];
+
+  mlir::Value firstIdx = builder.createIntegerConstant(loc, indexTy, 0);
+
+  // View the opaque box as a box of a rank-1 array of unknown extent.
+  fir::SequenceType::Shape rank1Shape = {fir::SequenceType::getUnknownExtent()};
+  mlir::Type rank1ArrayTy = fir::SequenceType::get(rank1Shape, type);
+  mlir::Type typedBoxTy = fir::BoxType::get(rank1ArrayTy);
+  mlir::Value typedBox =
+      builder.create<fir::ConvertOp>(loc, typedBoxTy, boxArg);
+  auto boxDims = builder.create<fir::BoxDimsOp>(loc, indexTy, indexTy, indexTy,
+                                                typedBox, firstIdx);
+  mlir::Value count = boxDims.getResult(1);
+  mlir::Value unit = builder.createIntegerConstant(loc, indexTy, 1);
+
+  // Indices are zero-based, so the last index visited is count - 1.
+  mlir::Value lastIdx = builder.create<mlir::arith::SubIOp>(loc, count, unit);
+  auto sumLoop = builder.create<fir::DoLoopOp>(loc, firstIdx, lastIdx, unit);
+
+  {
+    // Fill in the loop body; this guard returns the builder to the position
+    // right after the loop once the body is complete.
+    mlir::OpBuilder::InsertionGuard afterLoopGuard(builder);
+    builder.setInsertionPointToStart(sumLoop.getBody());
+
+    mlir::Type elemRefTy = builder.getRefType(type);
+    mlir::Value iterIdx = sumLoop.getInductionVar();
+    mlir::Value elemAddr =
+        builder.create<fir::CoordinateOp>(loc, elemRefTy, typedBox, iterIdx);
+    mlir::Value elemVal = builder.create<fir::LoadOp>(loc, elemAddr);
+    mlir::Value partial = builder.create<fir::LoadOp>(loc, accum);
+
+    // accum = element + accum, using the add that matches the element type.
+    mlir::Value updated;
+    if (type.isa<mlir::FloatType>())
+      updated = builder.create<mlir::arith::AddFOp>(loc, elemVal, partial);
+    else if (type.isa<mlir::IntegerType>())
+      updated = builder.create<mlir::arith::AddIOp>(loc, elemVal, partial);
+    else
+      TODO(loc, "Unsupported type");
+
+    builder.create<fir::StoreOp>(loc, updated, accum);
+  }
+
+  // After the loop: return the accumulated value.
+  mlir::Value total = builder.create<fir::LoadOp>(loc, accum);
+  builder.create<mlir::func::ReturnOp>(loc, total);
+
+  return simplified;
+}
+
+/// Return true if \p val is the result of a fir.convert whose input is
+/// produced by fir.absent. Any other producer, including a block argument,
+/// yields false.
+static bool isOperandAbsent(mlir::Value val) {
+  // Only look through fir.convert: taking operand 0 of an arbitrary defining
+  // operation is invalid when that operation has no operands.
+  if (auto convert =
+          mlir::dyn_cast_or_null<fir::ConvertOp>(val.getDefiningOp()))
+    return mlir::isa_and_nonnull<fir::AbsentOp>(
+        convert->getOperand(0).getDefiningOp());
+  return false;
+}
+
+/// Return true if \p val is the result of a fir.convert whose input is
+/// defined by an operation matching a constant zero. Any other producer,
+/// including a block argument, yields false.
+static bool isZero(mlir::Value val) {
+  // Only look through fir.convert: taking operand 0 of an arbitrary defining
+  // operation (e.g. a constant, which has no operands) is invalid.
+  if (auto convert =
+          mlir::dyn_cast_or_null<fir::ConvertOp>(val.getDefiningOp()))
+    if (mlir::Operation *defOp = convert->getOperand(0).getDefiningOp())
+      return mlir::matchPattern(defOp, mlir::m_Zero());
+  return false;
+}
+
+/// Find the shape operand of the fir.embox that produced \p val, looking
+/// through any chain of fir.convert operations. Returns a null value when
+/// \p val is not a (converted) fir.embox result, or when the embox has no
+/// shape operand.
+static mlir::Value findShape(mlir::Value val) {
+  mlir::Operation *defOp = val.getDefiningOp();
+  // Only step through fir.convert. Following operand 0 of arbitrary
+  // operations would be invalid for operations without operands and could
+  // reach an embox that is unrelated to the value being queried.
+  while (auto convert = mlir::dyn_cast_or_null<fir::ConvertOp>(defOp)) {
+    defOp = convert->getOperand(0).getDefiningOp();
+    if (auto box = mlir::dyn_cast_or_null<fir::EmboxOp>(defOp))
+      return box.getShape();
+  }
+  return {};
+}
+
+/// Return the rank described by the shape found for \p val, or 0 when no
+/// shape can be found.
+static unsigned getDimCount(mlir::Value val) {
+  // Query the type of the shape value itself rather than going through its
+  // defining operation, which does not exist when the shape is a block
+  // argument.
+  if (mlir::Value shapeVal = findShape(val))
+    return fir::getRankOfShapeType(shapeVal.getType());
+  return 0;
+}
+
+/// Visit every fir.call in the module and replace calls to the SUM runtime
+/// functions with calls to a simplified function when the call qualifies:
+/// no dim or mask argument, a rank-1 array, and an Integer4 or Real8 element
+/// type. All other calls are left untouched. The simplified function is added
+/// to the module on first use.
+void SimplifyIntrinsicsPass::runOnOperation() {
+  mlir::ModuleOp module = getOperation();
+  fir::KindMapping kindMap = fir::getKindMapping(module);
+  module.walk([&](fir::CallOp call) {
+    mlir::SymbolRefAttr callee = call.getCalleeAttr();
+    if (!callee)
+      return;
+    mlir::StringRef funcName = callee.getLeafReference().getValue();
+    if (!funcName.startswith("_FortranASum"))
+      return;
+
+    // Prototype for runtime call (from sum.cpp):
+    // RTNAME(Sum<T>)(const Descriptor &x, const char *source, int line,
+    //                int dim, const Descriptor *mask)
+    //
+    // Only calls with exactly these five arguments are considered. This also
+    // skips calls to the generated "<name>_simplified" functions, which share
+    // the name prefix but take a single argument.
+    mlir::Operation::operand_range args = call.getArgs();
+    if (args.size() != 5)
+      return;
+
+    // args[1] and args[2] are source filename and line number, ignored.
+    mlir::Value dim = args[3];
+    mlir::Value mask = args[4];
+    // dim is zero when it is absent, which is an implementation
+    // detail in the runtime library.
+    if (!isZero(dim) || !isOperandAbsent(mask))
+      return;
+    if (getDimCount(args[0]) != 1)
+      return;
+
+    mlir::Location loc = call.getLoc();
+    fir::FirOpBuilder builder(call.getOperation(), kindMap);
+    mlir::Type type;
+    if (funcName.endswith("Integer4"))
+      type = mlir::IntegerType::get(builder.getContext(), 32);
+    else if (funcName.endswith("Real8"))
+      type = mlir::FloatType::getF64(builder.getContext());
+    else
+      return;
+
+    // Call the simplified function with the array only, then retire the
+    // original runtime call.
+    mlir::func::FuncOp newFunc =
+        getOrCreateFunction(loc, builder, type, funcName);
+    auto newCall = builder.create<fir::CallOp>(loc, newFunc,
+                                               mlir::ValueRange{args[0]});
+    call->replaceAllUsesWith(newCall.getResults());
+    call->dropAllReferences();
+    call->erase();
+  });
+}
+
+/// Factory for the intrinsics simplification pass.
+std::unique_ptr<mlir::Pass> fir::createSimplifyIntrinsicsPass() {
+  std::unique_ptr<mlir::Pass> pass =
+      std::make_unique<SimplifyIntrinsicsPass>();
+  return pass;
+}
--- a/flang/test/Transforms/simplifyintrinsics.fir
+++ b/flang/test/Transforms/simplifyintrinsics.fir
@ -0,0 +1,319 @@
+// RUN: fir-opt --split-input-file --simplify-intrinsics %s | FileCheck %s
+
+// Call to SUM with 1D I32 array is replaced.
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} {
+  func.func @sum_1d_array_int(%arg0: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> i32 {
+    %c10 = arith.constant 10 : index
+    %0 = fir.alloca i32 {bindc_name = "test_sum_2", uniq_name = "_QFtest_sum_2Etest_sum_2"}
+    %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %2 = fir.embox %arg0(%1) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xi32>>
+    %3 = fir.absent !fir.box<i1>
+    %c0 = arith.constant 0 : index
+    %4 = fir.address_of(@_QQcl.2E2F6973756D5F322E66393000) : !fir.ref<!fir.char<1,13>>
+    %c5_i32 = arith.constant 5 : i32
+    %5 = fir.convert %2 : (!fir.box<!fir.array<10xi32>>) -> !fir.box<none>
+    %6 = fir.convert %4 : (!fir.ref<!fir.char<1,13>>) -> !fir.ref<i8>
+    %7 = fir.convert %c0 : (index) -> i32
+    %8 = fir.convert %3 : (!fir.box<i1>) -> !fir.box<none>
+    %9 = fir.call @_FortranASumInteger4(%5, %6, %c5_i32, %7, %8) : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+    fir.store %9 to %0 : !fir.ref<i32>
+    %10 = fir.load %0 : !fir.ref<i32>
+    return %10 : i32
+  }
+  func.func private @_FortranASumInteger4(!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32 attributes {fir.runtime}
+  fir.global linkonce @_QQcl.2E2F6973756D5F322E66393000 constant : !fir.char<1,13> {
+    %0 = fir.string_lit "./isum_2.f90\00"(13) : !fir.char<1,13>
+    fir.has_value %0 : !fir.char<1,13>
+  }
+}
+
+
+// CHECK-LABEL:   func.func @sum_1d_array_int(
+// CHECK-SAME:                             %[[A:.*]]: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> i32 {
+// CHECK:           %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
+// CHECK:           %[[A_BOX_I32:.*]] = fir.embox %[[A]](%[[SHAPE]]) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xi32>>
+// CHECK:           %[[A_BOX_NONE:.*]] = fir.convert %[[A_BOX_I32]] : (!fir.box<!fir.array<10xi32>>) -> !fir.box<none>
+// CHECK-NOT:       fir.call @_FortranASumInteger4({{.*}})
+// CHECK:           %[[RES:.*]] = fir.call @_FortranASumInteger4_simplified(%[[A_BOX_NONE]]) : (!fir.box<none>) -> i32
+// CHECK-NOT:       fir.call @_FortranASumInteger4({{.*}})
+// CHECK:           return %{{.*}} : i32
+// CHECK:         }
+// CHECK:         func.func private @_FortranASumInteger4(!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32 attributes {fir.runtime}
+
+// CHECK-LABEL:   func.func private @_FortranASumInteger4_simplified(
+// CHECK-SAME:                                                       %[[ARR:.*]]: !fir.box<none>) -> i32 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+// CHECK:           %[[CI32_0:.*]] = arith.constant 0 : i32
+// CHECK:           %[[SUM:.*]] = fir.alloca i32
+// CHECK:           fir.store %[[CI32_0]] to %[[SUM]] : !fir.ref<i32>
+// CHECK:           %[[CINDEX_0:.*]] = arith.constant 0 : index
+// CHECK:           %[[ARR_BOX_I32:.*]] = fir.convert %[[ARR]] : (!fir.box<none>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[DIMS:.*]]:3 = fir.box_dims %[[ARR_BOX_I32]], %[[CINDEX_0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:           %[[CINDEX_1:.*]] = arith.constant 1 : index
+// CHECK:           %[[EXTENT:.*]] = arith.subi %[[DIMS]]#1, %[[CINDEX_1]] : index
+// CHECK:           fir.do_loop %[[ITER:.*]] = %[[CINDEX_0]] to %[[EXTENT]] step %[[CINDEX_1]] {
+// CHECK:             %[[ITEM:.*]] = fir.coordinate_of %[[ARR_BOX_I32]], %[[ITER]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:             %[[ITEM_VAL:.*]] = fir.load %[[ITEM]] : !fir.ref<i32>
+// CHECK:             %[[SUM_VAL:.*]] = fir.load %[[SUM]] : !fir.ref<i32>
+// CHECK:             %[[NEW_SUM:.*]] = arith.addi %[[ITEM_VAL]], %[[SUM_VAL]] : i32
+// CHECK:             fir.store %[[NEW_SUM]] to %[[SUM]] : !fir.ref<i32>
+// CHECK:           }
+// CHECK:           %[[RET:.*]] = fir.load %[[SUM]] : !fir.ref<i32>
+// CHECK:           return %[[RET]] : i32
+// CHECK:         }
+
+// -----
+
+// Call to SUM with 2D I32 arrays is not replaced.
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} {
+  func.func @sum_2d_array_int(%arg0: !fir.ref<!fir.array<10x10xi32>> {fir.bindc_name = "a"}) -> i32 {
+    %c10 = arith.constant 10 : index
+    %c10_0 = arith.constant 10 : index
+    %0 = fir.alloca i32 {bindc_name = "test_sum_3", uniq_name = "_QFtest_sum_3Etest_sum_3"}
+    %1 = fir.shape %c10, %c10_0 : (index, index) -> !fir.shape<2>
+    %2 = fir.embox %arg0(%1) : (!fir.ref<!fir.array<10x10xi32>>, !fir.shape<2>) -> !fir.box<!fir.array<10x10xi32>>
+    %3 = fir.absent !fir.box<i1>
+    %c0 = arith.constant 0 : index
+    %4 = fir.address_of(@_QQcl.2E2F6973756D5F332E66393000) : !fir.ref<!fir.char<1,13>>
+    %c5_i32 = arith.constant 5 : i32
+    %5 = fir.convert %2 : (!fir.box<!fir.array<10x10xi32>>) -> !fir.box<none>
+    %6 = fir.convert %4 : (!fir.ref<!fir.char<1,13>>) -> !fir.ref<i8>
+    %7 = fir.convert %c0 : (index) -> i32
+    %8 = fir.convert %3 : (!fir.box<i1>) -> !fir.box<none>
+    %9 = fir.call @_FortranASumInteger4(%5, %6, %c5_i32, %7, %8) : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+    fir.store %9 to %0 : !fir.ref<i32>
+    %10 = fir.load %0 : !fir.ref<i32>
+    return %10 : i32
+  }
+  func.func private @_FortranASumInteger4(!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32 attributes {fir.runtime}
+  fir.global linkonce @_QQcl.2E2F6973756D5F332E66393000 constant : !fir.char<1,13> {
+    %0 = fir.string_lit "./isum_3.f90\00"(13) : !fir.char<1,13>
+    fir.has_value %0 : !fir.char<1,13>
+  }
+}
+
+// CHECK-LABEL:   func.func @sum_2d_array_int({{.*}} !fir.ref<!fir.array<10x10xi32>> {fir.bindc_name = "a"}) -> i32 {
+// CHECK-NOT:       fir.call @_FortranASumInteger4_simplified({{.*}})
+// CHECK:           fir.call @_FortranASumInteger4({{.*}}) : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+// CHECK-NOT:       fir.call @_FortranASumInteger4_simplified({{.*}})
+
+// -----
+
+// Call to SUM with 1D F64 is replaced.
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} {
+  func.func @sum_1d_real(%arg0: !fir.ref<!fir.array<10xf64>> {fir.bindc_name = "a"}) -> f64 {
+    %c10 = arith.constant 10 : index
+    %0 = fir.alloca f64 {bindc_name = "sum_1d_real", uniq_name = "_QFsum_1d_realEsum_1d_real"}
+    %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %2 = fir.embox %arg0(%1) : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf64>>
+    %3 = fir.absent !fir.box<i1>
+    %c0 = arith.constant 0 : index
+    %4 = fir.address_of(@_QQcl.2E2F6973756D5F352E66393000) : !fir.ref<!fir.char<1,13>>
+    %c5_i32 = arith.constant 5 : i32
+    %5 = fir.convert %2 : (!fir.box<!fir.array<10xf64>>) -> !fir.box<none>
+    %6 = fir.convert %4 : (!fir.ref<!fir.char<1,13>>) -> !fir.ref<i8>
+    %7 = fir.convert %c0 : (index) -> i32
+    %8 = fir.convert %3 : (!fir.box<i1>) -> !fir.box<none>
+    %9 = fir.call @_FortranASumReal8(%5, %6, %c5_i32, %7, %8) : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> f64
+    fir.store %9 to %0 : !fir.ref<f64>
+    %10 = fir.load %0 : !fir.ref<f64>
+    return %10 : f64
+  }
+  func.func private @_FortranASumReal8(!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> f64 attributes {fir.runtime}
+  fir.global linkonce @_QQcl.2E2F6973756D5F352E66393000 constant : !fir.char<1,13> {
+    %0 = fir.string_lit "./isum_5.f90\00"(13) : !fir.char<1,13>
+    fir.has_value %0 : !fir.char<1,13>
+  }
+}
+
+
+// CHECK-LABEL:   func.func @sum_1d_real(
+// CHECK-SAME:                           %[[A:.*]]: !fir.ref<!fir.array<10xf64>> {fir.bindc_name = "a"}) -> f64 {
+// CHECK:           %[[CINDEX_10:.*]] = arith.constant 10 : index
+// CHECK:           %[[SHAPE:.*]] = fir.shape %[[CINDEX_10]] : (index) -> !fir.shape<1>
+// CHECK:           %[[A_BOX_F64:.*]] = fir.embox %[[A]](%[[SHAPE]]) : (!fir.ref<!fir.array<10xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<10xf64>>
+// CHECK:           %[[A_BOX_NONE:.*]] = fir.convert %[[A_BOX_F64]] : (!fir.box<!fir.array<10xf64>>) -> !fir.box<none>
+// CHECK-NOT:       fir.call @_FortranASumReal8({{.*}})
+// CHECK:           %[[RES:.*]] = fir.call @_FortranASumReal8_simplified(%[[A_BOX_NONE]]) : (!fir.box<none>) -> f64
+// CHECK-NOT:       fir.call @_FortranASumReal8({{.*}})
+// CHECK:           return %{{.*}} : f64
+// CHECK:         }
+
+// CHECK-LABEL:   func.func private @_FortranASumReal8_simplified(
+// CHECK-SAME:                                                    %[[ARR:.*]]: !fir.box<none>) -> f64 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+// CHECK:           %[[ZERO:.*]] = arith.constant 0.000000e+00 : f64
+// CHECK:           %[[SUM:.*]] = fir.alloca f64
+// CHECK:           fir.store %[[ZERO]] to %[[SUM]] : !fir.ref<f64>
+// CHECK:           %[[CINDEX_0:.*]] = arith.constant 0 : index
+// CHECK:           %[[ARR_BOX_F64:.*]] = fir.convert %[[ARR]] : (!fir.box<none>) -> !fir.box<!fir.array<?xf64>>
+// CHECK:           %[[DIMS:.*]]:3 = fir.box_dims %[[ARR_BOX_F64]], %[[CINDEX_0]] : (!fir.box<!fir.array<?xf64>>, index) -> (index, index, index)
+// CHECK:           %[[CINDEX_1:.*]] = arith.constant 1 : index
+// CHECK:           %[[EXTENT:.*]] = arith.subi %[[DIMS]]#1, %[[CINDEX_1]] : index
+// CHECK:           fir.do_loop %[[ITER:.*]] = %[[CINDEX_0]] to %[[EXTENT]] step %[[CINDEX_1]] {
+// CHECK:             %[[ITEM:.*]] = fir.coordinate_of %[[ARR_BOX_F64]], %[[ITER]] : (!fir.box<!fir.array<?xf64>>, index) -> !fir.ref<f64>
+// CHECK:             %[[ITEM_VAL:.*]] = fir.load %[[ITEM]] : !fir.ref<f64>
+// CHECK:             %[[SUM_VAL:.*]] = fir.load %[[SUM]] : !fir.ref<f64>
+// CHECK:             %[[NEW_SUM:.*]] = arith.addf %[[ITEM_VAL]], %[[SUM_VAL]] : f64
+// CHECK:             fir.store %[[NEW_SUM]] to %[[SUM]] : !fir.ref<f64>
+// CHECK:           }
+// CHECK:           %[[RES:.*]] = fir.load %[[SUM]] : !fir.ref<f64>
+// CHECK:           return %[[RES]] : f64
+// CHECK:         }
+
+// -----
+
+// Call to SUM with 1D COMPLEX array is not replaced.
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} {
+  func.func @sum_1d_complex(%arg0: !fir.ref<!fir.array<10x!fir.complex<4>>> {fir.bindc_name = "a"}) -> !fir.complex<4> {
+    %0 = fir.alloca !fir.complex<4>
+    %c10 = arith.constant 10 : index
+    %1 = fir.alloca !fir.complex<4> {bindc_name = "sum_1d_complex", uniq_name = "_QFsum_1d_complexEsum_1d_complex"}
+    %2 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %3 = fir.embox %arg0(%2) : (!fir.ref<!fir.array<10x!fir.complex<4>>>, !fir.shape<1>) -> !fir.box<!fir.array<10x!fir.complex<4>>>
+    %4 = fir.absent !fir.box<i1>
+    %c0 = arith.constant 0 : index
+    %5 = fir.address_of(@_QQcl.2E2F6973756D5F362E66393000) : !fir.ref<!fir.char<1,13>>
+    %c5_i32 = arith.constant 5 : i32
+    %6 = fir.convert %0 : (!fir.ref<!fir.complex<4>>) -> !fir.ref<complex<f32>>
+    %7 = fir.convert %3 : (!fir.box<!fir.array<10x!fir.complex<4>>>) -> !fir.box<none>
+    %8 = fir.convert %5 : (!fir.ref<!fir.char<1,13>>) -> !fir.ref<i8>
+    %9 = fir.convert %c0 : (index) -> i32
+    %10 = fir.convert %4 : (!fir.box<i1>) -> !fir.box<none>
+    %11 = fir.call @_FortranACppSumComplex4(%6, %7, %8, %c5_i32, %9, %10) : (!fir.ref<complex<f32>>, !fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> none
+    %12 = fir.load %0 : !fir.ref<!fir.complex<4>>
+    fir.store %12 to %1 : !fir.ref<!fir.complex<4>>
+    %13 = fir.load %1 : !fir.ref<!fir.complex<4>>
+    return %13 : !fir.complex<4>
+  }
+  func.func private @_FortranACppSumComplex4(!fir.ref<complex<f32>>, !fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> none attributes {fir.runtime}
+  fir.global linkonce @_QQcl.2E2F6973756D5F362E66393000 constant : !fir.char<1,13> {
+    %0 = fir.string_lit "./isum_6.f90\00"(13) : !fir.char<1,13>
+    fir.has_value %0 : !fir.char<1,13>
+  }
+}
+
+// CHECK-LABEL:   func.func @sum_1d_complex(%{{.*}}: !fir.ref<!fir.array<10x!fir.complex<4>>> {fir.bindc_name = "a"}) -> !fir.complex<4> {
+// CHECK-NOT:       fir.call @_FortranACppSumComplex4_simplified({{.*}})
+// CHECK:           fir.call @_FortranACppSumComplex4({{.*}}) : (!fir.ref<complex<f32>>, !fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> none
+// CHECK-NOT:       fir.call @_FortranACppSumComplex4_simplified({{.*}})
+
+// -----
+
+// Test that two functions calling the same SUM function
+// generates only ONE function declaration (and that both
+// calls are converted)
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} {
+  func.func @sum_1d_calla(%arg0: !fir.ref<!fir.array<10xi32>> {fir.bindc_name = "a"}) -> i32 {
+    %c10 = arith.constant 10 : index
+    %0 = fir.alloca i32 {bindc_name = "sum_1d_calla", uniq_name = "_QFsum_1d_callaEsum_1d_calla"}
+    %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %2 = fir.embox %arg0(%1) : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<10xi32>>
+    %3 = fir.absent !fir.box<i1>
+    %c0 = arith.constant 0 : index
+    %4 = fir.address_of(@_QQcl.2E2F6973756D5F372E66393000) : !fir.ref<!fir.char<1,13>>
+    %c5_i32 = arith.constant 5 : i32
+    %5 = fir.convert %2 : (!fir.box<!fir.array<10xi32>>) -> !fir.box<none>
+    %6 = fir.convert %4 : (!fir.ref<!fir.char<1,13>>) -> !fir.ref<i8>
+    %7 = fir.convert %c0 : (index) -> i32
+    %8 = fir.convert %3 : (!fir.box<i1>) -> !fir.box<none>
+    %9 = fir.call @_FortranASumInteger4(%5, %6, %c5_i32, %7, %8) : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+    fir.store %9 to %0 : !fir.ref<i32>
+    %10 = fir.load %0 : !fir.ref<i32>
+    return %10 : i32
+  }
+  func.func @sum_1d_callb(%arg0: !fir.ref<!fir.array<20xi32>> {fir.bindc_name = "a"}) -> i32 {
+    %c20 = arith.constant 20 : index
+    %0 = fir.alloca i32 {bindc_name = "sum_1d_callb", uniq_name = "_QFsum_1d_callbEsum_1d_callb"}
+    %1 = fir.shape %c20 : (index) -> !fir.shape<1>
+    %2 = fir.embox %arg0(%1) : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<20xi32>>
+    %3 = fir.absent !fir.box<i1>
+    %c0 = arith.constant 0 : index
+    %4 = fir.address_of(@_QQcl.2E2F6973756D5F372E66393000) : !fir.ref<!fir.char<1,13>>
+    %c12_i32 = arith.constant 12 : i32
+    %5 = fir.convert %2 : (!fir.box<!fir.array<20xi32>>) -> !fir.box<none>
+    %6 = fir.convert %4 : (!fir.ref<!fir.char<1,13>>) -> !fir.ref<i8>
+    %7 = fir.convert %c0 : (index) -> i32
+    %8 = fir.convert %3 : (!fir.box<i1>) -> !fir.box<none>
+    %9 = fir.call @_FortranASumInteger4(%5, %6, %c12_i32, %7, %8) : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+    fir.store %9 to %0 : !fir.ref<i32>
+    %10 = fir.load %0 : !fir.ref<i32>
+    return %10 : i32
+  }
+  func.func private @_FortranASumInteger4(!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32 attributes {fir.runtime}
+  fir.global linkonce @_QQcl.2E2F6973756D5F372E66393000 constant : !fir.char<1,13> {
+    %0 = fir.string_lit "./isum_7.f90\00"(13) : !fir.char<1,13>
+    fir.has_value %0 : !fir.char<1,13>
+  }
+}
+
+// CHECK-LABEL:   func.func @sum_1d_calla(%{{.*}}) -> i32 {
+// CHECK-NOT:       fir.call @_FortranASumInteger4({{.*}})
+// CHECK:           fir.call @_FortranASumInteger4_simplified(%{{.*}})
+// CHECK-NOT:       fir.call @_FortranASumInteger4({{.*}})
+// CHECK:         }
+
+// CHECK-LABEL:   func.func @sum_1d_callb(%{{.*}}) -> i32 {
+// CHECK-NOT:       fir.call @_FortranASumInteger4({{.*}})
+// CHECK:           fir.call @_FortranASumInteger4_simplified(%{{.*}})
+// CHECK-NOT:       fir.call @_FortranASumInteger4({{.*}})
+// CHECK:         }
+
+// CHECK-LABEL:   func.func private @_FortranASumInteger4_simplified({{.*}}) -> i32 {{.*}} {
+// CHECK:           return %{{.*}} : i32
+// CHECK:         }
+// CHECK-NOT:   func.func private @_FortranASumInteger4_simplified({{.*}})
+
+// -----
+
+// Call to SUM with a strided section of a 1D I32 array is replaced.
+module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} {
+  func.func @sum_1d_stride(%arg0: !fir.ref<!fir.array<20xi32>> {fir.bindc_name = "a"}) -> i32 {
+    %c20 = arith.constant 20 : index
+    %0 = fir.alloca i32 {bindc_name = "sum_1d_stride", uniq_name = "_QFsum_1d_strideEsum_1d_stride"}
+    %c1 = arith.constant 1 : index
+    %c2_i64 = arith.constant 2 : i64
+    %1 = fir.convert %c2_i64 : (i64) -> index
+    %2 = arith.addi %c1, %c20 : index
+    %3 = arith.subi %2, %c1 : index
+    %4 = fir.shape %c20 : (index) -> !fir.shape<1>
+    %5 = fir.slice %c1, %3, %1 : (index, index, index) -> !fir.slice<1>
+    %6 = fir.embox %arg0(%4) [%5] : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box<!fir.array<?xi32>>
+    %7 = fir.absent !fir.box<i1>
+    %c0 = arith.constant 0 : index
+    %8 = fir.address_of(@_QQcl.2E2F6973756D5F382E66393000) : !fir.ref<!fir.char<1,13>>
+    %c5_i32 = arith.constant 5 : i32
+    %9 = fir.convert %6 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+    %10 = fir.convert %8 : (!fir.ref<!fir.char<1,13>>) -> !fir.ref<i8>
+    %11 = fir.convert %c0 : (index) -> i32
+    %12 = fir.convert %7 : (!fir.box<i1>) -> !fir.box<none>
+    %13 = fir.call @_FortranASumInteger4(%9, %10, %c5_i32, %11, %12) : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+    fir.store %13 to %0 : !fir.ref<i32>
+    %14 = fir.load %0 : !fir.ref<i32>
+    return %14 : i32
+  }
+  func.func private @_FortranASumInteger4(!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32 attributes {fir.runtime}
+  fir.global linkonce @_QQcl.2E2F6973756D5F382E66393000 constant : !fir.char<1,13> {
+    %0 = fir.string_lit "./isum_8.f90\00"(13) : !fir.char<1,13>
+    fir.has_value %0 : !fir.char<1,13>
+  }
+}
+
+// CHECK-LABEL:   func.func @sum_1d_stride(%{{.*}} -> i32 {
+// CHECK:           %[[CI64_2:.*]] = arith.constant 2 : i64
+// CHECK:           %[[CINDEX_2:.*]] = fir.convert %[[CI64_2]] : (i64) -> index
+// CHECK:           %[[SHAPE:.*]] = fir.shape %{{.*}}
+// CHECK:           %[[SLICE:.*]] = fir.slice %{{.*}}, %{{.*}}, %[[CINDEX_2]] : (index, index, index) -> !fir.slice<1>
+// CHECK:           %[[A_BOX_I32:.*]] = fir.embox %{{.*}}(%[[SHAPE]]) {{\[}}%[[SLICE]]] : (!fir.ref<!fir.array<20xi32>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[A_BOX_NONE:.*]] = fir.convert %[[A_BOX_I32]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+// CHECK:           %{{.*}} = fir.call @_FortranASumInteger4_simplified(%[[A_BOX_NONE]]) : (!fir.box<none>) -> i32
+// CHECK:           return %{{.*}} : i32
+// CHECK:         }
+
+// CHECK-LABEL:   func.func private @_FortranASumInteger4_simplified(%{{.*}}) -> i32 attributes {llvm.linkage = #llvm.linkage<linkonce_odr>} {
+// CHECK:           %[[ARR_BOX_I32:.*]] = fir.convert %{{.*}} : (!fir.box<none>) -> !fir.box<!fir.array<?xi32>>
+// CHECK:           %[[DIMS:.*]]:3 = fir.box_dims %[[ARR_BOX_I32]], %{{.*}} : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
+// CHECK:           %[[CINDEX_1:.*]] = arith.constant 1 : index
+// CHECK:           %[[EXTENT:.*]] = arith.subi %[[DIMS]]#1, %[[CINDEX_1]] : index
+// CHECK:           fir.do_loop %[[ITER:.*]] = %{{.*}} to %[[EXTENT]] step %[[CINDEX_1]] {
+// CHECK:             %{{.*}} = fir.coordinate_of %[[ARR_BOX_I32]], %[[ITER]] : (!fir.box<!fir.array<?xi32>>, index) -> !fir.ref<i32>
+// CHECK:           }
+// CHECK:           return %{{.*}} : i32
+// CHECK:         }