[Inliner] Penalise inlining of calls with loops at Oz

We currently seem to underestimate the size of functions with loops in them, both in terms of absolute code size and in the difficulties of dealing with such code. (Calls, for example, can be tail merged to further reduce code size.) As a result, at -Oz we can end up increasing code size by inlining small loops multiple times. This attempts to penalise functions with loops at -Oz by adding a CallPenalty for each top level loop in the function. It uses LI (and hence DT) to calculate the number of loops. As we are dealing with minsize, the inline threshold is small and functions at this point should be relatively small, making the construction of these cheap. Differential Revision: https://reviews.llvm.org/D52716 llvm-svn: 346134
2018-11-05 14:54:34 +00:00 · 2018-11-05 14:54:34 +00:00 · ba9f245b0d
parent 8d7c351799
commit ba9f245b0d
4 changed files with 251 additions and 0 deletions
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@ -23,6 +23,7 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@ -30,6 +31,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/InstVisitor.h"
@ -1885,6 +1887,24 @@ InlineResult CallAnalyzer::analyzeCall(CallSite CS) {
  if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
    return "noduplicate";
  // Loops generally act a lot like calls in that they act like barriers to
  // movement, require a certain amount of setup, etc. So when optimising for
  // size, we penalise any call sites that perform loops. We do this after all
  // other costs here, so will likely only be dealing with relatively small
  // functions (and hence DT and LI will hopefully be cheap).
  if (Caller->optForMinSize()) {
    // The dominator tree and loop info are built locally for F rather than
    // taken from an analysis manager, so their cost is paid on every call
    // site that reaches this point with a minsize caller.
    // NOTE(review): F is presumably the callee being analysed - confirm
    // against the rest of CallAnalyzer.
    DominatorTree DT(F);
    LoopInfo LI(DT);
    int NumLoops = 0;
    // Iterating a LoopInfo visits only the top-level loops, so a nest of
    // loops is counted (and penalised) once.
    for (Loop *L : LI) {
      // Ignore loops that will not be executed
      // (a loop is treated as dead when its header is in DeadBlocks;
      // presumably those blocks were proven unreachable for this call site
      // earlier in the analysis).
      if (DeadBlocks.count(L->getHeader()))
        continue;
      NumLoops++;
    }
    // Charge one CallPenalty per live top-level loop.
    Cost += NumLoops * InlineConstants::CallPenalty;
  }
  // We applied the maximum possible vector bonus at the beginning. Now,
  // subtract the excess bonus, if any, from the Threshold before
  // comparing against Cost.
--- a/llvm/test/Transforms/Inline/ARM/loop-add.ll
+++ b/llvm/test/Transforms/Inline/ARM/loop-add.ll
@ -0,0 +1,95 @@
 ; RUN: opt -inline %s -S | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv7m-arm-none-eabi"
 ; CHECK-LABEL: void @doCalls
 ; Minsize caller that exercises the loop penalty from several angles:
 ;   - two calls to @LoopCall with a run-time trip count (expected to stay),
 ;   - one call to @LoopCall with a constant trip count of 0,
 ;   - one call to the single-use internal @LoopCall_internal,
 ;   - one call to the loop-free @SimpleCall.
 ; The last three are expected to be inlined by the directives below.
 define void @doCalls(i8* nocapture %p1, i8* nocapture %p2, i32 %n) #0 {
 entry:
  %div = lshr i32 %n, 1
 ; CHECK: call void @LoopCall
  tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div) #0
  %div2 = lshr i32 %n, 2
 ; CHECK: call void @LoopCall
  tail call void @LoopCall(i8* %p1, i8* %p2, i32 %div2) #0
 ; CHECK-NOT: call void @LoopCall
 ; With %num == 0 the entry guard in @LoopCall is always false, so the loop
 ; blocks are unreachable for this call site; presumably the loop header is
 ; then treated as dead and no loop penalty is added.
  tail call void @LoopCall(i8* %p2, i8* %p1, i32 0) #0
 ; CHECK-NOT: call void @LoopCall_internal
  tail call void @LoopCall_internal(i8* %p1, i8* %p2, i32 %div2) #0
  %div3 = lshr i32 %n, 4
 ; CHECK-NOT: call void @SimpleCall
  tail call void @SimpleCall(i8* %p2, i8* %p1, i32 %div3) #0
  ret void
 }
 ; CHECK-LABEL: define void @LoopCall
 ; Small byte loop: while the remaining count is non-zero, stores
 ; (*source + trunc(remaining count)) to *dest, advances both pointers by one
 ; byte and decrements the count.
 ; The entry guard jumps straight to while.end when %num is 0, which is what
 ; makes the loop unreachable for the constant-zero call in @doCalls.
 define void @LoopCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
 entry:
  %c = icmp ne i32 %num, 0
  br i1 %c, label %while.cond, label %while.end
 while.cond:                                       ; preds = %while.body, %entry
  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
  %cmp = icmp eq i32 %num.addr.0, 0
  br i1 %cmp, label %while.end, label %while.body
 while.body:                                       ; preds = %while.cond
  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
  %0 = load i8, i8* %p_source.0, align 1
  %1 = trunc i32 %num.addr.0 to i8
  %conv1 = add i8 %0, %1
  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
  store i8 %conv1, i8* %p_dest.0, align 1
  %dec = add i32 %num.addr.0, -1
  br label %while.cond
 while.end:                                        ; preds = %while.cond
  ret void
 }
 ; CHECK-NOT: define internal void @LoopCall_internal
 ; Same body as @LoopCall, but with internal linkage and exactly one call site
 ; (in @doCalls). The directives in @doCalls expect that call to be inlined;
 ; once it is, the definition is unused and should no longer be printed, which
 ; is what the directive above verifies. The pattern spells out "internal" so
 ; that it matches the definition line as it would actually be printed.
 define internal void @LoopCall_internal(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
 entry:
  %c = icmp ne i32 %num, 0
  br i1 %c, label %while.cond, label %while.end
 while.cond:                                       ; preds = %while.body, %entry
  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr2, %while.body ]
  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
  %cmp = icmp eq i32 %num.addr.0, 0
  br i1 %cmp, label %while.end, label %while.body
 while.body:                                       ; preds = %while.cond
  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
  %0 = load i8, i8* %p_source.0, align 1
  %1 = trunc i32 %num.addr.0 to i8
  %conv1 = add i8 %0, %1
  %incdec.ptr2 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
  store i8 %conv1, i8* %p_dest.0, align 1
  %dec = add i32 %num.addr.0, -1
  br label %while.cond
 while.end:                                        ; preds = %while.cond
  ret void
 }
 ; CHECK-LABEL: define void @SimpleCall
 ; Loop-free control case: stores (source[num] xor 127) to dest[num].
 ; It contains no loop, so the minsize loop penalty cannot apply to it; the
 ; directives in @doCalls expect its call to be inlined.
 define void @SimpleCall(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
 entry:
  %arrayidx = getelementptr inbounds i8, i8* %source, i32 %num
  %0 = load i8, i8* %arrayidx, align 1
  %1 = xor i8 %0, 127
  %arrayidx2 = getelementptr inbounds i8, i8* %dest, i32 %num
  store i8 %1, i8* %arrayidx2, align 1
  ret void
 }
 attributes #0 = { minsize optsize }
--- a/llvm/test/Transforms/Inline/ARM/loop-memcpy.ll
+++ b/llvm/test/Transforms/Inline/ARM/loop-memcpy.ll
@ -0,0 +1,87 @@
 ; RUN: opt -inline %s -S | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv7m-arm-none-eabi"
 ; CHECK-LABEL: define void @matcpy
 ; Minsize caller with four call sites of the internal byte-copy loop @memcpy.
 ; Control flow:
 ;   - entry:    tests whether %dest and %source agree in their low two
 ;               address bits ((dest ^ source) & 3 == 0).
 ;   - if.else20: they differ -> copy all %num bytes with one @memcpy call.
 ;   - if.then:  %and2 = (-dest) & 3, i.e. the byte count up to the next
 ;               4-byte boundary of %dest; if (%and2 | 4) > %num, fall back
 ;               to a single @memcpy call of %num bytes (if.else).
 ;   - if.then4/if.then8: copy the %and2 leading bytes when that is non-zero.
 ;   - if.end/if.then15: skip %shr bytes (the remaining length rounded down to
 ;               a multiple of 4) and copy the %sub7 trailing bytes.
 ; The directives below expect the calls in if.then8 and if.then15 to remain
 ; calls, i.e. the loop must not be duplicated into this function.
 define void @matcpy(i8* %dest, i8* %source, i32 %num) #0 {
 entry:
  %0 = ptrtoint i8* %dest to i32
  %1 = ptrtoint i8* %source to i32
  %2 = xor i32 %0, %1
  %3 = and i32 %2, 3
  %cmp = icmp eq i32 %3, 0
  br i1 %cmp, label %if.then, label %if.else20
 if.then:                                          ; preds = %entry
  %sub = sub i32 0, %0
  %and2 = and i32 %sub, 3
  %add = or i32 %and2, 4
  %cmp3 = icmp ugt i32 %add, %num
  br i1 %cmp3, label %if.else, label %if.then4
 if.then4:                                         ; preds = %if.then
  %sub5 = sub i32 %num, %and2
  %shr = and i32 %sub5, -4
  %sub7 = sub i32 %sub5, %shr
  %tobool = icmp eq i32 %and2, 0
  br i1 %tobool, label %if.end, label %if.then8
 if.then8:                                         ; preds = %if.then4
 ; CHECK: call fastcc void @memcpy
  call fastcc void @memcpy(i8* %dest, i8* %source, i32 %and2) #0
  %add.ptr = getelementptr inbounds i8, i8* %dest, i32 %and2
  %add.ptr9 = getelementptr inbounds i8, i8* %source, i32 %and2
  br label %if.end
 if.end:                                           ; preds = %if.then4, %if.then8
  %p_dest.0 = phi i8* [ %add.ptr, %if.then8 ], [ %dest, %if.then4 ]
  %p_source.0 = phi i8* [ %add.ptr9, %if.then8 ], [ %source, %if.then4 ]
  %tobool14 = icmp eq i32 %sub7, 0
  br i1 %tobool14, label %if.end22, label %if.then15
 if.then15:                                        ; preds = %if.end
  %add.ptr13 = getelementptr inbounds i8, i8* %p_source.0, i32 %shr
  %add.ptr11 = getelementptr inbounds i8, i8* %p_dest.0, i32 %shr
 ; CHECK: call fastcc void @memcpy
  call fastcc void @memcpy(i8* %add.ptr11, i8* %add.ptr13, i32 %sub7) #0
  br label %if.end22
 if.else:                                          ; preds = %if.then
  call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0
  br label %if.end22
 if.else20:                                        ; preds = %entry
  call fastcc void @memcpy(i8* %dest, i8* %source, i32 %num) #0
  br label %if.end22
 if.end22:                                         ; preds = %if.then15, %if.end, %if.else, %if.else20
  ret void
 }
 ; CHECK-LABEL: define internal void @memcpy
 ; Internal byte-at-a-time copy loop: copies %num bytes from %source to %dest,
 ; counting %num down to zero. Unlike @LoopCall in loop-add.ll there is no
 ; guard in the entry block; the zero test happens in the loop header.
 ; NOTE(review): every call site in @matcpy uses the fastcc calling convention
 ; while this definition uses the default one - confirm the mismatch is
 ; intentional for this test.
 define internal void @memcpy(i8* nocapture %dest, i8* nocapture readonly %source, i32 %num) #0 {
 entry:
  br label %while.cond
 while.cond:                                       ; preds = %while.body, %entry
  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %while.body ]
  %p_dest.0 = phi i8* [ %dest, %entry ], [ %incdec.ptr1, %while.body ]
  %p_source.0 = phi i8* [ %source, %entry ], [ %incdec.ptr, %while.body ]
  %cmp = icmp eq i32 %num.addr.0, 0
  br i1 %cmp, label %while.end, label %while.body
 while.body:                                       ; preds = %while.cond
  %incdec.ptr = getelementptr inbounds i8, i8* %p_source.0, i32 1
  %0 = load i8, i8* %p_source.0, align 1
  %incdec.ptr1 = getelementptr inbounds i8, i8* %p_dest.0, i32 1
  store i8 %0, i8* %p_dest.0, align 1
  %dec = add i32 %num.addr.0, -1
  br label %while.cond
 while.end:                                        ; preds = %while.cond
  ret void
 }
 attributes #0 = { minsize optsize }
--- a/llvm/test/Transforms/Inline/ARM/loop-noinline.ll
+++ b/llvm/test/Transforms/Inline/ARM/loop-noinline.ll
@ -0,0 +1,49 @@
 ; RUN: opt -inline %s -S | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv7m-arm-none-eabi"
 ; Check we don't inline loops at -Oz. They tend to be larger than we
 ; expect.
 ; CHECK: define i8* @H
@digits = constant [16 x i8] c"0123456789ABCDEF", align 1
 ; Hex-digit writer used as the callee that must not be inlined at minsize.
 ; Each iteration of the do-while loop takes the top four bits of the current
 ; value (lshr by 28), looks the digit up in @digits, stores it through the
 ; running pointer, shifts the value left by four and decrements the count;
 ; the loop ends when the count reaches zero.
 ; Returns %p advanced by the original %num.
 define i8* @H(i8* %p, i32 %val, i32 %num) #0 {
 entry:
  br label %do.body
 do.body:                                          ; preds = %do.body, %entry
  %p.addr.0 = phi i8* [ %p, %entry ], [ %incdec.ptr, %do.body ]
  %val.addr.0 = phi i32 [ %val, %entry ], [ %shl, %do.body ]
  %num.addr.0 = phi i32 [ %num, %entry ], [ %dec, %do.body ]
  %shr = lshr i32 %val.addr.0, 28
  %arrayidx = getelementptr inbounds [16 x i8], [16 x i8]* @digits, i32 0, i32 %shr
  %0 = load i8, i8* %arrayidx, align 1
  %incdec.ptr = getelementptr inbounds i8, i8* %p.addr.0, i32 1
  store i8 %0, i8* %p.addr.0, align 1
  %shl = shl i32 %val.addr.0, 4
  %dec = add i32 %num.addr.0, -1
  %tobool = icmp eq i32 %dec, 0
  br i1 %tobool, label %do.end, label %do.body
 do.end:                                           ; preds = %do.body
  %scevgep = getelementptr i8, i8* %p, i32 %num
  ret i8* %scevgep
 }
 ; Minsize wrapper that forwards all three arguments to @H unchanged.
 ; The directive below expects the tail call to remain, i.e. @H is not inlined
 ; when the trip count is a run-time value.
 define nonnull i8* @call1(i8* %p, i32 %val, i32 %num) #0 {
 entry:
 ; CHECK: tail call i8* @H
  %call = tail call i8* @H(i8* %p, i32 %val, i32 %num) #0
  ret i8* %call
 }
 ; Minsize wrapper that calls @H with a constant count of 32.
 ; The loop in @H is still reachable with this constant, and the directive
 ; below expects the tail call to remain even though the count is known.
 define nonnull i8* @call2(i8* %p, i32 %val) #0 {
 entry:
 ; CHECK: tail call i8* @H
  %call = tail call i8* @H(i8* %p, i32 %val, i32 32) #0
  ret i8* %call
 }
 attributes #0 = { minsize optsize }