From 08556afc54e7ddfa7cc2fdd69c615ad417722517 Mon Sep 17 00:00:00 2001
From: Nikita Popov <nikita.ppv@gmail.com>
Date: Fri, 20 Mar 2020 11:57:20 +0100
Subject: [PATCH] [InstSimplify] Remove known bits constant folding

If SimplifyInstruction() does not succeed in simplifying the
instruction, it will compute the known bits of the instruction
in the hope that all bits are known and the instruction can be
folded to a constant. I have removed a similar optimization
from InstCombine in D75801, and would like to drop this one as well.

On average, we spend ~1% of total compile-time performing this
known bits calculation. However, if we introduce some additional
statistics for known bits computations and how many of them succeed
in simplifying the instruction, we get (on test-suite):

    instsimplify.NumKnownBits: 216
    instsimplify.NumKnownBitsComputed: 13828375
    valuetracking.NumKnownBitsComputed: 45860806

Out of ~14M known bits calculations (accounting for approximately
one third of all known bits calculations), only 0.0015% succeed in
producing a constant. Those cases where we do succeed in computing
all known bits will get folded by other passes like InstCombine
later. On test-suite, only lencod.test and GCC-C-execute-pr44858.test
show a hash difference after this change. On lencod we see an
improvement (a loop phi is optimized away), on the GCC torture
test a regression (a function return value is determined only
after IPSCCP, preventing propagation from a noinline function.)

There are various regressions in InstSimplify tests. However, all
of these cases are already handled by InstCombine, and corresponding
tests have already been added there.

Differential Revision: https://reviews.llvm.org/D79294
---
 llvm/lib/Analysis/InstructionSimplify.cpp     | 11 ---
 .../Analysis/ValueTracking/knownzero-shift.ll | 26 ++++--
 llvm/test/Transforms/GVN/PRE/volatile.ll      |  9 +-
 llvm/test/Transforms/InstSimplify/assume.ll   | 93 -------------------
 llvm/test/Transforms/InstSimplify/call.ll     |  2 +-
 llvm/test/Transforms/InstSimplify/or.ll       | 16 +++-
 .../InstSimplify/shift-knownbits.ll           | 29 ++++--
 7 files changed, 63 insertions(+), 123 deletions(-)
 delete mode 100644 llvm/test/Transforms/InstSimplify/assume.ll
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 7de4a0744c29..ef7f35c90861 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -5600,9 +5600,6 @@ Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ,
     break;
   case Instruction::Call: {
     Result = SimplifyCall(cast<CallInst>(I), Q);
-    // Don't perform known bits simplification below for musttail calls.
-    if (cast<CallInst>(I)->isMustTailCall())
-      return Result;
     break;
   }
   case Instruction::Freeze:
@@ -5620,14 +5617,6 @@ Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ,
     break;
   }
 
-  // In general, it is possible for computeKnownBits to determine all bits in a
-  // value even when the operands are not all constants.
-  if (!Result && I->getType()->isIntOrIntVectorTy()) {
-    KnownBits Known = computeKnownBits(I, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE);
-    if (Known.isConstant())
-      Result = ConstantInt::get(I->getType(), Known.getConstant());
-  }
-
   /// If called on unreachable code, the above logic may report that the
   /// instruction simplified to itself.  Make life easier for users by
   /// detecting that case here, returning a safe value instead.
diff --git a/llvm/test/Analysis/ValueTracking/knownzero-shift.ll b/llvm/test/Analysis/ValueTracking/knownzero-shift.ll
index 4ceb822afa18..0dcd828aa33f 100644
--- a/llvm/test/Analysis/ValueTracking/knownzero-shift.ll
+++ b/llvm/test/Analysis/ValueTracking/knownzero-shift.ll
@@ -15,9 +15,15 @@ define i1 @test(i8 %p, i8* %pq) {
 
 !0 = !{ i8 1, i8 5 }
 
+; The following cases only get folded by InstCombine,
+; see InstCombine/shift-shift.ll. If we wanted to,
+; we could explicitly handle them in InstSimplify as well.
+
 define i32 @shl_shl(i32 %A) {
 ; CHECK-LABEL: @shl_shl(
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    [[B:%.*]] = shl i32 [[A:%.*]], 6
+; CHECK-NEXT:    [[C:%.*]] = shl i32 [[B]], 28
+; CHECK-NEXT:    ret i32 [[C]]
 ;
   %B = shl i32 %A, 6
   %C = shl i32 %B, 28
@@ -26,7 +32,9 @@ define i32 @shl_shl(i32 %A) {
 
 define <2 x i33> @shl_shl_splat_vec(<2 x i33> %A) {
 ; CHECK-LABEL: @shl_shl_splat_vec(
-; CHECK-NEXT:    ret <2 x i33> zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = shl <2 x i33> [[A:%.*]], <i33 5, i33 5>
+; CHECK-NEXT:    [[C:%.*]] = shl <2 x i33> [[B]], <i33 28, i33 28>
+; CHECK-NEXT:    ret <2 x i33> [[C]]
 ;
   %B = shl <2 x i33> %A, <i33 5, i33 5>
   %C = shl <2 x i33> %B, <i33 28, i33 28>
@@ -37,7 +45,7 @@ define <2 x i33> @shl_shl_splat_vec(<2 x i33> %A) {
 
 define <2 x i33> @shl_shl_vec(<2 x i33> %A) {
 ; CHECK-LABEL: @shl_shl_vec(
-; CHECK-NEXT:    [[B:%.*]] = shl <2 x i33> %A, <i33 6, i33 5>
+; CHECK-NEXT:    [[B:%.*]] = shl <2 x i33> [[A:%.*]], <i33 6, i33 5>
 ; CHECK-NEXT:    [[C:%.*]] = shl <2 x i33> [[B]], <i33 27, i33 28>
 ; CHECK-NEXT:    ret <2 x i33> [[C]]
 ;
@@ -48,7 +56,9 @@ define <2 x i33> @shl_shl_vec(<2 x i33> %A) {
 
 define i232 @lshr_lshr(i232 %A) {
 ; CHECK-LABEL: @lshr_lshr(
-; CHECK-NEXT:    ret i232 0
+; CHECK-NEXT:    [[B:%.*]] = lshr i232 [[A:%.*]], 231
+; CHECK-NEXT:    [[C:%.*]] = lshr i232 [[B]], 1
+; CHECK-NEXT:    ret i232 [[C]]
 ;
   %B = lshr i232 %A, 231
   %C = lshr i232 %B, 1
@@ -57,7 +67,9 @@ define i232 @lshr_lshr(i232 %A) {
 
 define <2 x i32> @lshr_lshr_splat_vec(<2 x i32> %A) {
 ; CHECK-LABEL: @lshr_lshr_splat_vec(
-; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = lshr <2 x i32> [[A:%.*]], <i32 28, i32 28>
+; CHECK-NEXT:    [[C:%.*]] = lshr <2 x i32> [[B]], <i32 4, i32 4>
+; CHECK-NEXT:    ret <2 x i32> [[C]]
 ;
   %B = lshr <2 x i32> %A, <i32 28, i32 28>
   %C = lshr <2 x i32> %B, <i32 4, i32 4>
@@ -66,7 +78,9 @@ define <2 x i32> @lshr_lshr_splat_vec(<2 x i32> %A) {
 
 define <2 x i32> @lshr_lshr_vec(<2 x i32> %A) {
 ; CHECK-LABEL: @lshr_lshr_vec(
-; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[B:%.*]] = lshr <2 x i32> [[A:%.*]], <i32 29, i32 28>
+; CHECK-NEXT:    [[C:%.*]] = lshr <2 x i32> [[B]], <i32 4, i32 5>
+; CHECK-NEXT:    ret <2 x i32> [[C]]
 ;
   %B = lshr <2 x i32> %A, <i32 29, i32 28>
   %C = lshr <2 x i32> %B, <i32 4, i32 5>
diff --git a/llvm/test/Transforms/GVN/PRE/volatile.ll b/llvm/test/Transforms/GVN/PRE/volatile.ll
index 552f8dce7833..6fd1b096363d 100644
--- a/llvm/test/Transforms/GVN/PRE/volatile.ll
+++ b/llvm/test/Transforms/GVN/PRE/volatile.ll
@@ -197,14 +197,17 @@ exit:
   ret i32 %add
 }
 
+; This test checks that we don't optimize away instructions that are
+; simplified by SimplifyInstruction(), but are not trivially dead.
+
 define i32 @test9(i32* %V) {
 ; CHECK-LABEL: @test9(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[LOAD:%.*]] = load volatile i32, i32* [[V:%.*]], !range !0
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    [[LOAD:%.*]] = call i32 undef()
+; CHECK-NEXT:    ret i32 undef
 ;
 entry:
-  %load = load volatile i32, i32* %V, !range !0
+  %load = call i32 undef()
   ret i32 %load
 }
 
diff --git a/llvm/test/Transforms/InstSimplify/assume.ll b/llvm/test/Transforms/InstSimplify/assume.ll
deleted file mode 100644
index a43f90adee37..000000000000
--- a/llvm/test/Transforms/InstSimplify/assume.ll
+++ /dev/null
@@ -1,93 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -instsimplify -S < %s 2>&1 -pass-remarks-analysis=.* | FileCheck %s
-
-; Verify that warnings are emitted for the 2nd and 3rd tests.
-
-; CHECK: remark: /tmp/s.c:1:13: Detected conflicting code assumptions.
-; CHECK: remark: /tmp/s.c:4:10: Detected conflicting code assumptions.
-; CHECK: remark: /tmp/s.c:5:50: Detected conflicting code assumptions.
-
-define void @test1() {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    ret void
-;
-  call void @llvm.assume(i1 1)
-  ret void
-
-}
-
-; The alloca guarantees that the low bits of %a are zero because of alignment.
-; The assume says the opposite. The assume is processed last, so that's the
-; return value. There's no way to win (we can't undo transforms that happened
-; based on half-truths), so just don't crash.
-
-define i64 @PR31809() !dbg !7 {
-; CHECK-LABEL: @PR31809(
-; CHECK-NEXT:    ret i64 3
-;
-  %a = alloca i32
-  %t1 = ptrtoint i32* %a to i64, !dbg !9
-  %cond = icmp eq i64 %t1, 3
-  call void @llvm.assume(i1 %cond)
-  ret i64 %t1
-}
-
-; Similar to above: there's no way to know which assumption is truthful,
-; so just don't crash.
-
-define i8 @conflicting_assumptions(i8 %x) !dbg !10 {
-; CHECK-LABEL: @conflicting_assumptions(
-; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[X:%.*]], 1, !dbg !10
-; CHECK-NEXT:    call void @llvm.assume(i1 false)
-; CHECK-NEXT:    [[COND2:%.*]] = icmp eq i8 [[X]], 4
-; CHECK-NEXT:    call void @llvm.assume(i1 [[COND2]])
-; CHECK-NEXT:    ret i8 [[ADD]]
-;
-  %add = add i8 %x, 1, !dbg !11
-  %cond1 = icmp eq i8 %x, 3
-  call void @llvm.assume(i1 %cond1)
-  %cond2 = icmp eq i8 %x, 4
-  call void @llvm.assume(i1 %cond2)
-  ret i8 %add
-}
-
-; Another case of conflicting assumptions. This would crash because we'd
-; try to set more known bits than existed in the known bits struct.
-
-define void @PR36270(i32 %b) !dbg !13 {
-; CHECK-LABEL: @PR36270(
-; CHECK-NEXT:    tail call void @llvm.assume(i1 false)
-; CHECK-NEXT:    unreachable
-;
-  %B7 = xor i32 -1, 2147483647
-  %and1 = and i32 %b, 3
-  %B12 = lshr i32 %B7, %and1, !dbg !14
-  %C1 = icmp ult i32 %and1, %B12
-  tail call void @llvm.assume(i1 %C1)
-  %cmp2 = icmp eq i32 0, %B12
-  tail call void @llvm.assume(i1 %cmp2)
-  unreachable
-}
-
-declare void @llvm.assume(i1) nounwind
-
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!3, !4, !5}
-!llvm.ident = !{!6}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 282540) (llvm/trunk 282542)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
-!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
-!2 = !{}
-!3 = !{i32 2, !"Dwarf Version", i32 4}
-!4 = !{i32 2, !"Debug Info Version", i32 3}
-!5 = !{i32 1, !"PIC Level", i32 2}
-!6 = !{!"clang version 4.0.0 (trunk 282540) (llvm/trunk 282542)"}
-!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !2)
-!8 = !DISubroutineType(types: !2)
-!9 = !DILocation(line: 1, column: 13, scope: !7)
-!10 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0, retainedNodes: !2)
-!11 = !DILocation(line: 4, column: 10, scope: !10)
-!12 = !DILocation(line: 4, column: 3, scope: !10)
-!13 = distinct !DISubprogram(name: "PR36270", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0, retainedNodes: !2)
-!14 = !DILocation(line: 5, column: 50, scope: !13)
-
diff --git a/llvm/test/Transforms/InstSimplify/call.ll b/llvm/test/Transforms/InstSimplify/call.ll
index 07fdcdbbd5e6..9d0147b5ba52 100644
--- a/llvm/test/Transforms/InstSimplify/call.ll
+++ b/llvm/test/Transforms/InstSimplify/call.ll
@@ -988,7 +988,7 @@ declare i8* @passthru_p8(i8* returned)
 define i32 @returned_const_int_arg() {
 ; CHECK-LABEL: @returned_const_int_arg(
 ; CHECK-NEXT:    [[X:%.*]] = call i32 @passthru_i32(i32 42)
-; CHECK-NEXT:    ret i32 42
+; CHECK-NEXT:    ret i32 [[X]]
 ;
   %x = call i32 @passthru_i32(i32 42)
   ret i32 %x
diff --git a/llvm/test/Transforms/InstSimplify/or.ll b/llvm/test/Transforms/InstSimplify/or.ll
index 7369cddf2fbe..465b30c9cf94 100644
--- a/llvm/test/Transforms/InstSimplify/or.ll
+++ b/llvm/test/Transforms/InstSimplify/or.ll
@@ -98,10 +98,17 @@ define i8 @test10(i8 %A) {
   ret i8 %D
 }
 
+; The following two cases only get folded by InstCombine,
+; see InstCombine/or-xor.ll.
+
 ; (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
 define i8 @test11(i8 %A) {
 ; CHECK-LABEL: @test11(
-; CHECK-NEXT:    ret i8 -1
+; CHECK-NEXT:    [[B:%.*]] = or i8 [[A:%.*]], -2
+; CHECK-NEXT:    [[C:%.*]] = xor i8 [[B]], 13
+; CHECK-NEXT:    [[D:%.*]] = or i8 [[C]], 1
+; CHECK-NEXT:    [[E:%.*]] = xor i8 [[D]], 12
+; CHECK-NEXT:    ret i8 [[E]]
 ;
   %B = or i8 %A, -2
   %C = xor i8 %B, 13
@@ -112,7 +119,12 @@ define i8 @test11(i8 %A) {
 
 define i8 @test11v(<2 x i8> %A) {
 ; CHECK-LABEL: @test11v(
-; CHECK-NEXT:    ret i8 -1
+; CHECK-NEXT:    [[B:%.*]] = or <2 x i8> [[A:%.*]], <i8 -2, i8 0>
+; CHECK-NEXT:    [[CV:%.*]] = xor <2 x i8> [[B]], <i8 13, i8 13>
+; CHECK-NEXT:    [[C:%.*]] = extractelement <2 x i8> [[CV]], i32 0
+; CHECK-NEXT:    [[D:%.*]] = or i8 [[C]], 1
+; CHECK-NEXT:    [[E:%.*]] = xor i8 [[D]], 12
+; CHECK-NEXT:    ret i8 [[E]]
 ;
   %B = or <2 x i8> %A, <i8 -2, i8 0>
   %CV = xor <2 x i8> %B, <i8 13, i8 13>
diff --git a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
index 66e987182190..c023048fb538 100644
--- a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
+++ b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll
@@ -145,7 +145,8 @@ define i1 @shl_i1(i1 %a, i1 %b) {
   ret i1 %shl
 }
 
-; Simplify count leading/trailing zeros to zero if all valid bits are shifted out.
+; The following cases only get folded by InstCombine,
+; see InstCombine/lshr.ll.
 
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
 declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
@@ -154,7 +155,9 @@ declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone
 
 define i32 @lshr_ctlz_zero_is_undef(i32 %x) {
 ; CHECK-LABEL: @lshr_ctlz_zero_is_undef(
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    [[CT:%.*]] = call i32 @llvm.ctlz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr i32 [[CT]], 5
+; CHECK-NEXT:    ret i32 [[SH]]
 ;
   %ct = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
   %sh = lshr i32 %ct, 5
@@ -163,7 +166,9 @@ define i32 @lshr_ctlz_zero_is_undef(i32 %x) {
 
 define i32 @lshr_cttz_zero_is_undef(i32 %x) {
 ; CHECK-LABEL: @lshr_cttz_zero_is_undef(
-; CHECK-NEXT:    ret i32 0
+; CHECK-NEXT:    [[CT:%.*]] = call i32 @llvm.cttz.i32(i32 [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr i32 [[CT]], 5
+; CHECK-NEXT:    ret i32 [[SH]]
 ;
   %ct = call i32 @llvm.cttz.i32(i32 %x, i1 true)
   %sh = lshr i32 %ct, 5
@@ -172,7 +177,9 @@ define i32 @lshr_cttz_zero_is_undef(i32 %x) {
 
 define <2 x i8> @lshr_ctlz_zero_is_undef_splat_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @lshr_ctlz_zero_is_undef_splat_vec(
-; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+; CHECK-NEXT:    [[CT:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr <2 x i8> [[CT]], <i8 3, i8 3>
+; CHECK-NEXT:    ret <2 x i8> [[SH]]
 ;
   %ct = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %x, i1 true)
   %sh = lshr <2 x i8> %ct, <i8 3, i8 3>
@@ -181,7 +188,10 @@ define <2 x i8> @lshr_ctlz_zero_is_undef_splat_vec(<2 x i8> %x) {
 
 define i8 @lshr_ctlz_zero_is_undef_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @lshr_ctlz_zero_is_undef_vec(
-; CHECK-NEXT:    ret i8 0
+; CHECK-NEXT:    [[CT:%.*]] = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr <2 x i8> [[CT]], <i8 3, i8 0>
+; CHECK-NEXT:    [[EX:%.*]] = extractelement <2 x i8> [[SH]], i32 0
+; CHECK-NEXT:    ret i8 [[EX]]
 ;
   %ct = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %x, i1 true)
   %sh = lshr <2 x i8> %ct, <i8 3, i8 0>
@@ -191,7 +201,9 @@ define i8 @lshr_ctlz_zero_is_undef_vec(<2 x i8> %x) {
 
 define <2 x i8> @lshr_cttz_zero_is_undef_splat_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @lshr_cttz_zero_is_undef_splat_vec(
-; CHECK-NEXT:    ret <2 x i8> zeroinitializer
+; CHECK-NEXT:    [[CT:%.*]] = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr <2 x i8> [[CT]], <i8 3, i8 3>
+; CHECK-NEXT:    ret <2 x i8> [[SH]]
 ;
   %ct = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %x, i1 true)
   %sh = lshr <2 x i8> %ct, <i8 3, i8 3>
@@ -200,7 +212,10 @@ define <2 x i8> @lshr_cttz_zero_is_undef_splat_vec(<2 x i8> %x) {
 
 define i8 @lshr_cttz_zero_is_undef_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @lshr_cttz_zero_is_undef_vec(
-; CHECK-NEXT:    ret i8 0
+; CHECK-NEXT:    [[CT:%.*]] = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> [[X:%.*]], i1 true)
+; CHECK-NEXT:    [[SH:%.*]] = lshr <2 x i8> [[CT]], <i8 3, i8 0>
+; CHECK-NEXT:    [[EX:%.*]] = extractelement <2 x i8> [[SH]], i32 0
+; CHECK-NEXT:    ret i8 [[EX]]
 ;
   %ct = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %x, i1 true)
   %sh = lshr <2 x i8> %ct, <i8 3, i8 0>