Revert "Revert "As a follow-up to my initial mail to llvm-dev here's a first pass at the O1 described there.""

This reapplies: 8ff85ed905

Original commit message:

As a follow-up to my initial mail to llvm-dev here's a first pass at the O1 described there.

This change doesn't include any change to move from selection dag to fast isel and that will come with other numbers that should help inform that decision. There also haven't been any real debuggability studies with this pipeline yet, this is just the initial start done so that people could see it and we could start tweaking after.

Test updates: Outside of the newpm tests most of the updates are coming from either optimization passes not run anymore (and without a compelling argument at the moment) that were largely used for canonicalization in clang.

Original post: http://lists.llvm.org/pipermail/llvm-dev/2019-April/131494.html

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65410

This reverts commit c9ddb02659.
2019-11-26 20:28:52 -08:00 · 2019-11-26 20:28:52 -08:00 · fd39b1bb20
parent 82b4dc0256
commit fd39b1bb20
36 changed files with 352 additions and 349 deletions
--- a/clang/test/CodeGen/2008-07-30-implicit-initialization.c
+++ b/clang/test/CodeGen/2008-07-30-implicit-initialization.c
@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple i386-unknown-unknown -O1 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i386-unknown-unknown -O2 -emit-llvm -o - %s | FileCheck %s
 // CHECK-LABEL: define i32 @f0()
 // CHECK:   ret i32 0
 // CHECK-LABEL: define i32 @f1()
--- a/clang/test/CodeGen/arm-fp16-arguments.c
+++ b/clang/test/CodeGen/arm-fp16-arguments.c
@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fallow-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
-// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi hard -fallow-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
-// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fnative-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=NATIVE
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fallow-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi hard -fallow-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fnative-half-arguments-and-returns -emit-llvm -o - -O2 %s | FileCheck %s --check-prefix=NATIVE

 __fp16 g;

--- a/clang/test/CodeGen/arm-vfp16-arguments2.cpp
+++ b/clang/test/CodeGen/arm-vfp16-arguments2.cpp
@ -1,12 +1,12 @@
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
-// RUN:   -mfloat-abi soft -target-feature +neon -emit-llvm -o - -O1 %s \
+// RUN:   -mfloat-abi soft -target-feature +neon -emit-llvm -o - -O2 %s \
 // RUN:   | FileCheck %s --check-prefix=CHECK-SOFT
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
-// RUN:   -mfloat-abi hard -target-feature +neon -emit-llvm -o - -O1 %s \
+// RUN:   -mfloat-abi hard -target-feature +neon -emit-llvm -o - -O2 %s \
 // RUN:   | FileCheck %s --check-prefix=CHECK-HARD
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
 // RUN:   -mfloat-abi hard -target-feature +neon -target-feature +fullfp16 \
-// RUN:   -emit-llvm -o - -O1 %s \
+// RUN:   -emit-llvm -o - -O2 %s \
 // RUN:   | FileCheck %s --check-prefix=CHECK-FULL

 typedef float float32_t;
--- a/clang/test/CodeGen/atomic-ops-libcall.c
+++ b/clang/test/CodeGen/atomic-ops-libcall.c
@ -10,109 +10,109 @@ enum memory_order {

 int *test_c11_atomic_fetch_add_int_ptr(_Atomic(int *) *p) {
  // CHECK: test_c11_atomic_fetch_add_int_ptr
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 12, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 12, i32 5)
  return __c11_atomic_fetch_add(p, 3, memory_order_seq_cst);
 }

 int *test_c11_atomic_fetch_sub_int_ptr(_Atomic(int *) *p) {
  // CHECK: test_c11_atomic_fetch_sub_int_ptr
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 20, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 20, i32 5)
  return __c11_atomic_fetch_sub(p, 5, memory_order_seq_cst);
 }

 int test_c11_atomic_fetch_add_int(_Atomic(int) *p) {
  // CHECK: test_c11_atomic_fetch_add_int
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 3, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 3, i32 5)
  return __c11_atomic_fetch_add(p, 3, memory_order_seq_cst);
 }

 int test_c11_atomic_fetch_sub_int(_Atomic(int) *p) {
  // CHECK: test_c11_atomic_fetch_sub_int
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 5, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 5, i32 5)
  return __c11_atomic_fetch_sub(p, 5, memory_order_seq_cst);
 }

 int *fp2a(int **p) {
  // CHECK: @fp2a
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 4, i32 0)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 4, i32 0)
  // Note, the GNU builtins do not multiply by sizeof(T)!
  return __atomic_fetch_sub(p, 4, memory_order_relaxed);
 }

 int test_atomic_fetch_add(int *p) {
  // CHECK: test_atomic_fetch_add
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  return __atomic_fetch_add(p, 55, memory_order_seq_cst);
 }

 int test_atomic_fetch_sub(int *p) {
  // CHECK: test_atomic_fetch_sub
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  return __atomic_fetch_sub(p, 55, memory_order_seq_cst);
 }

 int test_atomic_fetch_and(int *p) {
  // CHECK: test_atomic_fetch_and
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_and_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_and_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  return __atomic_fetch_and(p, 55, memory_order_seq_cst);
 }

 int test_atomic_fetch_or(int *p) {
  // CHECK: test_atomic_fetch_or
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_or_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_or_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  return __atomic_fetch_or(p, 55, memory_order_seq_cst);
 }

 int test_atomic_fetch_xor(int *p) {
  // CHECK: test_atomic_fetch_xor
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_xor_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_xor_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  return __atomic_fetch_xor(p, 55, memory_order_seq_cst);
 }

 int test_atomic_fetch_nand(int *p) {
  // CHECK: test_atomic_fetch_nand
-  // CHECK: {{%[^ ]*}} = tail call i32 @__atomic_fetch_nand_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: {{%[^ ]*}} = call i32 @__atomic_fetch_nand_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  return __atomic_fetch_nand(p, 55, memory_order_seq_cst);
 }

 int test_atomic_add_fetch(int *p) {
  // CHECK: test_atomic_add_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_add_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  // CHECK: {{%[^ ]*}} = add i32 [[CALL]], 55
  return __atomic_add_fetch(p, 55, memory_order_seq_cst);
 }

 int test_atomic_sub_fetch(int *p) {
  // CHECK: test_atomic_sub_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_sub_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  // CHECK: {{%[^ ]*}} = add i32 [[CALL]], -55
  return __atomic_sub_fetch(p, 55, memory_order_seq_cst);
 }

 int test_atomic_and_fetch(int *p) {
  // CHECK: test_atomic_and_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_and_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_and_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  // CHECK: {{%[^ ]*}} = and i32 [[CALL]], 55
  return __atomic_and_fetch(p, 55, memory_order_seq_cst);
 }

 int test_atomic_or_fetch(int *p) {
  // CHECK: test_atomic_or_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_or_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_or_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  // CHECK: {{%[^ ]*}} = or i32 [[CALL]], 55
  return __atomic_or_fetch(p, 55, memory_order_seq_cst);
 }

 int test_atomic_xor_fetch(int *p) {
  // CHECK: test_atomic_xor_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_xor_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_xor_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  // CHECK: {{%[^ ]*}} = xor i32 [[CALL]], 55
  return __atomic_xor_fetch(p, 55, memory_order_seq_cst);
 }

 int test_atomic_nand_fetch(int *p) {
  // CHECK: test_atomic_nand_fetch
-  // CHECK: [[CALL:%[^ ]*]] = tail call i32 @__atomic_fetch_nand_4(i8* {{%[0-9]+}}, i32 55, i32 5)
+  // CHECK: [[CALL:%[^ ]*]] = call i32 @__atomic_fetch_nand_4(i8* {{%[0-9]+}}, i32 55, i32 5)
  // FIXME: We should not be checking optimized IR. It changes independently of clang.
  // FIXME-CHECK: [[AND:%[^ ]*]] = and i32 [[CALL]], 55
  // FIXME-CHECK: {{%[^ ]*}} = xor i32 [[AND]], -1
--- a/clang/test/CodeGenCXX/atomicinit.cpp
+++ b/clang/test/CodeGenCXX/atomicinit.cpp
@ -31,7 +31,7 @@ _Atomic(B) b;
 // CHECK-LABEL: define void @_Z11atomic_initR1Ai
 void atomic_init(A& a, int i) {
  // CHECK-NOT: atomic
-  // CHECK: tail call void @_ZN1BC1Ei
+  // CHECK: call void @_ZN1BC1Ei
  __c11_atomic_init(&b, B(i));
  // CHECK-NEXT: ret void
 }
--- a/clang/test/CodeGenCXX/auto-var-init.cpp
+++ b/clang/test/CodeGenCXX/auto-var-init.cpp
@ -645,7 +645,7 @@ TEST_UNINIT(smallpartinit, smallpartinit);
 // ZERO-LABEL: @test_smallpartinit_uninit()
 // ZERO-O0: call void @llvm.memset{{.*}}, i8 0,
 // ZERO-O1-LEGACY: store i16 0, i16* %uninit, align 2
-// ZERO-O1-NEWPM: store i16 42, i16* %uninit, align 2
+// ZERO-O1-NEWPM: store i16 0, i16* %uninit, align 2

 TEST_BRACES(smallpartinit, smallpartinit);
 // CHECK-LABEL: @test_smallpartinit_braces()
@ -718,7 +718,7 @@ TEST_UNINIT(paddednullinit, paddednullinit);
 // PATTERN-LABEL: @test_paddednullinit_uninit()
 // PATTERN-O0: call void @llvm.memcpy{{.*}} @__const.test_paddednullinit_uninit.uninit
 // PATTERN-O1-LEGACY: store i64 [[I64]], i64* %uninit, align 8
-// PATTERN-O1-NEWPM: store i64 2863311360, i64* %uninit, align 8
+// PATTERN-O1-NEWPM: store i64 [[I64]], i64* %uninit, align 8
 // ZERO-LABEL: @test_paddednullinit_uninit()
 // ZERO-O0: call void @llvm.memset{{.*}}, i8 0,
 // ZERO-O1: store i64 0, i64* %uninit, align 8
@ -1344,10 +1344,7 @@ TEST_UNINIT(virtualderived, virtualderived);
 // ZERO-LABEL: @test_virtualderived_uninit()
 // ZERO-O0: call void @llvm.memset{{.*}}, i8 0,
 // ZERO-O1-LEGACY: call void @llvm.memset{{.*}}, i8 0,
-// ZERO-O1-NEWPM: [[FIELD1:%.*]] = getelementptr inbounds %struct.virtualderived, %struct.virtualderived* %uninit, i64 0, i32 1, i32 0, i32 0
-// ZERO-O1-NEWPM: [[FIELD0:%.*]] = getelementptr inbounds %struct.virtualderived, %struct.virtualderived* %uninit, i64 0, i32 0, i32 0
-// ZERO-O1-NEWPM: store i32 (...)** bitcast (i8** getelementptr inbounds ({ [7 x i8*], [5 x i8*] }, { [7 x i8*], [5 x i8*] }* @_ZTV14virtualderived, i64 0, inrange i32 0, i64 5) to i32 (...)**), i32 (...)*** [[FIELD0]], align 8
-// ZERO-O1-NEWPM: store i32 (...)** bitcast (i8** getelementptr inbounds ({ [7 x i8*], [5 x i8*] }, { [7 x i8*], [5 x i8*] }* @_ZTV14virtualderived, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** [[FIELD1]], align 8
+// ZERO-O1-NEWPM: call void @llvm.memset{{.*}}, i8 0,

 TEST_BRACES(virtualderived, virtualderived);
 // CHECK-LABEL: @test_virtualderived_braces()
--- a/clang/test/CodeGenCXX/discard-name-values.cpp
+++ b/clang/test/CodeGenCXX/discard-name-values.cpp
@ -11,11 +11,11 @@ bool test(bool pred) {

  if (pred) {
    // DISCARDVALUE: 2:
-    // DISCARDVALUE-NEXT: tail call void @branch()
+    // DISCARDVALUE-NEXT: call void @branch()
    // DISCARDVALUE-NEXT: br label %3

    // CHECK: if.then:
-    // CHECK-NEXT: tail call void @branch()
+    // CHECK-NEXT: call void @branch()
    // CHECK-NEXT: br label %if.end
    branch();
  }
--- a/clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
@ -13,7 +13,7 @@ T* test0() { return dynamic_cast<T*>((B*)0); }
 T* test1(V* x) { return &dynamic_cast<T&>(*x); }
 // CHECK-LABEL: define dso_local %struct.T* @"?test1@@YAPAUT@@PAUV@@@Z"(%struct.V* %x)
 // CHECK:        [[CAST:%.*]] = bitcast %struct.V* %x to i8*
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[CAST]], i32 0, i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUV@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* [[CAST]], i32 0, i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUV@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   ret %struct.T* [[RET]]

@ -25,7 +25,7 @@ T* test2(A* x) { return &dynamic_cast<T&>(*x); }
 // CHECK-NEXT:   [[VBOFFP:%.*]] = getelementptr inbounds i32, i32* [[VBTBL]], i32 1
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[CAST]], i32 [[VBOFFS]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[VBOFFS]], i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUA@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[VBOFFS]], i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUA@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   ret %struct.T* [[RET]]

@ -39,14 +39,14 @@ T* test3(B* x) { return &dynamic_cast<T&>(*x); }
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[DELTA:%.*]] = add nsw i32 [[VBOFFS]], 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[VOIDP]], i32 [[DELTA]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUB@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUB@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 1)
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   ret %struct.T* [[RET]]

 T* test4(V* x) { return dynamic_cast<T*>(x); }
 // CHECK-LABEL: define dso_local %struct.T* @"?test4@@YAPAUT@@PAUV@@@Z"(%struct.V* %x)
 // CHECK:        [[CAST:%.*]] = bitcast %struct.V* %x to i8*
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[CAST]], i32 0, i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUV@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* [[CAST]], i32 0, i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUV@@@8" to i8*), i8* bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   ret %struct.T* [[RET]]

@ -60,7 +60,7 @@ T* test5(A* x) { return dynamic_cast<T*>(x); }
 // CHECK-NEXT:   [[VBOFFP:%.*]] = getelementptr inbounds i32, i32* [[VBTBL]], i32 1
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[VOIDP]], i32 [[VBOFFS]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* nonnull [[ADJ]], i32 [[VBOFFS]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUA@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* nonnull [[ADJ]], i32 [[VBOFFS]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUA@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
 // CHECK-NEXT:   [[RES:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi %struct.T*
@ -78,7 +78,7 @@ T* test6(B* x) { return dynamic_cast<T*>(x); }
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[DELTA:%.*]] = add nsw i32 [[VBOFFS]], 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[CAST]], i32 [[DELTA]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUB@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUB@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"??_R0?AUT@@@8" to i8*), i32 0)
 // CHECK-NEXT:   [[RES:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi %struct.T*
@ -87,7 +87,7 @@ T* test6(B* x) { return dynamic_cast<T*>(x); }
 void* test7(V* x) { return dynamic_cast<void*>(x); }
 // CHECK-LABEL: define dso_local i8* @"?test7@@YAPAXPAUV@@@Z"(%struct.V* %x)
 // CHECK:        [[CAST:%.*]] = bitcast %struct.V* %x to i8*
-// CHECK-NEXT:   [[RET:%.*]] = tail call i8* @__RTCastToVoid(i8* [[CAST]])
+// CHECK-NEXT:   [[RET:%.*]] = call i8* @__RTCastToVoid(i8* [[CAST]])
 // CHECK-NEXT:   ret i8* [[RET]]

 void* test8(A* x) { return dynamic_cast<void*>(x); }
@ -100,7 +100,7 @@ void* test8(A* x) { return dynamic_cast<void*>(x); }
 // CHECK-NEXT:   [[VBOFFP:%.*]] = getelementptr inbounds i32, i32* [[VBTBL]], i32 1
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[VOIDP]], i32 [[VBOFFS]]
-// CHECK-NEXT:   [[RES:%.*]] = tail call i8* @__RTCastToVoid(i8* nonnull [[ADJ]])
+// CHECK-NEXT:   [[RES:%.*]] = call i8* @__RTCastToVoid(i8* nonnull [[ADJ]])
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi i8*
 // CHECK-NEXT:   ret i8* [[RET]]
@ -117,7 +117,7 @@ void* test9(B* x) { return dynamic_cast<void*>(x); }
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[DELTA:%.*]] = add nsw i32 [[VBOFFS]], 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[CAST]], i32 [[DELTA]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTCastToVoid(i8* [[ADJ]])
+// CHECK-NEXT:   [[CALL:%.*]] = call i8* @__RTCastToVoid(i8* [[ADJ]])
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi i8*
 // CHECK-NEXT:   ret i8* [[RET]]
--- a/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp
+++ b/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp
@ -25,10 +25,10 @@ const std::type_info* test2_typeid() { return &typeid(&a); }

 const std::type_info* test3_typeid() { return &typeid(*fn()); }
 // CHECK-LABEL: define dso_local %struct.type_info* @"?test3_typeid@@YAPBUtype_info@@XZ"()
-// CHECK:        [[CALL:%.*]] = tail call %struct.A* @"?fn@@YAPAUA@@XZ"()
+// CHECK:        [[CALL:%.*]] = call %struct.A* @"?fn@@YAPAUA@@XZ"()
 // CHECK-NEXT:   [[CMP:%.*]] = icmp eq %struct.A* [[CALL]], null
 // CHECK-NEXT:   br i1 [[CMP]]
-// CHECK:        tail call i8* @__RTtypeid(i8* null)
+// CHECK:        call i8* @__RTtypeid(i8* null)
 // CHECK-NEXT:   unreachable
 // CHECK:        [[THIS:%.*]] = bitcast %struct.A* [[CALL]] to i8*
 // CHECK-NEXT:   [[VBTBLP:%.*]] = getelementptr %struct.A, %struct.A* [[CALL]], i32 0, i32 0
@ -36,7 +36,7 @@ const std::type_info* test3_typeid() { return &typeid(*fn()); }
 // CHECK-NEXT:   [[VBSLOT:%.*]] = getelementptr inbounds i32, i32* [[VBTBL]], i32 1
 // CHECK-NEXT:   [[VBASE_OFFS:%.*]] = load i32, i32* [[VBSLOT]], align 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[THIS]], i32 [[VBASE_OFFS]]
-// CHECK-NEXT:   [[RT:%.*]] = tail call i8* @__RTtypeid(i8* nonnull [[ADJ]])
+// CHECK-NEXT:   [[RT:%.*]] = call i8* @__RTtypeid(i8* nonnull [[ADJ]])
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[RT]] to %struct.type_info*
 // CHECK-NEXT:   ret %struct.type_info* [[RET]]

@ -46,7 +46,7 @@ const std::type_info* test4_typeid() { return &typeid(b); }

 const std::type_info* test5_typeid() { return &typeid(v); }
 // CHECK: define dso_local %struct.type_info* @"?test5_typeid@@YAPBUtype_info@@XZ"()
-// CHECK:        [[RT:%.*]] = tail call i8* @__RTtypeid(i8* bitcast (%struct.V* @"?v@@3UV@@A" to i8*))
+// CHECK:        [[RT:%.*]] = call i8* @__RTtypeid(i8* bitcast (%struct.V* @"?v@@3UV@@A" to i8*))
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[RT]] to %struct.type_info*
 // CHECK-NEXT:   ret %struct.type_info* [[RET]]

--- a/clang/test/CodeGenCXX/nrvo.cpp
+++ b/clang/test/CodeGenCXX/nrvo.cpp
@ -33,13 +33,13 @@ X test0() {
 // CHECK-LABEL: define void @_Z5test1b(
 // CHECK-EH-LABEL: define void @_Z5test1b(
 X test1(bool B) {
-  // CHECK:      tail call {{.*}} @_ZN1XC1Ev
+  // CHECK:      call {{.*}} @_ZN1XC1Ev
  // CHECK-NEXT: ret void
  X x;
  if (B)
    return (x);
  return x;
-  // CHECK-EH:      tail call {{.*}} @_ZN1XC1Ev
+  // CHECK-EH:      call {{.*}} @_ZN1XC1Ev
  // CHECK-EH-NEXT: ret void
 }

@ -130,7 +130,7 @@ X test2(bool B) {

 // CHECK-LABEL: define void @_Z5test3b
 X test3(bool B) {
-  // CHECK: tail call {{.*}} @_ZN1XC1Ev
+  // CHECK: call {{.*}} @_ZN1XC1Ev
  // CHECK-NOT: call {{.*}} @_ZN1XC1ERKS_
  // CHECK: call {{.*}} @_ZN1XC1Ev
  // CHECK: call {{.*}} @_ZN1XC1ERKS_
@ -148,14 +148,14 @@ extern "C" void exit(int) throw();
 // CHECK-LABEL: define void @_Z5test4b
 X test4(bool B) {
  {
-    // CHECK: tail call {{.*}} @_ZN1XC1Ev
+    // CHECK: call {{.*}} @_ZN1XC1Ev
    X x;
    // CHECK: br i1
    if (B)
      return x;
  }
-  // CHECK: tail call {{.*}} @_ZN1XD1Ev
-  // CHECK: tail call void @exit(i32 1)
+  // CHECK: call {{.*}} @_ZN1XD1Ev
+  // CHECK: call void @exit(i32 1)
  exit(1);
 }

@ -191,7 +191,7 @@ X test6() {

 // CHECK-LABEL: define void @_Z5test7b
 X test7(bool b) {
-  // CHECK: tail call {{.*}} @_ZN1XC1Ev
+  // CHECK: call {{.*}} @_ZN1XC1Ev
  // CHECK-NEXT: ret
  if (b) {
    X x;
@ -202,7 +202,7 @@ X test7(bool b) {

 // CHECK-LABEL: define void @_Z5test8b
 X test8(bool b) {
-  // CHECK: tail call {{.*}} @_ZN1XC1Ev
+  // CHECK: call {{.*}} @_ZN1XC1Ev
  // CHECK-NEXT: ret
  if (b) {
    X x;
@ -218,6 +218,6 @@ Y<int> test9() {
 }

 // CHECK-LABEL: define linkonce_odr void @_ZN1YIiE1fEv
-// CHECK: tail call {{.*}} @_ZN1YIiEC1Ev
+// CHECK: call {{.*}} @_ZN1YIiEC1Ev

 // CHECK-EH-03: attributes [[NR_NUW]] = { noreturn nounwind }
--- a/clang/test/CodeGenCXX/stack-reuse.cpp
+++ b/clang/test/CodeGenCXX/stack-reuse.cpp
@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple armv7-unknown-linux-gnueabihf %s -o - -emit-llvm -O1 | FileCheck %s
+// RUN: %clang_cc1 -triple armv7-unknown-linux-gnueabihf %s -o - -emit-llvm -O2 | FileCheck %s

 // Stack should be reused when possible, no need to allocate two separate slots
 // if they have disjoint lifetime.
--- a/clang/test/CodeGenCXX/wasm-args-returns.cpp
+++ b/clang/test/CodeGenCXX/wasm-args-returns.cpp
@ -19,8 +19,8 @@ test(one_field);
 // CHECK: define double @_Z7forward9one_field(double returned %{{.*}})
 //
 // CHECK: define void @_Z14test_one_fieldv()
-// CHECK: %[[call:.*]] = tail call double @_Z13def_one_fieldv()
-// CHECK: tail call void @_Z3use9one_field(double %[[call]])
+// CHECK: %[[call:.*]] = call double @_Z13def_one_fieldv()
+// CHECK: call void @_Z3use9one_field(double %[[call]])
 // CHECK: ret void
 //
 // CHECK: declare void @_Z3use9one_field(double)
@ -82,8 +82,8 @@ test(empty);
 // CHECK: define void @_Z7forward5empty()
 //
 // CHECK: define void @_Z10test_emptyv()
-// CHECK: tail call void @_Z9def_emptyv()
-// CHECK: tail call void @_Z3use5empty()
+// CHECK: call void @_Z9def_emptyv()
+// CHECK: call void @_Z3use5empty()
 // CHECK: ret void
 //
 // CHECK: declare void @_Z3use5empty()
@ -96,8 +96,8 @@ test(one_bitfield);
 // CHECK: define i32 @_Z7forward12one_bitfield(i32 returned %{{.*}})
 //
 // CHECK: define void @_Z17test_one_bitfieldv()
-// CHECK: %[[call:.*]] = tail call i32 @_Z16def_one_bitfieldv()
-// CHECK: tail call void @_Z3use12one_bitfield(i32 %[[call]])
+// CHECK: %[[call:.*]] = call i32 @_Z16def_one_bitfieldv()
+// CHECK: call void @_Z3use12one_bitfield(i32 %[[call]])
 // CHECK: ret void
 //
 // CHECK: declare void @_Z3use12one_bitfield(i32)
--- a/clang/test/CodeGenObjCXX/arc-blocks.mm
+++ b/clang/test/CodeGenObjCXX/arc-blocks.mm
@ -122,7 +122,7 @@ namespace test1 {
 // CHECK: call void @__clang_call_terminate(

 // CHECK-O1-LABEL: define linkonce_odr hidden void @__copy_helper_block_ea8_32s40r48w56c15_ZTSN5test12S0E60c15_ZTSN5test12S0E(
-// CHECK-O1: tail call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
+// CHECK-O1: call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
 // CHECK-NOEXCP: define linkonce_odr hidden void @__copy_helper_block_8_32s40r48w56c15_ZTSN5test12S0E60c15_ZTSN5test12S0E(

 // CHECK: define linkonce_odr hidden void @__destroy_helper_block_ea8_32s40r48w56c15_ZTSN5test12S0E60c15_ZTSN5test12S0E(
@ -170,8 +170,8 @@ namespace test1 {
 // CHECK: call void @__clang_call_terminate(

 // CHECK-O1-LABEL: define linkonce_odr hidden void @__destroy_helper_block_ea8_32s40r48w56c15_ZTSN5test12S0E60c15_ZTSN5test12S0E(
-// CHECK-O1: tail call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
-// CHECK-O1: tail call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
+// CHECK-O1: call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
+// CHECK-O1: call void @llvm.objc.release({{.*}}) {{.*}} !clang.imprecise_release
 // CHECK-NOEXCP: define linkonce_odr hidden void @__destroy_helper_block_8_32s40r48w56c15_ZTSN5test12S0E60c15_ZTSN5test12S0E(

 namespace {
--- a/clang/test/CodeGenObjCXX/nrvo.mm
+++ b/clang/test/CodeGenObjCXX/nrvo.mm
@ -14,7 +14,7 @@ struct X {
 // CHECK: define internal void @"\01-[NRVO getNRVO]"
 - (X)getNRVO { 
  X x;
-  // CHECK: tail call void @_ZN1XC1Ev
+  // CHECK: call void @_ZN1XC1Ev
  // CHECK-NEXT: ret void
  return x;
 }
@ -24,7 +24,7 @@ X blocksNRVO() {
  return ^{
    // CHECK-LABEL: define internal void @___Z10blocksNRVOv_block_invoke
    X x;
-    // CHECK: tail call void @_ZN1XC1Ev
+    // CHECK: call void @_ZN1XC1Ev
    // CHECK-NEXT: ret void
    return x;
  }() ;
--- a/clang/test/PCH/no-escaping-block-tail-calls.cpp
+++ b/clang/test/PCH/no-escaping-block-tail-calls.cpp
@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -x c++-header -triple x86_64-apple-darwin11 -emit-pch -O1 -fblocks -fno-escaping-block-tail-calls -o %t %S/no-escaping-block-tail-calls.h
-// RUN: %clang_cc1 -triple x86_64-apple-darwin11 -include-pch %t -emit-llvm -O1 -fblocks -fno-escaping-block-tail-calls -o - %s | FileCheck %s
+// RUN: %clang_cc1 -x c++-header -triple x86_64-apple-darwin11 -emit-pch -O2 -fblocks -fno-escaping-block-tail-calls -o %t %S/no-escaping-block-tail-calls.h
+// RUN: %clang_cc1 -triple x86_64-apple-darwin11 -include-pch %t -emit-llvm -O2 -fblocks -fno-escaping-block-tail-calls -o - %s | FileCheck %s

 // Check that -fno-escaping-block-tail-calls doesn't disable tail-call
 // optimization if the block is non-escaping.
--- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq1/Makefile
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq1/Makefile
@ -1,4 +1,4 @@
 CXX_SOURCES := main.cpp

-CXXFLAGS_EXTRAS := -g -O1 -glldb
+CXXFLAGS_EXTRAS := -g -O2 -glldb
 include Makefile.rules
--- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq2/Makefile
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq2/Makefile
@ -1,4 +1,4 @@
 CXX_SOURCES := main.cpp

-CXXFLAGS_EXTRAS := -g -O1 -glldb
+CXXFLAGS_EXTRAS := -g -O2 -glldb
 include Makefile.rules
--- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_call_site/Makefile
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_call_site/Makefile
@ -1,4 +1,4 @@
 CXX_SOURCES := main.cpp

-CXXFLAGS_EXTRAS := -g -O1 -glldb
+CXXFLAGS_EXTRAS := -g -O2 -glldb
 include Makefile.rules
--- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_paths_to_common_sink/Makefile
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_paths_to_common_sink/Makefile
@ -1,4 +1,4 @@
 CXX_SOURCES := main.cpp

-CXXFLAGS_EXTRAS := -g -O1 -glldb
+CXXFLAGS_EXTRAS := -g -O2 -glldb
 include Makefile.rules
--- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_tail_call_seq/Makefile
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_tail_call_seq/Makefile
@ -1,4 +1,4 @@
 CXX_SOURCES := main.cpp

-CXXFLAGS_EXTRAS := -g -O1 -glldb
+CXXFLAGS_EXTRAS := -g -O2 -glldb
 include Makefile.rules
--- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/inlining_and_tail_calls/Makefile
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/inlining_and_tail_calls/Makefile
@ -1,4 +1,4 @@
 CXX_SOURCES := main.cpp

-CXXFLAGS_EXTRAS := -g -O1 -glldb
+CXXFLAGS_EXTRAS := -g -O2 -glldb
 include Makefile.rules
--- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/sbapi_support/Makefile
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/sbapi_support/Makefile
@ -1,4 +1,4 @@
 CXX_SOURCES := main.cpp

-CXXFLAGS_EXTRAS := -g -O1 -glldb
+CXXFLAGS_EXTRAS := -g -O2 -glldb
 include Makefile.rules
--- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_message/Makefile
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_message/Makefile
@ -1,4 +1,4 @@
 CXX_SOURCES := main.cpp

-CXXFLAGS_EXTRAS := -g -O1 -glldb
+CXXFLAGS_EXTRAS := -g -O2 -glldb
 include Makefile.rules
--- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_or_return/Makefile
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_or_return/Makefile
@ -1,4 +1,4 @@
 CXX_SOURCES := main.cpp

-CXXFLAGS_EXTRAS := -g -O1 -glldb
+CXXFLAGS_EXTRAS := -g -O2 -glldb
 include Makefile.rules
--- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/unambiguous_sequence/Makefile
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/unambiguous_sequence/Makefile
@ -1,4 +1,4 @@
 CXX_SOURCES := main.cpp

-CXXFLAGS_EXTRAS := -g -O1 -glldb
+CXXFLAGS_EXTRAS := -g -O2 -glldb
 include Makefile.rules
--- a/llvm/include/llvm/Passes/PassBuilder.h
+++ b/llvm/include/llvm/Passes/PassBuilder.h
@ -151,10 +151,6 @@ public:

    /// Optimize quickly without destroying debuggability.
    ///
-    /// FIXME: The current and historical behavior of this level does *not*
-    /// agree with this goal, but we would like to move toward this goal in the
-    /// future.
-    ///
    /// This level is tuned to produce a result from the optimizer as quickly
    /// as possible and to avoid destroying debuggability. This tends to result
    /// in a very good development mode where the compiled code will be
@ -164,9 +160,9 @@ public:
    /// debugging of the resulting binary.
    ///
    /// As an example, complex loop transformations such as versioning,
-    /// vectorization, or fusion might not make sense here due to the degree to
-    /// which the executed code would differ from the source code, and the
-    /// potential compile time cost.
+    /// vectorization, or fusion don't make sense here due to the degree to
+    /// which the executed code differs from the source code, and the
+    /// compile-time cost.
    O1,

    /// Optimize for fast execution as much as possible without triggering
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@ -400,21 +400,25 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
  FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));

  // Hoisting of scalars and load expressions.
-  if (EnableGVNHoist)
-    FPM.addPass(GVNHoistPass());
+  if (Level > O1) {
+    if (EnableGVNHoist)
+      FPM.addPass(GVNHoistPass());

-  // Global value numbering based sinking.
-  if (EnableGVNSink) {
-    FPM.addPass(GVNSinkPass());
-    FPM.addPass(SimplifyCFGPass());
+    // Global value numbering based sinking.
+    if (EnableGVNSink) {
+      FPM.addPass(GVNSinkPass());
+      FPM.addPass(SimplifyCFGPass());
+    }
  }

  // Speculative execution if the target has divergent branches; otherwise nop.
-  FPM.addPass(SpeculativeExecutionPass());
+  if (Level > O1) {
+    FPM.addPass(SpeculativeExecutionPass());

-  // Optimize based on known information about branches, and cleanup afterward.
-  FPM.addPass(JumpThreadingPass());
-  FPM.addPass(CorrelatedValuePropagationPass());
+    // Optimize based on known information about branches, and cleanup afterward.
+    FPM.addPass(JumpThreadingPass());
+    FPM.addPass(CorrelatedValuePropagationPass());
+  }
  FPM.addPass(SimplifyCFGPass());
  if (Level == O3)
    FPM.addPass(AggressiveInstCombinePass());
@ -428,10 +432,12 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
  // For PGO use pipeline, try to optimize memory intrinsics such as memcpy
  // using the size value profile. Don't perform this when optimizing for size.
  if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse &&
-      !isOptimizingForSize(Level))
+      !isOptimizingForSize(Level) && Level > O1)
    FPM.addPass(PGOMemOPSizeOpt());

-  FPM.addPass(TailCallElimPass());
+  // TODO: Investigate the cost/benefit of tail call elimination on debugging.
+  if (Level > O1)
+    FPM.addPass(TailCallElimPass());
  FPM.addPass(SimplifyCFGPass());

  // Form canonically associated expression trees, and simplify the trees using
@ -458,6 +464,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,

  // Rotate Loop - disable header duplication at -Oz
  LPM1.addPass(LoopRotatePass(Level != Oz));
+  // TODO: Investigate promotion cap for O1.
  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
  LPM1.addPass(SimpleLoopUnswitchPass());
  LPM2.addPass(IndVarSimplifyPass());
@ -525,18 +532,21 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,

  // Re-consider control flow based optimizations after redundancy elimination,
  // redo DCE, etc.
-  FPM.addPass(JumpThreadingPass());
-  FPM.addPass(CorrelatedValuePropagationPass());
-  FPM.addPass(DSEPass());
-  FPM.addPass(createFunctionToLoopPassAdaptor(
-      LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
-      EnableMSSALoopDependency, DebugLogging));
+  if (Level > O1) {
+    FPM.addPass(JumpThreadingPass());
+    FPM.addPass(CorrelatedValuePropagationPass());
+    FPM.addPass(DSEPass());
+    FPM.addPass(createFunctionToLoopPassAdaptor(
+        LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
+        EnableMSSALoopDependency, DebugLogging));
+  }

  for (auto &C : ScalarOptimizerLateEPCallbacks)
    C(FPM, Level);

  // Finally, do an expensive DCE pass to catch all the dead code exposed by
  // the simplifications and basic cleanup after all the simplifications.
+  // TODO: Investigate if this is too expensive at O1.
  FPM.addPass(ADCEPass());
  FPM.addPass(SimplifyCFGPass());
  FPM.addPass(InstCombinePass());
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@ -320,19 +320,26 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
    legacy::PassManagerBase &MPM) {
  // Start of function pass.
  // Break up aggregate allocas, using SSAUpdater.
+  assert(OptLevel >= 1 && "Calling function optimizer with no optimization level!");
  MPM.add(createSROAPass());
  MPM.add(createEarlyCSEPass(true /* Enable mem-ssa. */)); // Catch trivial redundancies
-  if (EnableGVNHoist)
-    MPM.add(createGVNHoistPass());
-  if (EnableGVNSink) {
-    MPM.add(createGVNSinkPass());
-    MPM.add(createCFGSimplificationPass());
+
+  if (OptLevel > 1) {
+    if (EnableGVNHoist)
+      MPM.add(createGVNHoistPass());
+    if (EnableGVNSink) {
+      MPM.add(createGVNSinkPass());
+      MPM.add(createCFGSimplificationPass());
+    }
  }

-  // Speculative execution if the target has divergent branches; otherwise nop.
-  MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
-  MPM.add(createJumpThreadingPass());         // Thread jumps.
-  MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+  if (OptLevel > 1) {
+    // Speculative execution if the target has divergent branches; otherwise nop.
+    MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
+
+    MPM.add(createJumpThreadingPass());         // Thread jumps.
+    MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+  }
  MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
  // Combine silly seq's
  if (OptLevel > 2)
@ -346,8 +353,10 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
  if (SizeLevel == 0)
    MPM.add(createPGOMemOPSizeOptLegacyPass());

-  MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
-  MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+  // TODO: Investigate the cost/benefit of tail call elimination on debugging.
+  if (OptLevel > 1)
+    MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
+  MPM.add(createCFGSimplificationPass());      // Merge & remove BBs
  MPM.add(createReassociatePass());           // Reassociate expressions

  // Begin the loop pass pipeline.
@ -360,6 +369,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
  }
  // Rotate Loop - disable header duplication at -Oz
  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
+  // TODO: Investigate promotion cap for O1.
  MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
  if (EnableSimpleLoopUnswitch)
    MPM.add(createSimpleLoopUnswitchLegacyPass());
@ -402,16 +412,19 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
  // opened up by them.
  addInstructionCombiningPass(MPM);
  addExtensionsToPM(EP_Peephole, MPM);
-  MPM.add(createJumpThreadingPass());         // Thread jumps
-  MPM.add(createCorrelatedValuePropagationPass());
-  MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
-  MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+  if (OptLevel > 1) {
+    MPM.add(createJumpThreadingPass());         // Thread jumps
+    MPM.add(createCorrelatedValuePropagationPass());
+    MPM.add(createDeadStoreEliminationPass());  // Delete dead stores
+    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+  }

  addExtensionsToPM(EP_ScalarOptimizerLate, MPM);

  if (RerollLoops)
    MPM.add(createLoopRerollPass());

+  // TODO: Investigate if this is too expensive at O1.
  MPM.add(createAggressiveDCEPass());         // Delete dead instructions
  MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
  // Clean up after everything.
@ -899,7 +912,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {

  // LTO provides additional opportunities for tailcall elimination due to
  // link-time inlining, and visibility of nocapture attribute.
-  PM.add(createTailCallEliminationPass());
+  if (OptLevel > 1)
+    PM.add(createTailCallEliminationPass());

  // Infer attributes on declarations, call sites, arguments, etc.
  PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture.
--- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
+++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll
@ -3,17 +3,17 @@
 ; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NATIVE %s

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos
-; GCN-POSTLINK: tail call fast float @_Z3sinf(
-; GCN-POSTLINK: tail call fast float @_Z3cosf(
+; GCN-POSTLINK: call fast float @_Z3sinf(
+; GCN-POSTLINK: call fast float @_Z3cosf(
 ; GCN-PRELINK: call fast float @_Z6sincosfPf(
-; GCN-NATIVE: tail call fast float @_Z10native_sinf(
-; GCN-NATIVE: tail call fast float @_Z10native_cosf(
+; GCN-NATIVE: call fast float @_Z10native_sinf(
+; GCN-NATIVE: call fast float @_Z10native_cosf(
 define amdgpu_kernel void @test_sincos(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3sinf(float %tmp)
+  %call = call fast float @_Z3sinf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
-  %call2 = tail call fast float @_Z3cosf(float %tmp)
+  %call2 = call fast float @_Z3cosf(float %tmp)
  %arrayidx3 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  store float %call2, float addrspace(1)* %arrayidx3, align 4
  ret void
@ -24,17 +24,17 @@ declare float @_Z3sinf(float)
 declare float @_Z3cosf(float)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2
-; GCN-POSTLINK: tail call fast <2 x float> @_Z3sinDv2_f(
-; GCN-POSTLINK: tail call fast <2 x float> @_Z3cosDv2_f(
+; GCN-POSTLINK: call fast <2 x float> @_Z3sinDv2_f(
+; GCN-POSTLINK: call fast <2 x float> @_Z3cosDv2_f(
 ; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_(
-; GCN-NATIVE: tail call fast <2 x float> @_Z10native_sinDv2_f(
-; GCN-NATIVE: tail call fast <2 x float> @_Z10native_cosDv2_f(
+; GCN-NATIVE: call fast <2 x float> @_Z10native_sinDv2_f(
+; GCN-NATIVE: call fast <2 x float> @_Z10native_cosDv2_f(
 define amdgpu_kernel void @test_sincos_v2(<2 x float> addrspace(1)* nocapture %a) {
 entry:
  %tmp = load <2 x float>, <2 x float> addrspace(1)* %a, align 8
-  %call = tail call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
+  %call = call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmp)
  store <2 x float> %call, <2 x float> addrspace(1)* %a, align 8
-  %call2 = tail call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp)
+  %call2 = call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp)
  %arrayidx3 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %a, i64 1
  store <2 x float> %call2, <2 x float> addrspace(1)* %arrayidx3, align 8
  ret void
@ -45,20 +45,20 @@ declare <2 x float> @_Z3sinDv2_f(<2 x float>)
 declare <2 x float> @_Z3cosDv2_f(<2 x float>)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3
-; GCN-POSTLINK: tail call fast <3 x float> @_Z3sinDv3_f(
-; GCN-POSTLINK: tail call fast <3 x float> @_Z3cosDv3_f(
+; GCN-POSTLINK: call fast <3 x float> @_Z3sinDv3_f(
+; GCN-POSTLINK: call fast <3 x float> @_Z3cosDv3_f(
 ; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_(
-; GCN-NATIVE: tail call fast <3 x float> @_Z10native_sinDv3_f(
-; GCN-NATIVE: tail call fast <3 x float> @_Z10native_cosDv3_f(
+; GCN-NATIVE: call fast <3 x float> @_Z10native_sinDv3_f(
+; GCN-NATIVE: call fast <3 x float> @_Z10native_cosDv3_f(
 define amdgpu_kernel void @test_sincos_v3(<3 x float> addrspace(1)* nocapture %a) {
 entry:
  %castToVec4 = bitcast <3 x float> addrspace(1)* %a to <4 x float> addrspace(1)*
  %loadVec4 = load <4 x float>, <4 x float> addrspace(1)* %castToVec4, align 16
  %extractVec4 = shufflevector <4 x float> %loadVec4, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
-  %call = tail call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
+  %call = call fast <3 x float> @_Z3sinDv3_f(<3 x float> %extractVec4)
  %extractVec6 = shufflevector <3 x float> %call, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  store <4 x float> %extractVec6, <4 x float> addrspace(1)* %castToVec4, align 16
-  %call11 = tail call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4)
+  %call11 = call fast <3 x float> @_Z3cosDv3_f(<3 x float> %extractVec4)
  %arrayidx12 = getelementptr inbounds <3 x float>, <3 x float> addrspace(1)* %a, i64 1
  %extractVec13 = shufflevector <3 x float> %call11, <3 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
  %storetmp14 = bitcast <3 x float> addrspace(1)* %arrayidx12 to <4 x float> addrspace(1)*
@ -71,17 +71,17 @@ declare <3 x float> @_Z3sinDv3_f(<3 x float>)
 declare <3 x float> @_Z3cosDv3_f(<3 x float>)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4
-; GCN-POSTLINK: tail call fast <4 x float> @_Z3sinDv4_f(
-; GCN-POSTLINK: tail call fast <4 x float> @_Z3cosDv4_f(
+; GCN-POSTLINK: call fast <4 x float> @_Z3sinDv4_f(
+; GCN-POSTLINK: call fast <4 x float> @_Z3cosDv4_f(
 ; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_(
-; GCN-NATIVE: tail call fast <4 x float> @_Z10native_sinDv4_f(
-; GCN-NATIVE: tail call fast <4 x float> @_Z10native_cosDv4_f(
+; GCN-NATIVE: call fast <4 x float> @_Z10native_sinDv4_f(
+; GCN-NATIVE: call fast <4 x float> @_Z10native_cosDv4_f(
 define amdgpu_kernel void @test_sincos_v4(<4 x float> addrspace(1)* nocapture %a) {
 entry:
  %tmp = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
-  %call = tail call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
+  %call = call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp)
  store <4 x float> %call, <4 x float> addrspace(1)* %a, align 16
-  %call2 = tail call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp)
+  %call2 = call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmp)
  %arrayidx3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %a, i64 1
  store <4 x float> %call2, <4 x float> addrspace(1)* %arrayidx3, align 16
  ret void
@ -92,17 +92,17 @@ declare <4 x float> @_Z3sinDv4_f(<4 x float>)
 declare <4 x float> @_Z3cosDv4_f(<4 x float>)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8
-; GCN-POSTLINK: tail call fast <8 x float> @_Z3sinDv8_f(
-; GCN-POSTLINK: tail call fast <8 x float> @_Z3cosDv8_f(
+; GCN-POSTLINK: call fast <8 x float> @_Z3sinDv8_f(
+; GCN-POSTLINK: call fast <8 x float> @_Z3cosDv8_f(
 ; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_(
-; GCN-NATIVE: tail call fast <8 x float> @_Z10native_sinDv8_f(
-; GCN-NATIVE: tail call fast <8 x float> @_Z10native_cosDv8_f(
+; GCN-NATIVE: call fast <8 x float> @_Z10native_sinDv8_f(
+; GCN-NATIVE: call fast <8 x float> @_Z10native_cosDv8_f(
 define amdgpu_kernel void @test_sincos_v8(<8 x float> addrspace(1)* nocapture %a) {
 entry:
  %tmp = load <8 x float>, <8 x float> addrspace(1)* %a, align 32
-  %call = tail call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
+  %call = call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp)
  store <8 x float> %call, <8 x float> addrspace(1)* %a, align 32
-  %call2 = tail call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp)
+  %call2 = call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp)
  %arrayidx3 = getelementptr inbounds <8 x float>, <8 x float> addrspace(1)* %a, i64 1
  store <8 x float> %call2, <8 x float> addrspace(1)* %arrayidx3, align 32
  ret void
@ -113,17 +113,17 @@ declare <8 x float> @_Z3sinDv8_f(<8 x float>)
 declare <8 x float> @_Z3cosDv8_f(<8 x float>)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16
-; GCN-POSTLINK: tail call fast <16 x float> @_Z3sinDv16_f(
-; GCN-POSTLINK: tail call fast <16 x float> @_Z3cosDv16_f(
+; GCN-POSTLINK: call fast <16 x float> @_Z3sinDv16_f(
+; GCN-POSTLINK: call fast <16 x float> @_Z3cosDv16_f(
 ; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_(
-; GCN-NATIVE: tail call fast <16 x float> @_Z10native_sinDv16_f(
-; GCN-NATIVE: tail call fast <16 x float> @_Z10native_cosDv16_f(
+; GCN-NATIVE: call fast <16 x float> @_Z10native_sinDv16_f(
+; GCN-NATIVE: call fast <16 x float> @_Z10native_cosDv16_f(
 define amdgpu_kernel void @test_sincos_v16(<16 x float> addrspace(1)* nocapture %a) {
 entry:
  %tmp = load <16 x float>, <16 x float> addrspace(1)* %a, align 64
-  %call = tail call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
+  %call = call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp)
  store <16 x float> %call, <16 x float> addrspace(1)* %a, align 64
-  %call2 = tail call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp)
+  %call2 = call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp)
  %arrayidx3 = getelementptr inbounds <16 x float>, <16 x float> addrspace(1)* %a, i64 1
  store <16 x float> %call2, <16 x float> addrspace(1)* %arrayidx3, align 64
  ret void
@ -137,7 +137,7 @@ declare <16 x float> @_Z3cosDv16_f(<16 x float>)
 ; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
 define amdgpu_kernel void @test_native_recip(float addrspace(1)* nocapture %a) {
 entry:
-  %call = tail call fast float @_Z12native_recipf(float 3.000000e+00)
+  %call = call fast float @_Z12native_recipf(float 3.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -148,7 +148,7 @@ declare float @_Z12native_recipf(float)
 ; GCN: store float 0x3FD5555560000000, float addrspace(1)* %a
 define amdgpu_kernel void @test_half_recip(float addrspace(1)* nocapture %a) {
 entry:
-  %call = tail call fast float @_Z10half_recipf(float 3.000000e+00)
+  %call = call fast float @_Z10half_recipf(float 3.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -160,7 +160,7 @@ declare float @_Z10half_recipf(float)
 define amdgpu_kernel void @test_native_divide(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
+  %call = call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -172,7 +172,7 @@ declare float @_Z13native_divideff(float, float)
 define amdgpu_kernel void @test_half_divide(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
+  %call = call fast float @_Z11half_divideff(float %tmp, float 3.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -184,7 +184,7 @@ declare float @_Z11half_divideff(float, float)
 define amdgpu_kernel void @test_pow_0f(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 0.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -196,7 +196,7 @@ declare float @_Z3powff(float, float)
 define amdgpu_kernel void @test_pow_0i(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 0.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 0.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -208,7 +208,7 @@ define amdgpu_kernel void @test_pow_1f(float addrspace(1)* nocapture %a) {
 entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 1.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -220,7 +220,7 @@ define amdgpu_kernel void @test_pow_1i(float addrspace(1)* nocapture %a) {
 entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 1.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 1.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -231,7 +231,7 @@ entry:
 define amdgpu_kernel void @test_pow_2f(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 2.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -242,7 +242,7 @@ entry:
 define amdgpu_kernel void @test_pow_2i(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 2.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float 2.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -254,7 +254,7 @@ define amdgpu_kernel void @test_pow_m1f(float addrspace(1)* nocapture %a) {
 entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float -1.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -266,31 +266,31 @@ define amdgpu_kernel void @test_pow_m1i(float addrspace(1)* nocapture %a) {
 entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float -1.000000e+00)
+  %call = call fast float @_Z3powff(float %tmp, float -1.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half
-; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
-; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
+; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 5.000000e-01)
+; GCN-PRELINK: %__pow2sqrt = call fast float @_Z4sqrtf(float %tmp)
 define amdgpu_kernel void @test_pow_half(float addrspace(1)* nocapture %a) {
 entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 5.000000e-01)
+  %call = call fast float @_Z3powff(float %tmp, float 5.000000e-01)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf
-; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
-; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
+; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float -5.000000e-01)
+; GCN-PRELINK: %__pow2rsqrt = call fast float @_Z5rsqrtf(float %tmp)
 define amdgpu_kernel void @test_pow_mhalf(float addrspace(1)* nocapture %a) {
 entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float -5.000000e-01)
+  %call = call fast float @_Z3powff(float %tmp, float -5.000000e-01)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -305,7 +305,7 @@ define amdgpu_kernel void @test_pow_c(float addrspace(1)* nocapture %a) {
 entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 1.100000e+01)
+  %call = call fast float @_Z3powff(float %tmp, float 1.100000e+01)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -320,7 +320,7 @@ define amdgpu_kernel void @test_powr_c(float addrspace(1)* nocapture %a) {
 entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
+  %call = call fast float @_Z4powrff(float %tmp, float 1.100000e+01)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -337,7 +337,7 @@ define amdgpu_kernel void @test_pown_c(float addrspace(1)* nocapture %a) {
 entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z4pownfi(float %tmp, i32 11)
+  %call = call fast float @_Z4pownfi(float %tmp, i32 11)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -345,11 +345,11 @@ entry:
 declare float @_Z4pownfi(float, i32)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow
-; GCN-POSTLINK: tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
-; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
-; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
+; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 1.013000e+03)
+; GCN-PRELINK: %__fabs = call fast float @_Z4fabsf(float %tmp)
+; GCN-PRELINK: %__log2 = call fast float @_Z4log2f(float %__fabs)
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03
-; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %__exp2 = call fast float @_Z4exp2f(float %__ylogx)
 ; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
 ; GCN-PRELINK: %__pow_sign = and i32 %[[r0]], -2147483648
 ; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32
@ -359,39 +359,39 @@ declare float @_Z4pownfi(float, i32)
 define amdgpu_kernel void @test_pow(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3powff(float %tmp, float 1.013000e+03)
+  %call = call fast float @_Z3powff(float %tmp, float 1.013000e+03)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr
-; GCN-POSTLINK: tail call fast float @_Z4powrff(float %tmp, float %tmp1)
-; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp)
+; GCN-POSTLINK: call fast float @_Z4powrff(float %tmp, float %tmp1)
+; GCN-PRELINK: %__log2 = call fast float @_Z4log2f(float %tmp)
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %__exp2 = call fast float @_Z4exp2f(float %__ylogx)
 ; GCN-PRELINK: store float %__exp2, float addrspace(1)* %a, align 4
-; GCN-NATIVE:  %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
+; GCN-NATIVE:  %__log2 = call fast float @_Z11native_log2f(float %tmp)
 ; GCN-NATIVE:  %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-NATIVE:  %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
+; GCN-NATIVE:  %__exp2 = call fast float @_Z11native_exp2f(float %__ylogx)
 ; GCN-NATIVE:  store float %__exp2, float addrspace(1)* %a, align 4
 define amdgpu_kernel void @test_powr(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
-  %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
+  %call = call fast float @_Z4powrff(float %tmp, float %tmp1)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown
-; GCN-POSTLINK: tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
+; GCN-POSTLINK: call fast float @_Z4pownfi(float %tmp, i32 %conv)
 ; GCN-PRELINK: %conv = fptosi float %tmp1 to i32
-; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp)
-; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs)
+; GCN-PRELINK: %__fabs = call fast float @_Z4fabsf(float %tmp)
+; GCN-PRELINK: %__log2 = call fast float @_Z4log2f(float %__fabs)
 ; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float
 ; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F
-; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx)
+; GCN-PRELINK: %__exp2 = call fast float @_Z4exp2f(float %__ylogx)
 ; GCN-PRELINK: %__yeven = shl i32 %conv, 31
 ; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32
 ; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %[[r0]]
@ -405,7 +405,7 @@ entry:
  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
  %conv = fptosi float %tmp1 to i32
-  %call = tail call fast float @_Z4pownfi(float %tmp, i32 %conv)
+  %call = call fast float @_Z4pownfi(float %tmp, i32 %conv)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -417,7 +417,7 @@ define amdgpu_kernel void @test_rootn_1(float addrspace(1)* nocapture %a) {
 entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
-  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 1)
+  %call = call fast float @_Z5rootnfi(float %tmp, i32 1)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -425,23 +425,23 @@ entry:
 declare float @_Z5rootnfi(float, i32)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2
-; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 2)
-; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp)
+; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 2)
+; GCN-PRELINK: %__rootn2sqrt = call fast float @_Z4sqrtf(float %tmp)
 define amdgpu_kernel void @test_rootn_2(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 2)
+  %call = call fast float @_Z5rootnfi(float %tmp, i32 2)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3
-; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 3)
-; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp)
+; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 3)
+; GCN-PRELINK: %__rootn2cbrt = call fast float @_Z4cbrtf(float %tmp)
 define amdgpu_kernel void @test_rootn_3(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 3)
+  %call = call fast float @_Z5rootnfi(float %tmp, i32 3)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -451,18 +451,18 @@ entry:
 define amdgpu_kernel void @test_rootn_m1(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 -1)
+  %call = call fast float @_Z5rootnfi(float %tmp, i32 -1)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2
-; GCN-POSTLINK: tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
-; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp)
+; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 -2)
+; GCN-PRELINK: %__rootn2rsqrt = call fast float @_Z5rsqrtf(float %tmp)
 define amdgpu_kernel void @test_rootn_m2(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5rootnfi(float %tmp, i32 -2)
+  %call = call fast float @_Z5rootnfi(float %tmp, i32 -2)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -472,7 +472,7 @@ entry:
 define amdgpu_kernel void @test_fma_0x(float addrspace(1)* nocapture %a, float %y) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
+  %call = call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -484,7 +484,7 @@ declare float @_Z3fmafff(float, float, float)
 define amdgpu_kernel void @test_fma_x0(float addrspace(1)* nocapture %a, float %y) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
+  %call = call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -494,7 +494,7 @@ entry:
 define amdgpu_kernel void @test_mad_0x(float addrspace(1)* nocapture %a, float %y) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
+  %call = call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -506,7 +506,7 @@ declare float @_Z3madfff(float, float, float)
 define amdgpu_kernel void @test_mad_x0(float addrspace(1)* nocapture %a, float %y) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
+  %call = call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -516,7 +516,7 @@ entry:
 define amdgpu_kernel void @test_fma_x1y(float addrspace(1)* nocapture %a, float %y) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
+  %call = call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -526,7 +526,7 @@ entry:
 define amdgpu_kernel void @test_fma_1xy(float addrspace(1)* nocapture %a, float %y) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
+  %call = call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -538,17 +538,17 @@ entry:
  %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp = load float, float addrspace(1)* %arrayidx, align 4
  %tmp1 = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
+  %call = call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp
-; GCN-NATIVE: tail call fast float @_Z10native_expf(float %tmp)
+; GCN-NATIVE: call fast float @_Z10native_expf(float %tmp)
 define amdgpu_kernel void @test_use_native_exp(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3expf(float %tmp)
+  %call = call fast float @_Z3expf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -556,11 +556,11 @@ entry:
 declare float @_Z3expf(float)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp2
-; GCN-NATIVE: tail call fast float @_Z11native_exp2f(float %tmp)
+; GCN-NATIVE: call fast float @_Z11native_exp2f(float %tmp)
 define amdgpu_kernel void @test_use_native_exp2(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z4exp2f(float %tmp)
+  %call = call fast float @_Z4exp2f(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -568,11 +568,11 @@ entry:
 declare float @_Z4exp2f(float)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp10
-; GCN-NATIVE: tail call fast float @_Z12native_exp10f(float %tmp)
+; GCN-NATIVE: call fast float @_Z12native_exp10f(float %tmp)
 define amdgpu_kernel void @test_use_native_exp10(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5exp10f(float %tmp)
+  %call = call fast float @_Z5exp10f(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -580,11 +580,11 @@ entry:
 declare float @_Z5exp10f(float)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log
-; GCN-NATIVE: tail call fast float @_Z10native_logf(float %tmp)
+; GCN-NATIVE: call fast float @_Z10native_logf(float %tmp)
 define amdgpu_kernel void @test_use_native_log(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3logf(float %tmp)
+  %call = call fast float @_Z3logf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -592,11 +592,11 @@ entry:
 declare float @_Z3logf(float)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log2
-; GCN-NATIVE: tail call fast float @_Z11native_log2f(float %tmp)
+; GCN-NATIVE: call fast float @_Z11native_log2f(float %tmp)
 define amdgpu_kernel void @test_use_native_log2(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z4log2f(float %tmp)
+  %call = call fast float @_Z4log2f(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -604,11 +604,11 @@ entry:
 declare float @_Z4log2f(float)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log10
-; GCN-NATIVE: tail call fast float @_Z12native_log10f(float %tmp)
+; GCN-NATIVE: call fast float @_Z12native_log10f(float %tmp)
 define amdgpu_kernel void @test_use_native_log10(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5log10f(float %tmp)
+  %call = call fast float @_Z5log10f(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -617,36 +617,36 @@ declare float @_Z5log10f(float)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr
 ; GCN-NATIVE: %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
-; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp)
+; GCN-NATIVE: %__log2 = call fast float @_Z11native_log2f(float %tmp)
 ; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1
-; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx)
+; GCN-NATIVE: %__exp2 = call fast float @_Z11native_exp2f(float %__ylogx)
 ; GCN-NATIVE: store float %__exp2, float addrspace(1)* %a, align 4
 define amdgpu_kernel void @test_use_native_powr(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp1 = load float, float addrspace(1)* %arrayidx1, align 4
-  %call = tail call fast float @_Z4powrff(float %tmp, float %tmp1)
+  %call = call fast float @_Z4powrff(float %tmp, float %tmp1)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sqrt
-; GCN-NATIVE: tail call fast float @_Z11native_sqrtf(float %tmp)
+; GCN-NATIVE: call fast float @_Z11native_sqrtf(float %tmp)
 define amdgpu_kernel void @test_use_native_sqrt(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z4sqrtf(float %tmp)
+  %call = call fast float @_Z4sqrtf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64
-; GCN: tail call fast double @_Z4sqrtd(double %tmp)
+; GCN: call fast double @_Z4sqrtd(double %tmp)
 define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64(double addrspace(1)* nocapture %a) {
 entry:
  %tmp = load double, double addrspace(1)* %a, align 8
-  %call = tail call fast double @_Z4sqrtd(double %tmp)
+  %call = call fast double @_Z4sqrtd(double %tmp)
  store double %call, double addrspace(1)* %a, align 8
  ret void
 }
@ -655,11 +655,11 @@ declare float @_Z4sqrtf(float)
 declare double @_Z4sqrtd(double)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_rsqrt
-; GCN-NATIVE: tail call fast float @_Z12native_rsqrtf(float %tmp)
+; GCN-NATIVE: call fast float @_Z12native_rsqrtf(float %tmp)
 define amdgpu_kernel void @test_use_native_rsqrt(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z5rsqrtf(float %tmp)
+  %call = call fast float @_Z5rsqrtf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -667,11 +667,11 @@ entry:
 declare float @_Z5rsqrtf(float)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_tan
-; GCN-NATIVE: tail call fast float @_Z10native_tanf(float %tmp)
+; GCN-NATIVE: call fast float @_Z10native_tanf(float %tmp)
 define amdgpu_kernel void @test_use_native_tan(float addrspace(1)* nocapture %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
-  %call = tail call fast float @_Z3tanf(float %tmp)
+  %call = call fast float @_Z3tanf(float %tmp)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -679,14 +679,14 @@ entry:
 declare float @_Z3tanf(float)

 ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sincos
-; GCN-NATIVE: tail call float @_Z10native_sinf(float %tmp)
-; GCN-NATIVE: tail call float @_Z10native_cosf(float %tmp)
+; GCN-NATIVE: call float @_Z10native_sinf(float %tmp)
+; GCN-NATIVE: call float @_Z10native_cosf(float %tmp)
 define amdgpu_kernel void @test_use_native_sincos(float addrspace(1)* %a) {
 entry:
  %tmp = load float, float addrspace(1)* %a, align 4
  %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
  %tmp1 = addrspacecast float addrspace(1)* %arrayidx1 to float*
-  %call = tail call fast float @_Z6sincosfPf(float %tmp, float* %tmp1)
+  %call = call fast float @_Z6sincosfPf(float %tmp, float* %tmp1)
  store float %call, float addrspace(1)* %a, align 4
  ret void
 }
@ -703,10 +703,10 @@ define amdgpu_kernel void @test_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 a
 entry:
  %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
  %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
-  %tmp2 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
-  %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
-  %tmp4 = tail call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
-  tail call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4)
+  %tmp2 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
+  %tmp3 = call %opencl.reserve_id_t addrspace(5)* @__reserve_read_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4)
+  %tmp4 = call i32 @__read_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
+  call void @__commit_read_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4)
  ret void
 }

@ -725,10 +725,10 @@ define amdgpu_kernel void @test_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32
 entry:
  %tmp = bitcast i32 addrspace(1)* %ptr to i8 addrspace(1)*
  %tmp1 = addrspacecast i8 addrspace(1)* %tmp to i8*
-  %tmp2 = tail call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
-  %tmp3 = tail call %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
-  %tmp4 = tail call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
-  tail call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) #0
+  %tmp2 = call i32 @__write_pipe_2(%opencl.pipe_t addrspace(1)* %p, i8* %tmp1, i32 4, i32 4) #0
+  %tmp3 = call %opencl.reserve_id_t addrspace(5)* @__reserve_write_pipe(%opencl.pipe_t addrspace(1)* %p, i32 2, i32 4, i32 4) #0
+  %tmp4 = call i32 @__write_pipe_4(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 2, i8* %tmp1, i32 4, i32 4) #0
+  call void @__commit_write_pipe(%opencl.pipe_t addrspace(1)* %p, %opencl.reserve_id_t addrspace(5)* %tmp3, i32 4, i32 4) #0
  ret void
 }

@ -755,31 +755,31 @@ declare void @__commit_write_pipe(%opencl.pipe_t addrspace(1)*, %opencl.reserve_
 define amdgpu_kernel void @test_pipe_size(%opencl.pipe_t addrspace(1)* %p1, i8 addrspace(1)* %ptr1, %opencl.pipe_t addrspace(1)* %p2, i16 addrspace(1)* %ptr2, %opencl.pipe_t addrspace(1)* %p4, i32 addrspace(1)* %ptr4, %opencl.pipe_t addrspace(1)* %p8, i64 addrspace(1)* %ptr8, %opencl.pipe_t addrspace(1)* %p16, <2 x i64> addrspace(1)* %ptr16, %opencl.pipe_t addrspace(1)* %p32, <4 x i64> addrspace(1)* %ptr32, %opencl.pipe_t addrspace(1)* %p64, <8 x i64> addrspace(1)* %ptr64, %opencl.pipe_t addrspace(1)* %p128, <16 x i64> addrspace(1)* %ptr128, %opencl.pipe_t addrspace(1)* %pu, %struct.S addrspace(1)* %ptru) local_unnamed_addr #0 {
 entry:
  %tmp = addrspacecast i8 addrspace(1)* %ptr1 to i8*
-  %tmp1 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8* %tmp, i32 1, i32 1) #0
+  %tmp1 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p1, i8* %tmp, i32 1, i32 1) #0
  %tmp2 = bitcast i16 addrspace(1)* %ptr2 to i8 addrspace(1)*
  %tmp3 = addrspacecast i8 addrspace(1)* %tmp2 to i8*
-  %tmp4 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8* %tmp3, i32 2, i32 2) #0
+  %tmp4 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p2, i8* %tmp3, i32 2, i32 2) #0
  %tmp5 = bitcast i32 addrspace(1)* %ptr4 to i8 addrspace(1)*
  %tmp6 = addrspacecast i8 addrspace(1)* %tmp5 to i8*
-  %tmp7 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8* %tmp6, i32 4, i32 4) #0
+  %tmp7 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p4, i8* %tmp6, i32 4, i32 4) #0
  %tmp8 = bitcast i64 addrspace(1)* %ptr8 to i8 addrspace(1)*
  %tmp9 = addrspacecast i8 addrspace(1)* %tmp8 to i8*
-  %tmp10 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8* %tmp9, i32 8, i32 8) #0
+  %tmp10 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p8, i8* %tmp9, i32 8, i32 8) #0
  %tmp11 = bitcast <2 x i64> addrspace(1)* %ptr16 to i8 addrspace(1)*
  %tmp12 = addrspacecast i8 addrspace(1)* %tmp11 to i8*
-  %tmp13 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8* %tmp12, i32 16, i32 16) #0
+  %tmp13 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p16, i8* %tmp12, i32 16, i32 16) #0
  %tmp14 = bitcast <4 x i64> addrspace(1)* %ptr32 to i8 addrspace(1)*
  %tmp15 = addrspacecast i8 addrspace(1)* %tmp14 to i8*
-  %tmp16 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8* %tmp15, i32 32, i32 32) #0
+  %tmp16 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p32, i8* %tmp15, i32 32, i32 32) #0
  %tmp17 = bitcast <8 x i64> addrspace(1)* %ptr64 to i8 addrspace(1)*
  %tmp18 = addrspacecast i8 addrspace(1)* %tmp17 to i8*
-  %tmp19 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8* %tmp18, i32 64, i32 64) #0
+  %tmp19 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p64, i8* %tmp18, i32 64, i32 64) #0
  %tmp20 = bitcast <16 x i64> addrspace(1)* %ptr128 to i8 addrspace(1)*
  %tmp21 = addrspacecast i8 addrspace(1)* %tmp20 to i8*
-  %tmp22 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8* %tmp21, i32 128, i32 128) #0
+  %tmp22 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %p128, i8* %tmp21, i32 128, i32 128) #0
  %tmp23 = bitcast %struct.S addrspace(1)* %ptru to i8 addrspace(1)*
  %tmp24 = addrspacecast i8 addrspace(1)* %tmp23 to i8*
-  %tmp25 = tail call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8* %tmp24, i32 400, i32 4) #0
+  %tmp25 = call i32 @__read_pipe_2(%opencl.pipe_t addrspace(1)* %pu, i8* %tmp24, i32 400, i32 4) #0
  ret void
 }

--- a/llvm/test/Feature/optnone-opt.ll
+++ b/llvm/test/Feature/optnone-opt.ll
@ -39,16 +39,10 @@ attributes #0 = { optnone noinline }
 ; IR passes run at -O1 and higher.
 ; OPT-O1-DAG: Skipping pass 'Aggressive Dead Code Elimination'
 ; OPT-O1-DAG: Skipping pass 'Combine redundant instructions'
-; OPT-O1-DAG: Skipping pass 'Dead Store Elimination'
 ; OPT-O1-DAG: Skipping pass 'Early CSE'
-; OPT-O1-DAG: Skipping pass 'Jump Threading'
-; OPT-O1-DAG: Skipping pass 'MemCpy Optimization'
 ; OPT-O1-DAG: Skipping pass 'Reassociate expressions'
 ; OPT-O1-DAG: Skipping pass 'Simplify the CFG'
 ; OPT-O1-DAG: Skipping pass 'Sparse Conditional Constant Propagation'
-; OPT-O1-DAG: Skipping pass 'SROA'
-; OPT-O1-DAG: Skipping pass 'Tail Call Elimination'
-; OPT-O1-DAG: Skipping pass 'Value Propagation'

 ; Additional IR passes run at -O2 and higher.
 ; OPT-O2O3-DAG: Skipping pass 'Global Value Numbering'
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@ -12,66 +12,70 @@
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O1
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='default<O2>' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2
+; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2 \
+; RUN:     --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3
+; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
+; RUN:     --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='default<Os>' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-Os
+; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-Os \
+; RUN:     --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='default<Oz>' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-Oz
+; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-Oz \
+; RUN:     --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='lto-pre-link<O2>' -S %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O2 \
-; RUN:     --check-prefix=CHECK-O2-LTO
+; RUN:     --check-prefix=CHECK-O2-LTO --check-prefix=CHECK-O23SZ

 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-peephole='no-op-function' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-PEEPHOLE
+; RUN:     --check-prefix=CHECK-EP-PEEPHOLE --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-late-loop-optimizations='no-op-loop' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-LOOP-LATE
+; RUN:     --check-prefix=CHECK-EP-LOOP-LATE --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-loop-optimizer-end='no-op-loop' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-LOOP-END
+; RUN:     --check-prefix=CHECK-EP-LOOP-END --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-scalar-optimizer-late='no-op-function' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-SCALAR-LATE
+; RUN:     --check-prefix=CHECK-EP-SCALAR-LATE --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-cgscc-optimizer-late='no-op-cgscc' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-CGSCC-LATE
+; RUN:     --check-prefix=CHECK-EP-CGSCC-LATE --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-vectorizer-start='no-op-function' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-VECTORIZER-START
+; RUN:     --check-prefix=CHECK-EP-VECTORIZER-START --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-pipeline-start='no-op-module' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-PIPELINE-START
+; RUN:     --check-prefix=CHECK-EP-PIPELINE-START --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-pipeline-start='no-op-module' \
 ; RUN:     -passes='lto-pre-link<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-PIPELINE-START
+; RUN:     --check-prefix=CHECK-EP-PIPELINE-START --check-prefix=CHECK-O23SZ
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes-ep-optimizer-last='no-op-function' \
 ; RUN:     -passes='default<O3>' -S  %s 2>&1 \
 ; RUN:     | FileCheck %s --check-prefix=CHECK-O --check-prefix=CHECK-O3 \
-; RUN:     --check-prefix=CHECK-EP-OPTIMIZER-LAST
+; RUN:     --check-prefix=CHECK-EP-OPTIMIZER-LAST --check-prefix=CHECK-O23SZ

 ; CHECK-O: Running analysis: PassInstrumentationAnalysis
 ; CHECK-O-NEXT: Starting llvm::Module pass manager run.
@ -132,10 +136,10 @@
 ; CHECK-O-NEXT: Running pass: SROA
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running pass: SpeculativeExecutionPass
-; CHECK-O-NEXT: Running pass: JumpThreadingPass
-; CHECK-O-NEXT: Running analysis: LazyValueAnalysis
-; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
+; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
+; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
+; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O3-NEXT: AggressiveInstCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
@ -143,7 +147,7 @@
 ; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-O3-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
-; CHECK-O-NEXT: Running pass: TailCallElimPass
+; CHECK-O23SZ-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: ReassociatePass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
@ -180,22 +184,10 @@
 ; CHECK-EP-LOOP-END-NEXT: Running pass: NoOpLoopPass
 ; CHECK-O-NEXT: Finished Loop pass manager run.
 ; CHECK-O-NEXT: Running pass: SROA on foo
-; CHECK-Os-NEXT: Running pass: MergedLoadStoreMotionPass
-; CHECK-Os-NEXT: Running pass: GVN
-; CHECK-Os-NEXT: Running analysis: MemoryDependenceAnalysis
-; CHECK-Os-NEXT: Running analysis: PhiValuesAnalysis
-; CHECK-Oz-NEXT: Running pass: MergedLoadStoreMotionPass
-; CHECK-Oz-NEXT: Running pass: GVN
-; CHECK-Oz-NEXT: Running analysis: MemoryDependenceAnalysis
-; CHECK-Oz-NEXT: Running analysis: PhiValuesAnalysis
-; CHECK-O2-NEXT: Running pass: MergedLoadStoreMotionPass
-; CHECK-O2-NEXT: Running pass: GVN
-; CHECK-O2-NEXT: Running analysis: MemoryDependenceAnalysis
-; CHECK-O2-NEXT: Running analysis: PhiValuesAnalysis
-; CHECK-O3-NEXT: Running pass: MergedLoadStoreMotionPass
-; CHECK-O3-NEXT: Running pass: GVN
-; CHECK-O3-NEXT: Running analysis: MemoryDependenceAnalysis
-; CHECK-O3-NEXT: Running analysis: PhiValuesAnalysis
+; CHECK-O23SZ-NEXT: Running pass: MergedLoadStoreMotionPass
+; CHECK-O23SZ-NEXT: Running pass: GVN
+; CHECK-O23SZ-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O23SZ-NEXT: Running analysis: PhiValuesAnalysis
 ; CHECK-O-NEXT: Running pass: MemCpyOptPass
 ; CHECK-O1-NEXT: Running analysis: MemoryDependenceAnalysis
 ; CHECK-O1-NEXT: Running analysis: PhiValuesAnalysis
@ -204,14 +196,14 @@
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
-; CHECK-O-NEXT: Running pass: JumpThreadingPass
-; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
-; CHECK-O-NEXT: Running pass: DSEPass
-; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
-; CHECK-O-NEXT: Starting llvm::Function pass manager run.
-; CHECK-O-NEXT: Running pass: LoopSimplifyPass
-; CHECK-O-NEXT: Running pass: LCSSAPass
-; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
+; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O23SZ-NEXT: Running pass: DSEPass
+; CHECK-O23SZ-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
+; CHECK-O23SZ-NEXT: Starting llvm::Function pass manager run.
+; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O23SZ-NEXT: Running pass: LCSSAPass
+; CHECK-O23SZ-NEXT: Finished llvm::Function pass manager run.
 ; CHECK-EP-SCALAR-LATE-NEXT: Running pass: NoOpFunctionPass
 ; CHECK-O-NEXT: Running pass: ADCEPass
 ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
--- a/llvm/test/Other/new-pm-thinlto-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-defaults.ll
@ -13,19 +13,19 @@
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O1
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto-pre-link<O2>,name-anon-globals' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O2
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O2
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto-pre-link<O3>,name-anon-globals' -S -passes-ep-pipeline-start='no-op-module' %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O3,CHECK-EP-PIPELINE-START
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-O3,CHECK-EP-PIPELINE-START
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto-pre-link<Os>,name-anon-globals' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-Os
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-Os
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto-pre-link<Oz>,name-anon-globals' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-Oz
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O-NODIS,CHECK-PRELINK-Oz
 ; RUN: opt -disable-verify -debug-pass-manager -new-pm-debug-info-for-profiling \
 ; RUN:     -passes='thinlto-pre-link<O2>,name-anon-globals' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-DIS,CHECK-O,CHECK-O2,CHECK-PRELINK-O,CHECK-PRELINK-O2
+; RUN:     | FileCheck %s --check-prefixes=CHECK-DIS,CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-PRELINK-O,CHECK-PRELINK-O2
 ;
 ; Postlink pipelines:
 ; RUN: opt -disable-verify -debug-pass-manager \
@ -33,19 +33,19 @@
 ; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O1,CHECK-POSTLINK-O,CHECK-POSTLINK-O1
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto<O2>' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-POSTLINK-O,CHECK-POSTLINK-O2
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,CHECK-POSTLINK-O2
 ; RUN: opt -disable-verify -debug-pass-manager -passes-ep-pipeline-start='no-op-module' \
 ; RUN:     -passes='thinlto<O3>' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-POSTLINK-O,CHECK-POSTLINK-O3
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O3,CHECK-O23SZ,CHECK-POSTLINK-O,CHECK-POSTLINK-O3
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto<Os>' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-POSTLINK-O,CHECK-POSTLINK-Os
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Os,CHECK-O23SZ,CHECK-POSTLINK-O,CHECK-POSTLINK-Os
 ; RUN: opt -disable-verify -debug-pass-manager \
 ; RUN:     -passes='thinlto<Oz>' -S %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-POSTLINK-O,CHECK-POSTLINK-Oz
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-Oz,CHECK-O23SZ,CHECK-POSTLINK-O,CHECK-POSTLINK-Oz
 ; RUN: opt -disable-verify -debug-pass-manager -new-pm-debug-info-for-profiling \
 ; RUN:     -passes='thinlto<O2>' -S  %s 2>&1 \
-; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-POSTLINK-O,CHECK-POSTLINK-O2
+; RUN:     | FileCheck %s --check-prefixes=CHECK-O,CHECK-O2,CHECK-O23SZ,CHECK-POSTLINK-O,CHECK-POSTLINK-O2
 ;
 ; CHECK-O: Running analysis: PassInstrumentationAnalysis
 ; CHECK-O-NEXT: Starting llvm::Module pass manager run.
@ -112,17 +112,17 @@
 ; CHECK-O-NEXT: Running pass: SROA
 ; CHECK-O-NEXT: Running pass: EarlyCSEPass
 ; CHECK-O-NEXT: Running analysis: MemorySSAAnalysis
-; CHECK-O-NEXT: Running pass: SpeculativeExecutionPass
-; CHECK-O-NEXT: Running pass: JumpThreadingPass
-; CHECK-O-NEXT: Running analysis: LazyValueAnalysis
-; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O23SZ-NEXT: Running pass: SpeculativeExecutionPass
+; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
+; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis
+; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O3-NEXT: Running pass: AggressiveInstCombinePass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O1-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-O2-NEXT: Running pass: LibCallsShrinkWrapPass
 ; CHECK-O3-NEXT: Running pass: LibCallsShrinkWrapPass
-; CHECK-O-NEXT: Running pass: TailCallElimPass
+; CHECK-O23SZ-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: ReassociatePass
 ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
@ -180,14 +180,14 @@
 ; CHECK-O-NEXT: Running pass: BDCEPass
 ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O-NEXT: Running pass: InstCombinePass
-; CHECK-O-NEXT: Running pass: JumpThreadingPass
-; CHECK-O-NEXT: Running pass: CorrelatedValuePropagationPass
-; CHECK-O-NEXT: Running pass: DSEPass
-; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
-; CHECK-O-NEXT: Starting llvm::Function pass manager run
-; CHECK-O-NEXT: Running pass: LoopSimplifyPass
-; CHECK-O-NEXT: Running pass: LCSSAPass
-; CHECK-O-NEXT: Finished llvm::Function pass manager run
+; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass
+; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass
+; CHECK-O23SZ-NEXT: Running pass: DSEPass
+; CHECK-O23SZ-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass{{.*}}>
+; CHECK-O23SZ-NEXT: Starting llvm::Function pass manager run
+; CHECK-O23SZ-NEXT: Running pass: LoopSimplifyPass
+; CHECK-O23SZ-NEXT: Running pass: LCSSAPass
+; CHECK-O23SZ-NEXT: Finished llvm::Function pass manager run
 ; CHECK-O-NEXT: Running pass: ADCEPass
 ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
--- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll
+++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll
@ -1,4 +1,4 @@
-; RUN: opt < %s -O1 -S | FileCheck %s
+; RUN: opt < %s -O2 -S | FileCheck %s

 ; performCallSlotOptzn in MemCpy should not exchange the calls to
 ; @llvm.lifetime.start and @llvm.memcpy.
--- a/llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll
+++ b/llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll
@ -7,7 +7,7 @@

 define i1 @PR33605(i32 %a, i32 %b, i32* %c) {
 ; ALL-LABEL: @PR33605(
-; ALL-NEXT:  for.body:
+; ALL-NEXT:  entry:
 ; ALL-NEXT:    [[OR:%.*]] = or i32 [[B:%.*]], [[A:%.*]]
 ; ALL-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
 ; ALL-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
@ -15,16 +15,16 @@ define i1 @PR33605(i32 %a, i32 %b, i32* %c) {
 ; ALL-NEXT:    br i1 [[CMP]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
 ; ALL:       if.then:
 ; ALL-NEXT:    store i32 [[OR]], i32* [[ARRAYIDX]], align 4
-; ALL-NEXT:    tail call void @foo()
+; ALL-NEXT:    call void @foo()
 ; ALL-NEXT:    br label [[IF_END]]
 ; ALL:       if.end:
-; ALL-NEXT:    [[CHANGED_1_OFF0:%.*]] = phi i1 [ true, [[IF_THEN]] ], [ false, [[FOR_BODY:%.*]] ]
+; ALL-NEXT:    [[CHANGED_1_OFF0:%.*]] = phi i1 [ true, [[IF_THEN]] ], [ false, [[ENTRY:%.*]] ]
 ; ALL-NEXT:    [[TMP1:%.*]] = load i32, i32* [[C]], align 4
 ; ALL-NEXT:    [[CMP_1:%.*]] = icmp eq i32 [[OR]], [[TMP1]]
 ; ALL-NEXT:    br i1 [[CMP_1]], label [[IF_END_1:%.*]], label [[IF_THEN_1:%.*]]
 ; ALL:       if.then.1:
 ; ALL-NEXT:    store i32 [[OR]], i32* [[C]], align 4
-; ALL-NEXT:    tail call void @foo()
+; ALL-NEXT:    call void @foo()
 ; ALL-NEXT:    br label [[IF_END_1]]
 ; ALL:       if.end.1:
 ; ALL-NEXT:    [[CHANGED_1_OFF0_1:%.*]] = phi i1 [ true, [[IF_THEN_1]] ], [ [[CHANGED_1_OFF0]], [[IF_END]] ]
--- a/llvm/test/Transforms/PhaseOrdering/two-shifts-by-sext.ll
+++ b/llvm/test/Transforms/PhaseOrdering/two-shifts-by-sext.ll
@ -74,7 +74,7 @@ define i32 @two_shifts_by_same_sext(i32 %val, i8 signext %len) {
 define i32 @two_shifts_by_sext_with_extra_use(i32 %val, i8 signext %len) {
 ; CHECK-LABEL: @two_shifts_by_sext_with_extra_use(
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[LEN:%.*]] to i32
-; CHECK-NEXT:    tail call void @use_int32(i32 [[CONV]])
+; CHECK-NEXT:    call void @use_int32(i32 [[CONV]])
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[VAL:%.*]], [[CONV]]
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[SHL]], [[CONV]]
 ; CHECK-NEXT:    ret i32 [[SHR]]
@ -101,7 +101,7 @@ declare void @use_int32(i32)
 define i32 @two_shifts_by_same_sext_with_extra_use(i32 %val, i8 signext %len) {
 ; CHECK-LABEL: @two_shifts_by_same_sext_with_extra_use(
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i8 [[LEN:%.*]] to i32
-; CHECK-NEXT:    tail call void @use_int32(i32 [[CONV]])
+; CHECK-NEXT:    call void @use_int32(i32 [[CONV]])
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[VAL:%.*]], [[CONV]]
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[SHL]], [[CONV]]
 ; CHECK-NEXT:    ret i32 [[SHR]]