[AArch64] Regenerate some test checks. NFC

This patch just reruns the update_llc_test_checks script on the AArch64
tests that claim to have been updated by the script, cleaning up the output.
David Green 2021-09-08 11:08:32 +01:00
parent c01b76e733
commit caabf2a445
16 changed files with 321 additions and 351 deletions
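
For reference, a typical regeneration run looks roughly like this (the test
path and build directory are illustrative; --llc-binary points the script at
a locally built llc):

  llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
      llvm/test/CodeGen/AArch64/sve-select.ll

The script reruns each RUN line and rewrites the CHECK lines in place, which
is why the differences below are purely mechanical: restored comment suffixes
such as // =0x7f, expanded movi immediates, and hardcoded OUTLINED_FUNCTION
names in place of FileCheck variables.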

@@ -9,6 +9,16 @@ target triple = "aarch64-unknown-linux-gnu"
; here, only that this case no longer causes said crash.
define dso_local i32 @dupext_crashtest(i32 %e) local_unnamed_addr {
; CHECK-LABEL: dupext_crashtest:
; CHECK: // %bb.0: // %for.body.lr.ph
; CHECK-NEXT: mov w8, w0
; CHECK-NEXT: dup v0.2s, w8
; CHECK-NEXT: .LBB0_1: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr d1, [x8]
; CHECK-NEXT: smull v1.2d, v0.2s, v1.2s
; CHECK-NEXT: xtn v1.2s, v1.2d
; CHECK-NEXT: str d1, [x8]
; CHECK-NEXT: b .LBB0_1
for.body.lr.ph:
%conv314 = zext i32 %e to i64
br label %vector.memcheck

@@ -11,7 +11,7 @@ define half @Const0() {
;
; CHECK-ZCZ-LABEL: Const0:
; CHECK-ZCZ: // %bb.0: // %entry
; CHECK-ZCZ-NEXT: movi d0, #0
; CHECK-ZCZ-NEXT: movi d0, #0000000000000000
; CHECK-ZCZ-NEXT: ret
;
; CHECK-NOFP16-LABEL: Const0:

@@ -10,8 +10,7 @@ declare half @llvm.fma.f16(half, half, half) #1
define dso_local <4 x half> @t_vfma_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfma_lane_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
@@ -23,8 +22,7 @@ entry:
define dso_local <8 x half> @t_vfmaq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmaq_lane_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
@@ -36,8 +34,7 @@ entry:
define dso_local <4 x half> @t_vfma_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfma_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
entry:
@@ -48,8 +45,7 @@ entry:
define dso_local <8 x half> @t_vfmaq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmaq_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
entry:
@@ -60,8 +56,7 @@ entry:
define dso_local <4 x half> @t_vfma_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
; CHECK-LABEL: t_vfma_n_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
; CHECK-NEXT: fmla v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
@@ -74,8 +69,7 @@ entry:
define dso_local <8 x half> @t_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
; CHECK-LABEL: t_vfmaq_n_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
; CHECK-NEXT: fmla v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
@@ -88,8 +82,7 @@ entry:
define dso_local half @t_vfmah_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_lane_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla h0, h1, v2.h[0]
; CHECK-NEXT: ret
@@ -101,8 +94,7 @@ entry:
define dso_local half @t_vfmah_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmah_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmla h0, h1, v2.h[0]
; CHECK-NEXT: ret
entry:
@@ -113,8 +105,7 @@ entry:
define dso_local <4 x half> @t_vfms_lane_f16(<4 x half> %a, <4 x half> %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfms_lane_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
@@ -127,8 +118,7 @@ entry:
define dso_local <8 x half> @t_vfmsq_lane_f16(<8 x half> %a, <8 x half> %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsq_lane_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
@@ -141,8 +131,7 @@ entry:
define dso_local <4 x half> @t_vfms_laneq_f16(<4 x half> %a, <4 x half> %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfms_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
entry:
@@ -154,8 +143,7 @@ entry:
define dso_local <8 x half> @t_vfmsq_laneq_f16(<8 x half> %a, <8 x half> %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsq_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
entry:
@@ -167,8 +155,7 @@ entry:
define dso_local <4 x half> @t_vfms_n_f16(<4 x half> %a, <4 x half> %b, half %c) {
; CHECK-LABEL: t_vfms_n_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
; CHECK-NEXT: fmls v0.4h, v1.4h, v2.h[0]
; CHECK-NEXT: ret
@@ -182,8 +169,7 @@ entry:
define dso_local <8 x half> @t_vfmsq_n_f16(<8 x half> %a, <8 x half> %b, half %c) {
; CHECK-LABEL: t_vfmsq_n_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h2 killed $h2 def $q2
; CHECK-NEXT: fmls v0.8h, v1.8h, v2.h[0]
; CHECK-NEXT: ret
@@ -197,8 +183,7 @@ entry:
define dso_local half @t_vfmsh_lane_f16(half %a, half %b, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_lane_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls h0, h1, v2.h[0]
; CHECK-NEXT: ret
@@ -211,8 +196,7 @@ entry:
define dso_local half @t_vfmsh_laneq_f16(half %a, half %b, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vfmsh_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls h0, h1, v2.h[0]
; CHECK-NEXT: ret
entry:
@@ -224,8 +208,7 @@ entry:
define dso_local <4 x half> @t_vmul_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmul_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmul v0.4h, v0.4h, v1.h[0]
; CHECK-NEXT: ret
entry:
@@ -236,8 +219,7 @@ entry:
define dso_local <8 x half> @t_vmulq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulq_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmul v0.8h, v0.8h, v1.h[0]
; CHECK-NEXT: ret
entry:
@@ -248,8 +230,7 @@ entry:
define dso_local half @t_vmulh_lane_f16(half %a, <4 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vmulh_lane_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmul h0, h0, v1.h[0]
; CHECK-NEXT: ret
@@ -261,8 +242,7 @@ entry:
define dso_local half @t_vmulh_laneq_f16(half %a, <8 x half> %c, i32 %lane) {
; CHECK-LABEL: t_vmulh_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmul h0, h0, v1.h[0]
; CHECK-NEXT: ret
entry:
@@ -273,8 +253,7 @@ entry:
define dso_local half @t_vmulx_f16(half %a, half %b) {
; CHECK-LABEL: t_vmulx_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx h0, h0, h1
; CHECK-NEXT: ret
entry:
@@ -284,8 +263,7 @@ entry:
define dso_local half @t_vmulxh_lane_f16(half %a, <4 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulxh_lane_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmulx h0, h0, v1.h[3]
; CHECK-NEXT: ret
@@ -297,8 +275,7 @@ entry:
define dso_local <4 x half> @t_vmulx_lane_f16(<4 x half> %a, <4 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulx_lane_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmulx v0.4h, v0.4h, v1.h[0]
; CHECK-NEXT: ret
@@ -310,8 +287,7 @@ entry:
define dso_local <8 x half> @t_vmulxq_lane_f16(<8 x half> %a, <4 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulxq_lane_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: fmulx v0.8h, v0.8h, v1.h[0]
; CHECK-NEXT: ret
@@ -323,8 +299,7 @@ entry:
define dso_local <4 x half> @t_vmulx_laneq_f16(<4 x half> %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulx_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx v0.4h, v0.4h, v1.h[0]
; CHECK-NEXT: ret
entry:
@@ -335,8 +310,7 @@ entry:
define dso_local <8 x half> @t_vmulxq_laneq_f16(<8 x half> %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulxq_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx v0.8h, v0.8h, v1.h[0]
; CHECK-NEXT: ret
entry:
@@ -347,8 +321,7 @@ entry:
define dso_local half @t_vmulxh_laneq_f16(half %a, <8 x half> %b, i32 %lane) {
; CHECK-LABEL: t_vmulxh_laneq_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmulx h0, h0, v1.h[7]
; CHECK-NEXT: ret
entry:
@@ -359,8 +332,7 @@ entry:
define dso_local <4 x half> @t_vmulx_n_f16(<4 x half> %a, half %c) {
; CHECK-LABEL: t_vmulx_n_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
; CHECK-NEXT: dup v1.4h, v1.h[0]
; CHECK-NEXT: fmulx v0.4h, v0.4h, v1.4h
@@ -374,8 +346,7 @@ entry:
define dso_local <8 x half> @t_vmulxq_n_f16(<8 x half> %a, half %c) {
; CHECK-LABEL: t_vmulxq_n_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h1 killed $h1 def $q1
; CHECK-NEXT: dup v1.8h, v1.h[0]
; CHECK-NEXT: fmulx v0.8h, v0.8h, v1.8h
@@ -389,8 +360,7 @@ entry:
define dso_local half @t_vfmah_lane3_f16(half %a, half %b, <4 x half> %c) {
; CHECK-LABEL: t_vfmah_lane3_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmla h0, h1, v2.h[3]
; CHECK-NEXT: ret
@@ -402,8 +372,7 @@ entry:
define dso_local half @t_vfmah_laneq7_f16(half %a, half %b, <8 x half> %c) {
; CHECK-LABEL: t_vfmah_laneq7_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmla h0, h1, v2.h[7]
; CHECK-NEXT: ret
entry:
@@ -414,8 +383,7 @@ entry:
define dso_local half @t_vfmsh_lane3_f16(half %a, half %b, <4 x half> %c) {
; CHECK-LABEL: t_vfmsh_lane3_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT: fmls h0, h1, v2.h[3]
; CHECK-NEXT: ret
@@ -428,8 +396,7 @@ entry:
define dso_local half @t_vfmsh_laneq7_f16(half %a, half %b, <8 x half> %c) {
; CHECK-LABEL: t_vfmsh_laneq7_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmls h0, h1, v2.h[7]
; CHECK-NEXT: ret
entry:
@@ -441,8 +408,7 @@ entry:
define dso_local half @t_fadd_vfmah_f16(half %a, half %b, <4 x half> %c, <4 x half> %d) {
; CHECK-LABEL: t_fadd_vfmah_f16:
; CHECK-NEXT: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fadd v2.4h, v2.4h, v3.4h
; CHECK-NEXT: fmla h0, h1, v2.h[3]
; CHECK-NEXT: ret

@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm64eb-unknown | FileCheck %s
; RUN: llc < %s -mtriple=aarch64_be-unknown | FileCheck %s
; i8* p; // p is 4 byte aligned
; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
@@ -207,7 +207,6 @@ define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
; CHECK-NEXT: ldur w8, [x0, #1]
; CHECK-NEXT: rev w0, w8
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
%tmp2 = load i8, i8* %tmp1, align 4
@@ -238,7 +237,6 @@ define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
; CHECK-NEXT: ldur w8, [x0, #-4]
; CHECK-NEXT: rev w0, w8
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
%tmp2 = load i8, i8* %tmp1, align 4
@@ -268,7 +266,6 @@ define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldur w0, [x0, #1]
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
%tmp2 = load i8, i8* %tmp1, align 1
@@ -298,7 +295,6 @@ define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldur w0, [x0, #-4]
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
%tmp2 = load i8, i8* %tmp1, align 1
@@ -449,7 +445,6 @@ define i32 @zext_load_i32_by_i8(i32* %arg) {
; CHECK-NEXT: lsl w8, w8, #16
; CHECK-NEXT: rev w0, w8
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
%tmp2 = load i8, i8* %tmp1, align 2
@@ -472,7 +467,6 @@ define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
; CHECK-NEXT: lsl w0, w8, #8
; CHECK-NEXT: bfi w0, w9, #16, #8
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
%tmp2 = load i8, i8* %tmp1, align 2
@@ -496,7 +490,6 @@ define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
; CHECK-NEXT: lsl w0, w8, #16
; CHECK-NEXT: bfi w0, w9, #24, #8
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
%tmp2 = load i8, i8* %tmp1, align 2
@@ -516,7 +509,6 @@ define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
; CHECK: // %bb.0:
; CHECK-NEXT: ldrh w0, [x0]
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
%tmp2 = load i8, i8* %tmp1, align 1
@@ -539,7 +531,6 @@ define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
; CHECK-NEXT: lsl w0, w8, #8
; CHECK-NEXT: bfi w0, w9, #16, #8
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
%tmp2 = load i8, i8* %tmp1, align 1
@@ -563,7 +554,6 @@ define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
; CHECK-NEXT: lsl w0, w8, #16
; CHECK-NEXT: bfi w0, w9, #24, #8
; CHECK-NEXT: ret
%tmp = bitcast i32* %arg to i8*
%tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
%tmp2 = load i8, i8* %tmp1, align 1
@@ -590,7 +580,6 @@ define i16 @load_i16_from_nonzero_offset(i8* %p) {
; CHECK-NEXT: ldrb w0, [x0, #2]
; CHECK-NEXT: bfi w0, w8, #8, #24
; CHECK-NEXT: ret
%p1.i16 = bitcast i8* %p to i16*
%p2.i8 = getelementptr i8, i8* %p, i64 2
%v1 = load i16, i16* %p1.i16

@@ -12,7 +12,7 @@ define i32 @a() {
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl [[OUTLINED_DIRECT:OUTLINED_FUNCTION_[0-9]+]]
; CHECK-NEXT: bl OUTLINED_FUNCTION_1
; CHECK-NEXT: add w0, w0, #8
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -28,7 +28,7 @@ define i32 @b() {
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl [[OUTLINED_DIRECT]]
; CHECK-NEXT: bl OUTLINED_FUNCTION_1
; CHECK-NEXT: add w0, w0, #88
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -44,7 +44,7 @@ define hidden i32 @c(i32 (i32, i32, i32, i32)* %fptr) {
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl [[OUTLINED_INDIRECT:OUTLINED_FUNCTION_[0-9]+]]
; CHECK-NEXT: bl OUTLINED_FUNCTION_0
; CHECK-NEXT: add w0, w0, #8
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -60,7 +60,7 @@ define hidden i32 @d(i32 (i32, i32, i32, i32)* %fptr) {
; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: bl [[OUTLINED_INDIRECT]]
; CHECK-NEXT: bl OUTLINED_FUNCTION_0
; CHECK-NEXT: add w0, w0, #88
; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -70,8 +70,8 @@ entry:
ret i32 %add
}
; CHECK: [[OUTLINED_INDIRECT]]:
; CHECK-SAME: // @[[OUTLINED_INDIRECT]] Thunk
; CHECK: OUTLINED_FUNCTION_0:
; CHECK-SAME: // @OUTLINED_FUNCTION_0 Thunk
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, x0
; CHECK-NEXT: mov w0, #1
@@ -80,8 +80,8 @@ entry:
; CHECK-NEXT: mov w3, #4
; CHECK-NEXT: br x8
; CHECK: [[OUTLINED_DIRECT]]:
; CHECK-SAME: // @[[OUTLINED_DIRECT]] Thunk
; CHECK: OUTLINED_FUNCTION_1:
; CHECK-SAME: // @OUTLINED_FUNCTION_1 Thunk
; CHECK: // %bb.0:
; CHECK-NEXT: mov w0, #1
; CHECK-NEXT: mov w1, #2

@@ -50,22 +50,24 @@ define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) #0 {
; Verify splitvec type legalisation works as expected.
define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) #0 {
; CHECK-LABEL: reverse_nxv32i1:
; CHECK-SELDAG-LABEL: reverse_nxv32i1:
; CHECK-SELDAG: // %bb.0:
; CHECK-SELDAG-NEXT: rev p2.b, p1.b
; CHECK-SELDAG-NEXT: rev p1.b, p0.b
; CHECK-SELDAG-NEXT: mov p0.b, p2.b
; CHECK-SELDAG-NEXT: ret
;
; CHECK-FASTISEL-LABEL: reverse_nxv32i1:
; CHECK-FASTISEL: // %bb.0:
; CHECK-FASTISEL-NEXT: str x29, [sp, #-16]
; CHECK-FASTISEL-NEXT: addvl sp, sp, #-1
; CHECK-FASTISEL-NEXT: str p1, [sp, #7, mul vl]
; CHECK-FASTISEL-NEXT: mov p1.b, p0.b
; CHECK-FASTISEL-NEXT: ldr p0, [sp, #7, mul vl]
; CHECK-FASTISEL-NEXT: rev p0.b, p0.b
; CHECK-FASTISEL-NEXT: rev p1.b, p1.b
; CHECK-FASTISEL-NEXT: addvl sp, sp, #1
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16
; CHECK-FASTISEL-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-FASTISEL-NEXT: addvl sp, sp, #-1
; CHECK-FASTISEL-NEXT: str p1, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-FASTISEL-NEXT: mov p1.b, p0.b
; CHECK-FASTISEL-NEXT: ldr p0, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-FASTISEL-NEXT: rev p0.b, p0.b
; CHECK-FASTISEL-NEXT: rev p1.b, p1.b
; CHECK-FASTISEL-NEXT: addvl sp, sp, #1
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-FASTISEL-NEXT: ret
%res = call <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
@@ -158,22 +160,24 @@ define <vscale x 2 x i8> @reverse_nxv2i8(<vscale x 2 x i8> %a) #0 {
; Verify splitvec type legalisation works as expected.
define <vscale x 8 x i32> @reverse_nxv8i32(<vscale x 8 x i32> %a) #0 {
; CHECK-LABEL: reverse_nxv8i32:
; CHECK-SELDAG-LABEL: reverse_nxv8i32:
; CHECK-SELDAG: // %bb.0:
; CHECK-SELDAG-NEXT: rev z2.s, z1.s
; CHECK-SELDAG-NEXT: rev z1.s, z0.s
; CHECK-SELDAG-NEXT: mov z0.d, z2.d
; CHECK-SELDAG-NEXT: ret
;
; CHECK-FASTISEL-LABEL: reverse_nxv8i32:
; CHECK-FASTISEL: // %bb.0:
; CHECK-FASTISEL-NEXT: str x29, [sp, #-16]
; CHECK-FASTISEL-NEXT: addvl sp, sp, #-1
; CHECK-FASTISEL-NEXT: str z1, [sp]
; CHECK-FASTISEL-NEXT: mov z1.d, z0.d
; CHECK-FASTISEL-NEXT: ldr z0, [sp]
; CHECK-FASTISEL-NEXT: rev z0.s, z0.s
; CHECK-FASTISEL-NEXT: rev z1.s, z1.s
; CHECK-FASTISEL-NEXT: addvl sp, sp, #1
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16
; CHECK-FASTISEL-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-FASTISEL-NEXT: addvl sp, sp, #-1
; CHECK-FASTISEL-NEXT: str z1, [sp] // 16-byte Folded Spill
; CHECK-FASTISEL-NEXT: mov z1.d, z0.d
; CHECK-FASTISEL-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-FASTISEL-NEXT: rev z0.s, z0.s
; CHECK-FASTISEL-NEXT: rev z1.s, z1.s
; CHECK-FASTISEL-NEXT: addvl sp, sp, #1
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-FASTISEL-NEXT: ret
%res = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
@@ -182,7 +186,7 @@ define <vscale x 8 x i32> @reverse_nxv8i32(<vscale x 8 x i32> %a) #0 {
; Verify splitvec type legalisation works as expected.
define <vscale x 16 x float> @reverse_nxv16f32(<vscale x 16 x float> %a) #0 {
; CHECK-LABEL: reverse_nxv16f32:
; CHECK-SELDAG-LABEL: reverse_nxv16f32:
; CHECK-SELDAG: // %bb.0:
; CHECK-SELDAG-NEXT: rev z5.s, z3.s
; CHECK-SELDAG-NEXT: rev z4.s, z2.s
@@ -191,21 +195,23 @@ define <vscale x 16 x float> @reverse_nxv16f32(<vscale x 16 x float> %a) #0 {
; CHECK-SELDAG-NEXT: mov z0.d, z5.d
; CHECK-SELDAG-NEXT: mov z1.d, z4.d
; CHECK-SELDAG-NEXT: ret
;
; CHECK-FASTISEL-LABEL: reverse_nxv16f32:
; CHECK-FASTISEL: // %bb.0:
; CHECK-FASTISEL-NEXT: str x29, [sp, #-16]
; CHECK-FASTISEL-NEXT: addvl sp, sp, #-2
; CHECK-FASTISEL-NEXT: str z3, [sp, #1, mul vl]
; CHECK-FASTISEL-NEXT: str z2, [sp]
; CHECK-FASTISEL-NEXT: mov z2.d, z1.d
; CHECK-FASTISEL-NEXT: ldr z1, [sp]
; CHECK-FASTISEL-NEXT: mov z3.d, z0.d
; CHECK-FASTISEL-NEXT: ldr z0, [sp, #1, mul vl]
; CHECK-FASTISEL-NEXT: rev z0.s, z0.s
; CHECK-FASTISEL-NEXT: rev z1.s, z1.s
; CHECK-FASTISEL-NEXT: rev z2.s, z2.s
; CHECK-FASTISEL-NEXT: rev z3.s, z3.s
; CHECK-FASTISEL-NEXT: addvl sp, sp, #2
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16
; CHECK-FASTISEL-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-FASTISEL-NEXT: addvl sp, sp, #-2
; CHECK-FASTISEL-NEXT: str z3, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-FASTISEL-NEXT: str z2, [sp] // 16-byte Folded Spill
; CHECK-FASTISEL-NEXT: mov z2.d, z1.d
; CHECK-FASTISEL-NEXT: ldr z1, [sp] // 16-byte Folded Reload
; CHECK-FASTISEL-NEXT: mov z3.d, z0.d
; CHECK-FASTISEL-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-FASTISEL-NEXT: rev z0.s, z0.s
; CHECK-FASTISEL-NEXT: rev z1.s, z1.s
; CHECK-FASTISEL-NEXT: rev z2.s, z2.s
; CHECK-FASTISEL-NEXT: rev z3.s, z3.s
; CHECK-FASTISEL-NEXT: addvl sp, sp, #2
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-FASTISEL-NEXT: ret
%res = call <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float> %a)

@@ -1,4 +1,3 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-- -debug 2>&1 | FileCheck %s
; REQUIRES: asserts

@@ -18,8 +18,8 @@ define i32 @test1(i32 %x, i64 %y) {
define i64 @test2(i32 %x, i64 %y) {
; CHECK-LABEL: test2:
; CHECK: // %bb.0:
; CHECK-NEXT: neg w[[REG:[0-9]+]], w0
; CHECK-NEXT: asr x0, x1, x[[REG]]
; CHECK-NEXT: neg w8, w0
; CHECK-NEXT: asr x0, x1, x8
; CHECK-NEXT: ret
%sub9 = sub nsw i32 64, %x
%sh_prom12.i = zext i32 %sub9 to i64

@@ -5,6 +5,7 @@
; PR20558
; Load the stack guard for the second time, just in case the previous value gets spilled.
define i32 @test_stack_guard_remat2() ssp {
; CHECK-LABEL: test_stack_guard_remat2:
; CHECK: ; %bb.0: ; %entry
@@ -17,7 +18,6 @@ define i32 @test_stack_guard_remat2() ssp {
; CHECK-NEXT: Lloh0:
; CHECK-NEXT: adrp x8, ___stack_chk_guard@GOTPAGE
; CHECK-NEXT: Lloh1:
; Load the stack guard for the second time, just in case the previous value gets spilled.
; CHECK-NEXT: adrp x9, ___stack_chk_guard@GOTPAGE
; CHECK-NEXT: Lloh2:
; CHECK-NEXT: ldr x8, [x8, ___stack_chk_guard@GOTPAGEOFF]

@@ -1,39 +1,23 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-linux-gnu | FileCheck --check-prefixes=CHECK,NOSPLIT %s
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-linux-gnu | FileCheck --check-prefixes=CHECK,NOSPLIT %s
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck --check-prefixes=CHECK,NOSPLIT %s
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-linux-gnu -mcpu=exynos-m3 | FileCheck --check-prefixes=CHECK,NOSPLIT %s
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-linux-gnu | FileCheck %s
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-linux-gnu | FileCheck %s
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-linux-gnu -mcpu=exynos-m3 | FileCheck %s
define void @test_split_f(<4 x float> %val, <4 x float>* %addr) {
; NOSPLIT-LABEL: test_split_f:
; NOSPLIT: // %bb.0:
; NOSPLIT-NEXT: str q0, [x0]
; NOSPLIT-NEXT: ret
;
; SPLIT-LABEL: test_split_f:
; SPLIT: // %bb.0:
; SPLIT-NEXT: rev64 v0.4s, v0.4s
; SPLIT-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; SPLIT-NEXT: st1 { v0.2s }, [x0]
; SPLIT-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; SPLIT-NEXT: add x8, x0, #8 // =8
; SPLIT-NEXT: st1 { v0.2s }, [x8]
; SPLIT-NEXT: ret
; CHECK-LABEL: test_split_f:
; CHECK: // %bb.0:
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
store <4 x float> %val, <4 x float>* %addr, align 8
ret void
}
define void @test_split_d(<2 x double> %val, <2 x double>* %addr) {
; NOSPLIT-LABEL: test_split_d:
; NOSPLIT: // %bb.0:
; NOSPLIT-NEXT: str q0, [x0]
; NOSPLIT-NEXT: ret
;
; SPLIT-LABEL: test_split_d:
; SPLIT: // %bb.0:
; SPLIT-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; SPLIT-NEXT: st1 { v0.2d }, [x0]
; SPLIT-NEXT: ret
; CHECK-LABEL: test_split_d:
; CHECK: // %bb.0:
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
store <2 x double> %val, <2 x double>* %addr, align 8
ret void
}

@@ -6,7 +6,7 @@
define <vscale x 16 x i8> @add_i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: add_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: add z0.b, z0.b, #127
; CHECK-NEXT: add z0.b, z0.b, #127 // =0x7f
; CHECK-NEXT: ret
%pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
%elt = insertelement <vscale x 16 x i8> undef, i8 127, i32 0
@@ -20,7 +20,7 @@ define <vscale x 16 x i8> @add_i8(<vscale x 16 x i8> %a) {
define <vscale x 8 x i16> @add_i16(<vscale x 8 x i16> %a) {
; CHECK-LABEL: add_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: add z0.h, z0.h, #127
; CHECK-NEXT: add z0.h, z0.h, #127 // =0x7f
; CHECK-NEXT: ret
%pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%elt = insertelement <vscale x 8 x i16> undef, i16 127, i32 0
@@ -50,7 +50,7 @@ define <vscale x 8 x i16> @add_i16_out_of_range(<vscale x 8 x i16> %a) {
define <vscale x 4 x i32> @add_i32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: add_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: add z0.s, z0.s, #127
; CHECK-NEXT: add z0.s, z0.s, #127 // =0x7f
; CHECK-NEXT: ret
%pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
%elt = insertelement <vscale x 4 x i32> undef, i32 127, i32 0
@@ -80,7 +80,7 @@ define <vscale x 4 x i32> @add_i32_out_of_range(<vscale x 4 x i32> %a) {
define <vscale x 2 x i64> @add_i64(<vscale x 2 x i64> %a) {
; CHECK-LABEL: add_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: add z0.d, z0.d, #127
; CHECK-NEXT: add z0.d, z0.d, #127 // =0x7f
; CHECK-NEXT: ret
%pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%elt = insertelement <vscale x 2 x i64> undef, i64 127, i64 0
@@ -112,7 +112,7 @@ define <vscale x 2 x i64> @add_i64_out_of_range(<vscale x 2 x i64> %a) {
define <vscale x 16 x i8> @sub_i8(<vscale x 16 x i8> %a) {
; CHECK-LABEL: sub_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sub z0.b, z0.b, #127
; CHECK-NEXT: sub z0.b, z0.b, #127 // =0x7f
; CHECK-NEXT: ret
%pg = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
%elt = insertelement <vscale x 16 x i8> undef, i8 127, i32 0
@@ -126,7 +126,7 @@ define <vscale x 16 x i8> @sub_i8(<vscale x 16 x i8> %a) {
define <vscale x 8 x i16> @sub_i16(<vscale x 8 x i16> %a) {
; CHECK-LABEL: sub_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sub z0.h, z0.h, #127
; CHECK-NEXT: sub z0.h, z0.h, #127 // =0x7f
; CHECK-NEXT: ret
%pg = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%elt = insertelement <vscale x 8 x i16> undef, i16 127, i32 0
@@ -156,7 +156,7 @@ define <vscale x 8 x i16> @sub_i16_out_of_range(<vscale x 8 x i16> %a) {
define <vscale x 4 x i32> @sub_i32(<vscale x 4 x i32> %a) {
; CHECK-LABEL: sub_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sub z0.s, z0.s, #127
; CHECK-NEXT: sub z0.s, z0.s, #127 // =0x7f
; CHECK-NEXT: ret
%pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
%elt = insertelement <vscale x 4 x i32> undef, i32 127, i32 0
@@ -186,7 +186,7 @@ define <vscale x 4 x i32> @sub_i32_out_of_range(<vscale x 4 x i32> %a) {
define <vscale x 2 x i64> @sub_i64(<vscale x 2 x i64> %a) {
; CHECK-LABEL: sub_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: sub z0.d, z0.d, #127
; CHECK-NEXT: sub z0.d, z0.d, #127 // =0x7f
; CHECK-NEXT: ret
%pg = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%elt = insertelement <vscale x 2 x i64> undef, i64 127, i64 0
@@ -216,8 +216,9 @@ define <vscale x 2 x i64> @sub_i64_out_of_range(<vscale x 2 x i64> %a) {
; As sub_i32 but where pg is i8 based and thus compatible for i32.
define <vscale x 4 x i32> @sub_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: sub_i32_ptrue_all_b:
; CHECK: sub z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: sub z0.s, z0.s, #1 // =0x1
; CHECK-NEXT: ret
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
%b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
@@ -230,8 +231,9 @@ define <vscale x 4 x i32> @sub_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; As sub_i32 but where pg is i16 based and thus compatible for i32.
define <vscale x 4 x i32> @sub_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: sub_i32_ptrue_all_h:
; CHECK: sub z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: sub z0.s, z0.s, #1 // =0x1
; CHECK-NEXT: ret
%pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -246,10 +248,11 @@ define <vscale x 4 x i32> @sub_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; thus inactive lanes are important and the immediate form cannot be used.
define <vscale x 4 x i32> @sub_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: sub_i32_ptrue_all_d:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
; CHECK-DAG: sub z0.s, [[PG]]/m, z0.s, [[DUP]].s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z1.s, #1 // =0x1
; CHECK-NEXT: sub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -372,8 +375,9 @@ define <vscale x 2 x i64> @smax_i64_out_of_range(<vscale x 2 x i64> %a) {
; As smax_i32 but where pg is i8 based and thus compatible for i32.
define <vscale x 4 x i32> @smax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: smax_i32_ptrue_all_b:
; CHECK: smax z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: smax z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
%b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
@@ -386,8 +390,9 @@ define <vscale x 4 x i32> @smax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; As smax_i32 but where pg is i16 based and thus compatible for i32.
define <vscale x 4 x i32> @smax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: smax_i32_ptrue_all_h:
; CHECK: smax z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: smax z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -402,10 +407,11 @@ define <vscale x 4 x i32> @smax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; thus inactive lanes are important and the immediate form cannot be used.
define <vscale x 4 x i32> @smax_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: smax_i32_ptrue_all_d:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
; CHECK-DAG: smax z0.s, [[PG]]/m, z0.s, [[DUP]].s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z1.s, #1 // =0x1
; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -528,8 +534,9 @@ define <vscale x 2 x i64> @smin_i64_out_of_range(<vscale x 2 x i64> %a) {
; As smin_i32 but where pg is i8 based and thus compatible for i32.
define <vscale x 4 x i32> @smin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: smin_i32_ptrue_all_b:
; CHECK: smin z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: smin z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
%b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
@@ -542,8 +549,9 @@ define <vscale x 4 x i32> @smin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; As smin_i32 but where pg is i16 based and thus compatible for i32.
define <vscale x 4 x i32> @smin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: smin_i32_ptrue_all_h:
; CHECK: smin z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: smin z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -558,10 +566,11 @@ define <vscale x 4 x i32> @smin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; thus inactive lanes are important and the immediate form cannot be used.
define <vscale x 4 x i32> @smin_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: smin_i32_ptrue_all_d:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
; CHECK-DAG: smin z0.s, [[PG]]/m, z0.s, [[DUP]].s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z1.s, #1 // =0x1
; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -684,8 +693,9 @@ define <vscale x 2 x i64> @umax_i64_out_of_range(<vscale x 2 x i64> %a) {
; As umax_i32 but where pg is i8 based and thus compatible for i32.
define <vscale x 4 x i32> @umax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: umax_i32_ptrue_all_b:
; CHECK: umax z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: umax z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
%b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
@@ -698,8 +708,9 @@ define <vscale x 4 x i32> @umax_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; As umax_i32 but where pg is i16 based and thus compatible for i32.
define <vscale x 4 x i32> @umax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: umax_i32_ptrue_all_h:
; CHECK: umax z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: umax z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -714,10 +725,11 @@ define <vscale x 4 x i32> @umax_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; thus inactive lanes are important and the immediate form cannot be used.
define <vscale x 4 x i32> @umax_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: umax_i32_ptrue_all_d:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
; CHECK-DAG: umax z0.s, [[PG]]/m, z0.s, [[DUP]].s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z1.s, #1 // =0x1
; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -840,8 +852,9 @@ define <vscale x 2 x i64> @umin_i64_out_of_range(<vscale x 2 x i64> %a) {
; As umin_i32 but where pg is i8 based and thus compatible for i32.
define <vscale x 4 x i32> @umin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: umin_i32_ptrue_all_b:
; CHECK: umin z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: umin z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
%b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
@@ -854,8 +867,9 @@ define <vscale x 4 x i32> @umin_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; As umin_i32 but where pg is i16 based and thus compatible for i32.
define <vscale x 4 x i32> @umin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: umin_i32_ptrue_all_h:
; CHECK: umin z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: umin z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -870,10 +884,11 @@ define <vscale x 4 x i32> @umin_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; thus inactive lanes are important and the immediate form cannot be used.
define <vscale x 4 x i32> @umin_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: umin_i32_ptrue_all_d:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
; CHECK-DAG: umin z0.s, [[PG]]/m, z0.s, [[DUP]].s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z1.s, #1 // =0x1
; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -1769,8 +1784,9 @@ define <vscale x 2 x i64> @lsr_i64_too_small(<vscale x 2 x i1> %pg, <vscale x 2
; As lsr_i32 but where pg is i8 based and thus compatible for i32.
define <vscale x 4 x i32> @lsr_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: lsr_i32_ptrue_all_b:
; CHECK: lsr z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: lsr z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
%b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
@@ -1783,8 +1799,9 @@ define <vscale x 4 x i32> @lsr_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; As lsr_i32 but where pg is i16 based and thus compatible for i32.
define <vscale x 4 x i32> @lsr_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: lsr_i32_ptrue_all_h:
; CHECK: lsr z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: lsr z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -1799,9 +1816,10 @@ define <vscale x 4 x i32> @lsr_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; thus inactive lanes are important and the immediate form cannot be used.
define <vscale x 4 x i32> @lsr_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: lsr_i32_ptrue_all_d:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
; CHECK-DAG: lsr z0.s, [[PG]]/m, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: lsr z0.s, p0/m, z0.s, #1
; CHECK-NEXT: ret
%pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -1819,8 +1837,9 @@ define <vscale x 4 x i32> @lsr_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
; As mul_i32 but where pg is i8 based and thus compatible for i32.
define <vscale x 4 x i32> @mul_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: mul_i32_ptrue_all_b:
; CHECK: mul z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: mul z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
%b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
@@ -1833,8 +1852,9 @@ define <vscale x 4 x i32> @mul_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; As mul_i32 but where pg is i16 based and thus compatible for i32.
define <vscale x 4 x i32> @mul_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: mul_i32_ptrue_all_h:
; CHECK: mul z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: mul z0.s, z0.s, #1
; CHECK-NEXT: ret
%pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@@ -1849,10 +1869,11 @@ define <vscale x 4 x i32> @mul_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; thus inactive lanes are important and the immediate form cannot be used.
define <vscale x 4 x i32> @mul_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: mul_i32_ptrue_all_d:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
; CHECK-DAG: mul z0.s, [[PG]]/m, z0.s, [[DUP]].s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z1.s, #1 // =0x1
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)

@@ -135,41 +135,37 @@ define <vscale x 2 x i1> @select_nxv2i1(i1 %cond, <vscale x 2 x i1> %a, <vscal
; Integer vector select
define <vscale x 16 x i8> @sel_nxv16i8(<vscale x 16 x i1> %p,
<vscale x 16 x i8> %dst,
<vscale x 16 x i8> %a) {
define <vscale x 16 x i8> @sel_nxv16i8(<vscale x 16 x i1> %p, <vscale x 16 x i8> %dst, <vscale x 16 x i8> %a) {
; CHECK-LABEL: sel_nxv16i8:
; CHECK: mov z0.b, p0/m, z1.b
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.b, p0/m, z1.b
; CHECK-NEXT: ret
%sel = select <vscale x 16 x i1> %p, <vscale x 16 x i8> %a, <vscale x 16 x i8> %dst
ret <vscale x 16 x i8> %sel
}
define <vscale x 8 x i16> @sel_nxv8i16(<vscale x 8 x i1> %p,
<vscale x 8 x i16> %dst,
<vscale x 8 x i16> %a) {
define <vscale x 8 x i16> @sel_nxv8i16(<vscale x 8 x i1> %p, <vscale x 8 x i16> %dst, <vscale x 8 x i16> %a) {
; CHECK-LABEL: sel_nxv8i16:
; CHECK: mov z0.h, p0/m, z1.h
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%sel = select <vscale x 8 x i1> %p, <vscale x 8 x i16> %a, <vscale x 8 x i16> %dst
ret <vscale x 8 x i16> %sel
}
define <vscale x 4 x i32> @sel_nxv4i32(<vscale x 4 x i1> %p,
<vscale x 4 x i32> %dst,
<vscale x 4 x i32> %a) {
define <vscale x 4 x i32> @sel_nxv4i32(<vscale x 4 x i1> %p, <vscale x 4 x i32> %dst, <vscale x 4 x i32> %a) {
; CHECK-LABEL: sel_nxv4i32:
; CHECK: mov z0.s, p0/m, z1.s
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.s, p0/m, z1.s
; CHECK-NEXT: ret
%sel = select <vscale x 4 x i1> %p, <vscale x 4 x i32> %a, <vscale x 4 x i32> %dst
ret <vscale x 4 x i32> %sel
}
define <vscale x 2 x i64> @sel_nxv2i64(<vscale x 2 x i1> %p,
<vscale x 2 x i64> %dst,
<vscale x 2 x i64> %a) {
define <vscale x 2 x i64> @sel_nxv2i64(<vscale x 2 x i1> %p, <vscale x 2 x i64> %dst, <vscale x 2 x i64> %a) {
; CHECK-LABEL: sel_nxv2i64:
; CHECK: mov z0.d, p0/m, z1.d
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%sel = select <vscale x 2 x i1> %p, <vscale x 2 x i64> %a, <vscale x 2 x i64> %dst
ret <vscale x 2 x i64> %sel
@@ -177,41 +173,37 @@ define <vscale x 2 x i64> @sel_nxv2i64(<vscale x 2 x i1> %p,
; Floating point vector select
define <vscale x 8 x half> @sel_nxv8f16(<vscale x 8 x i1> %p,
<vscale x 8 x half> %dst,
<vscale x 8 x half> %a) {
define <vscale x 8 x half> @sel_nxv8f16(<vscale x 8 x i1> %p, <vscale x 8 x half> %dst, <vscale x 8 x half> %a) {
; CHECK-LABEL: sel_nxv8f16:
; CHECK: mov z0.h, p0/m, z1.h
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.h, p0/m, z1.h
; CHECK-NEXT: ret
%sel = select <vscale x 8 x i1> %p, <vscale x 8 x half> %a, <vscale x 8 x half> %dst
ret <vscale x 8 x half> %sel
}
define <vscale x 4 x float> @sel_nxv4f32(<vscale x 4 x i1> %p,
<vscale x 4 x float> %dst,
<vscale x 4 x float> %a) {
define <vscale x 4 x float> @sel_nxv4f32(<vscale x 4 x i1> %p, <vscale x 4 x float> %dst, <vscale x 4 x float> %a) {
; CHECK-LABEL: sel_nxv4f32:
; CHECK: mov z0.s, p0/m, z1.s
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.s, p0/m, z1.s
; CHECK-NEXT: ret
%sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %a, <vscale x 4 x float> %dst
ret <vscale x 4 x float> %sel
}
define <vscale x 2 x float> @sel_nxv2f32(<vscale x 2 x i1> %p,
<vscale x 2 x float> %dst,
<vscale x 2 x float> %a) {
define <vscale x 2 x float> @sel_nxv2f32(<vscale x 2 x i1> %p, <vscale x 2 x float> %dst, <vscale x 2 x float> %a) {
; CHECK-LABEL: sel_nxv2f32:
; CHECK: mov z0.d, p0/m, z1.d
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%sel = select <vscale x 2 x i1> %p, <vscale x 2 x float> %a, <vscale x 2 x float> %dst
ret <vscale x 2 x float> %sel
}
define <vscale x 2 x double> @sel_nxv8f64(<vscale x 2 x i1> %p,
<vscale x 2 x double> %dst,
<vscale x 2 x double> %a) {
define <vscale x 2 x double> @sel_nxv8f64(<vscale x 2 x i1> %p, <vscale x 2 x double> %dst, <vscale x 2 x double> %a) {
; CHECK-LABEL: sel_nxv8f64:
; CHECK: mov z0.d, p0/m, z1.d
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%sel = select <vscale x 2 x i1> %p, <vscale x 2 x double> %a, <vscale x 2 x double> %dst
ret <vscale x 2 x double> %sel
@@ -220,13 +212,13 @@ define <vscale x 2 x double> @sel_nxv8f64(<vscale x 2 x i1> %p,
; Check icmp+select
define <vscale x 2 x half> @icmp_select_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv2f16
; CHECK-LABEL: icmp_select_nxv2f16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 2 x half> %a, <vscale x 2 x half> %b
@@ -234,13 +226,13 @@ define <vscale x 2 x half> @icmp_select_nxv2f16(<vscale x 2 x half> %a, <vscale
}
define <vscale x 2 x float> @icmp_select_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv2f32
; CHECK-LABEL: icmp_select_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 2 x float> %a, <vscale x 2 x float> %b
@@ -248,13 +240,13 @@ define <vscale x 2 x float> @icmp_select_nxv2f32(<vscale x 2 x float> %a, <vscal
}
define <vscale x 2 x double> @icmp_select_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv2f64
; CHECK-LABEL: icmp_select_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 2 x double> %a, <vscale x 2 x double> %b
@@ -262,13 +254,13 @@ define <vscale x 2 x double> @icmp_select_nxv2f64(<vscale x 2 x double> %a, <vsc
}
define <vscale x 4 x half> @icmp_select_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv4f16
; CHECK-LABEL: icmp_select_nxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 4 x half> %a, <vscale x 4 x half> %b
@@ -276,13 +268,13 @@ define <vscale x 4 x half> @icmp_select_nxv4f16(<vscale x 4 x half> %a, <vscale
}
define <vscale x 4 x float> @icmp_select_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv4f32
; CHECK-LABEL: icmp_select_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 4 x float> %a, <vscale x 4 x float> %b
@@ -290,13 +282,13 @@ define <vscale x 4 x float> @icmp_select_nxv4f32(<vscale x 4 x float> %a, <vscal
}
define <vscale x 8 x half> @icmp_select_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv8f16
; CHECK-LABEL: icmp_select_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.h, xzr, x8
; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.h, xzr, x8
; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 8 x half> %a, <vscale x 8 x half> %b
@@ -304,13 +296,13 @@ define <vscale x 8 x half> @icmp_select_nxv8f16(<vscale x 8 x half> %a, <vscale
}
define <vscale x 2 x i64> @icmp_select_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv2i64
; CHECK-LABEL: icmp_select_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 2 x i64> %a, <vscale x 2 x i64> %b
@@ -318,13 +310,13 @@ define <vscale x 2 x i64> @icmp_select_nxv2i64(<vscale x 2 x i64> %a, <vscale x
}
define <vscale x 4 x i32> @icmp_select_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv4i32
; CHECK-LABEL: icmp_select_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b
@@ -332,13 +324,13 @@ define <vscale x 4 x i32> @icmp_select_nxv4i32(<vscale x 4 x i32> %a, <vscale x
}
define <vscale x 8 x i16> @icmp_select_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv8i16
; CHECK-LABEL: icmp_select_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.h, xzr, x8
; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.h, xzr, x8
; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b
@@ -346,13 +338,13 @@ define <vscale x 8 x i16> @icmp_select_nxv8i16(<vscale x 8 x i16> %a, <vscale x
}
define <vscale x 16 x i8> @icmp_select_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv16i8
; CHECK-LABEL: icmp_select_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.b, xzr, x8
; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p0.b, xzr, x8
; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b
@@ -360,52 +352,52 @@ define <vscale x 16 x i8> @icmp_select_nxv16i8(<vscale x 16 x i8> %a, <vscale x
}
define <vscale x 2 x i1> @icmp_select_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv2i1
; CHECK-LABEL: icmp_select_nxv2i1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p2.d, xzr, x8
; CHECK-NEXT: sel p0.b, p2, p0.b, p1.b
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p2.d, xzr, x8
; CHECK-NEXT: sel p0.b, p2, p0.b, p1.b
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b
ret <vscale x 2 x i1> %sel
}
define <vscale x 4 x i1> @icmp_select_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv4i1
; CHECK-LABEL: icmp_select_nxv4i1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p2.s, xzr, x8
; CHECK-NEXT: sel p0.b, p2, p0.b, p1.b
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p2.s, xzr, x8
; CHECK-NEXT: sel p0.b, p2, p0.b, p1.b
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 4 x i1> %a, <vscale x 4 x i1> %b
ret <vscale x 4 x i1> %sel
}
define <vscale x 8 x i1> @icmp_select_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv8i1
; CHECK-LABEL: icmp_select_nxv8i1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p2.h, xzr, x8
; CHECK-NEXT: sel p0.b, p2, p0.b, p1.b
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 8 x i1> %a, <vscale x 8 x i1> %b
ret <vscale x 8 x i1> %sel
}
define <vscale x 16 x i1> @icmp_select_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i64 %x0) {
; CHECK-LABEL: icmp_select_nxv16i1
; CHECK-LABEL: icmp_select_nxv16i1:
; CHECK: // %bb.0:
; CHECK-NEXT: cmp x0, #0
; CHECK-NEXT: cset w8, eq
; CHECK-NEXT: sbfx x8, x8, #0, #1
; CHECK-NEXT: whilelo p2.b, xzr, x8
; CHECK-NEXT: sel p0.b, p2, p0.b, p1.b
; CHECK-NEXT: ret
%mask = icmp eq i64 %x0, 0
%sel = select i1 %mask, <vscale x 16 x i1> %a, <vscale x 16 x i1> %b

View File

@ -233,7 +233,7 @@ define <vscale x 2 x i64> @abs_i64_active(<vscale x 2 x i64> %a, <vscale x 2 x i
define <vscale x 2 x i64> @abs_i64_not_active(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i1> %pg) #0 {
; CHECK-LABEL: abs_i64_not_active:
; CHECK: // %bb.0:
; CHECK: abs z0.d, p0/m, z1.d
; CHECK-NEXT: abs z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%ret = tail call <vscale x 2 x i64> @llvm.aarch64.sve.abs.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %ret
@ -424,7 +424,7 @@ define <vscale x 2 x i64> @cls_i64_active(<vscale x 2 x i64> %a, <vscale x 2 x i
define <vscale x 2 x i64> @cls_i64_not_active(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i1> %pg) #0 {
; CHECK-LABEL: cls_i64_not_active:
; CHECK: // %bb.0:
; CHECK: cls z0.d, p0/m, z1.d
; CHECK-NEXT: cls z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%ret = tail call <vscale x 2 x i64> @llvm.aarch64.sve.cls.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %ret
@ -598,7 +598,7 @@ define <vscale x 2 x double> @fabs_f64_active(<vscale x 2 x double> %a, <vscale
define <vscale x 2 x double> @fabs_f64_not_active(<vscale x 2 x double> %a, <vscale x 2 x double> %b, <vscale x 2 x i1> %pg) #0 {
; CHECK-LABEL: fabs_f64_not_active:
; CHECK: // %bb.0:
; CHECK: fabs z0.d, p0/m, z1.d
; CHECK-NEXT: fabs z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%ret = tail call <vscale x 2 x double> @llvm.aarch64.sve.fabs.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x i1> %pg, <vscale x 2 x double> %b)
ret <vscale x 2 x double> %ret
@ -772,7 +772,7 @@ define <vscale x 2 x i64> @sxtb_i64_active(<vscale x 2 x i64> %a, <vscale x 2 x
define <vscale x 2 x i64> @sxtb_i64_not_active(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i1> %pg) #0 {
; CHECK-LABEL: sxtb_i64_not_active:
; CHECK: // %bb.0:
; CHECK: sxtb z0.d, p0/m, z1.d
; CHECK-NEXT: sxtb z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%ret = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sxtb.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %ret
@ -888,7 +888,7 @@ define <vscale x 2 x i64> @sxth_i64_active(<vscale x 2 x i64> %a, <vscale x 2 x
define <vscale x 2 x i64> @sxth_i64_not_active(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i1> %pg) #0 {
; CHECK-LABEL: sxth_i64_not_active:
; CHECK: // %bb.0:
; CHECK: sxth z0.d, p0/m, z1.d
; CHECK-NEXT: sxth z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%ret = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sxth.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %ret
@ -946,7 +946,7 @@ define <vscale x 2 x i64> @sxtw_i64_active(<vscale x 2 x i64> %a, <vscale x 2 x
define <vscale x 2 x i64> @sxtw_i64_not_active(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i1> %pg) #0 {
; CHECK-LABEL: sxtw_i64_not_active:
; CHECK: // %bb.0:
; CHECK: sxtw z0.d, p0/m, z1.d
; CHECK-NEXT: sxtw z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%ret = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %ret

View File

@ -404,8 +404,9 @@ define <vscale x 2 x i64> @uqsub_d_highimm(<vscale x 2 x i64> %a) {
; As uqsub_i32 but where pg is i8 based and thus compatible for i32.
define <vscale x 4 x i32> @uqsub_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: uqsub_i32_ptrue_all_b:
; CHECK: uqsub z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: uqsub z0.s, z0.s, #1 // =0x1
; CHECK-NEXT: ret
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 31)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
%b = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 1)
@ -418,8 +419,9 @@ define <vscale x 4 x i32> @uqsub_i32_ptrue_all_b(<vscale x 4 x i32> %a) #0 {
; As uqsub_i32 but where pg is i16 based and thus compatible for i32.
define <vscale x 4 x i32> @uqsub_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: uqsub_i32_ptrue_all_h:
; CHECK: uqsub z0.s, z0.s, #1
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: uqsub z0.s, z0.s, #1 // =0x1
; CHECK-NEXT: ret
%pg.h = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg.h)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
@ -434,10 +436,11 @@ define <vscale x 4 x i32> @uqsub_i32_ptrue_all_h(<vscale x 4 x i32> %a) #0 {
; thus inactive lanes are important and the immediate form cannot be used.
define <vscale x 4 x i32> @uqsub_i32_ptrue_all_d(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: uqsub_i32_ptrue_all_d:
; CHECK-DAG: ptrue [[PG:p[0-9]+]].d
; CHECK-DAG: mov [[DUP:z[0-9]+]].s, #1
; CHECK-DAG: uqsub z0.s, [[PG]]/m, z0.s, [[DUP]].s
; CHECK-NEXT: ret
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z1.s, #1 // =0x1
; CHECK-NEXT: uqsub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%pg.d = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%pg.b = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %pg.d)
%pg.s = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %pg.b)
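Aside: a rough sketch of why the d-based predicate, unlike the b- and h-based ones in the two hunks above, blocks the immediate form. The lane picture is illustrative and assumes the .s view of p0.
;   ptrue p0.b / p0.h (all) -> .s lanes: 1 1 1 1 ...  every lane active
;   ptrue p0.d        (all) -> .s lanes: 1 0 1 0 ...  odd lanes inactive
;
; The merging intrinsic must return the corresponding lane of %a in each
; inactive lane, so with a d-based predicate the unpredicated form
; "uqsub z0.s, z0.s, #1", which rewrites every lane, is not a valid
; lowering; the predicated register form above is required instead.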

View File

@ -192,7 +192,7 @@ define <vscale x 2 x i64> @sqabs_i64_active(<vscale x 2 x i64> %a, <vscale x 2 x
define <vscale x 2 x i64> @sqabs_i64_not_active(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i1> %pg) #0 {
; CHECK-LABEL: sqabs_i64_not_active:
; CHECK: // %bb.0:
; CHECK: sqabs z0.d, p0/m, z1.d
; CHECK-NEXT: sqabs z0.d, p0/m, z1.d
; CHECK-NEXT: ret
%ret = tail call <vscale x 2 x i64> @llvm.aarch64.sve.sqabs.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
ret <vscale x 2 x i64> %ret

View File

@ -365,7 +365,7 @@ define aarch64_vector_pcs <4 x i32> @invoke_callee_may_throw_neon(<4 x i32> %v)
; GISEL-NEXT: bl may_throw_neon
; GISEL-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
; GISEL-NEXT: .Ltmp4:
; GISEL-NEXT: b .LBB1_1
; GISEL-NEXT: .LBB1_1: // %.Lcontinue
; GISEL-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
; GISEL-NEXT: ldp x29, x30, [sp, #288] // 16-byte Folded Reload
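For reference, check lines like the above are produced mechanically; a typical regeneration run is a single invocation of the update script (the build and test paths below are illustrative, not taken from this commit):

$ llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
      llvm/test/CodeGen/AArch64/<test>.ll

The script rewrites everything between each CHECK-LABEL and the IR body with llc's current output, which is why hand-written patterns such as the CHECK-DAG group above collapse into exact CHECK-NEXT sequences.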