Inliner: don't mark call sites as 'nounwind' if that would be redundant

When F calls G, G calls H, and G is nounwind, then after G is inlined
into F, the inlined call site to H should be effectively nounwind so
that no information is lost during inlining.

If H itself is nounwind (which often happens when H is an intrinsic),
we no longer mark the call site explicitly as nounwind, since the
attribute would be redundant. Previously, there were cases where the
inlined call site of H differed from a pre-existing call site of H in
F *only* in the explicitly added nounwind attribute, which prevented
common subexpression elimination from folding the two call sites.
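
The propagation itself is a single guarded attribute update. A minimal
sketch in the spirit of llvm/lib/Transforms/Utils/InlineFunction.cpp
(simplified for illustration, not the verbatim source; the function name
propagateNoUnwind and the parameters InlinedBlocks and MarkNoUnwind are
stand-ins for state the real inliner already tracks):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // MarkNoUnwind is true when the inlined callee (G above) is nounwind.
    static void propagateNoUnwind(ArrayRef<BasicBlock *> InlinedBlocks,
                                  bool MarkNoUnwind) {
      for (BasicBlock *BB : InlinedBlocks)
        for (Instruction &I : *BB)
          if (auto *CI = dyn_cast<CallInst>(&I))
            // Only add the attribute where it carries new information.
            // Call sites that are already nounwind (e.g. calls to
            // nounwind intrinsics) are left untouched, so they stay
            // identical to pre-existing call sites and remain CSE-able.
            if (MarkNoUnwind && !CI->doesNotThrow())
              CI->setDoesNotThrow();
    }

With that guard, an inlined call to a nounwind intrinsic is emitted
without a call-site attribute group, which is what the test updates
below reflect (the dropped #[[ATTR4]] references).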

v2:
- just check CI->doesNotThrow

v3 (resubmit after revert at 3443788087):
- update Clang tests

Differential Revision: https://reviews.llvm.org/D129860
Author: Nicolai Hähnle
Date:   2022-07-15 15:27:26 +02:00
Parent: e0ccd190ae
Commit: 1ddc51d89d

61 changed files with 1366 additions and 1458 deletions

@@ -55,21 +55,21 @@
// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4:[0-9]+]]
+// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -87,21 +87,21 @@
// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4:[0-9]+]]
+// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -167,14 +167,14 @@ vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); }
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -182,7 +182,7 @@ vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); }
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb1(
@@ -205,14 +205,14 @@ vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); }
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -220,7 +220,7 @@ vector unsigned char test_ldrmb1(char *ptr) { return __vec_ldrmb(ptr, 1); }
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb1(
@@ -291,14 +291,14 @@ void test_strmb1(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -306,7 +306,7 @@ void test_strmb1(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb2(
@@ -329,14 +329,14 @@ void test_strmb1(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -344,7 +344,7 @@ void test_strmb1(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb2(
@@ -423,14 +423,14 @@ void test_strmb2(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -438,7 +438,7 @@ void test_strmb2(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb3(
@@ -461,14 +461,14 @@ void test_strmb2(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -476,7 +476,7 @@ void test_strmb2(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb3(
@@ -552,14 +552,14 @@ void test_strmb3(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -567,7 +567,7 @@ void test_strmb3(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb4(
@@ -590,14 +590,14 @@ void test_strmb3(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -605,7 +605,7 @@ void test_strmb3(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb4(
@@ -684,14 +684,14 @@ void test_strmb4(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -699,7 +699,7 @@ void test_strmb4(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb5(
@@ -722,14 +722,14 @@ void test_strmb4(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -737,7 +737,7 @@ void test_strmb4(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb5(
@@ -824,14 +824,14 @@ void test_strmb5(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -839,7 +839,7 @@ void test_strmb5(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb6(
@@ -862,14 +862,14 @@ void test_strmb5(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -877,7 +877,7 @@ void test_strmb5(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb6(
@@ -972,14 +972,14 @@ void test_strmb6(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -987,7 +987,7 @@ void test_strmb6(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb7(
@@ -1010,14 +1010,14 @@ void test_strmb6(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1025,7 +1025,7 @@ void test_strmb6(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb7(
@@ -1106,14 +1106,14 @@ void test_strmb7(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1121,7 +1121,7 @@ void test_strmb7(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb8(
@@ -1144,14 +1144,14 @@ void test_strmb7(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1159,7 +1159,7 @@ void test_strmb7(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb8(
@@ -1222,21 +1222,21 @@ void test_strmb8(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -1254,21 +1254,21 @@ void test_strmb8(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -1345,14 +1345,14 @@ vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); }
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1360,7 +1360,7 @@ vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); }
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb9(
@@ -1383,14 +1383,14 @@ vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); }
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1398,7 +1398,7 @@ vector unsigned char test_ldrmb9(char *ptr) { return __vec_ldrmb(ptr, 9); }
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb9(
@@ -1485,14 +1485,14 @@ void test_strmb9(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1500,7 +1500,7 @@ void test_strmb9(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb10(
@@ -1523,14 +1523,14 @@ void test_strmb9(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1538,7 +1538,7 @@ void test_strmb9(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb10(
@@ -1633,14 +1633,14 @@ void test_strmb10(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1648,7 +1648,7 @@ void test_strmb10(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb11(
@@ -1671,14 +1671,14 @@ void test_strmb10(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1686,7 +1686,7 @@ void test_strmb10(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb11(
@@ -1778,14 +1778,14 @@ void test_strmb11(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1793,7 +1793,7 @@ void test_strmb11(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb12(
@@ -1816,14 +1816,14 @@ void test_strmb11(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
-// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1831,7 +1831,7 @@ void test_strmb11(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
-// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
+// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb12(
@@ -1926,14 +1926,14 @@ void test_strmb12(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
-// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1941,7 +1941,7 @@ void test_strmb12(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb13(
@@ -1964,14 +1964,14 @@ void test_strmb12(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -1979,7 +1979,7 @@ void test_strmb12(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb13(
@@ -2082,14 +2082,14 @@ void test_strmb13(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2097,7 +2097,7 @@ void test_strmb13(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb14(
@@ -2120,14 +2120,14 @@ void test_strmb13(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2135,7 +2135,7 @@ void test_strmb13(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb14(
@@ -2246,14 +2246,14 @@ void test_strmb14(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2261,7 +2261,7 @@ void test_strmb14(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb15(
@@ -2284,14 +2284,14 @@ void test_strmb14(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2299,7 +2299,7 @@ void test_strmb14(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb15(
@@ -2366,21 +2366,21 @@ void test_strmb15(char *ptr, vector unsigned char data) {
// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -2398,21 +2398,21 @@ void test_strmb15(char *ptr, vector unsigned char data) {
// LE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]])
// LE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]])
// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
@@ -2472,14 +2472,14 @@ vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2487,7 +2487,7 @@ vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb16(
@@ -2510,14 +2510,14 @@ vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
// LE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// LE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// LE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]])
// LE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// LE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// LE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]])
// LE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// LE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
@@ -2525,7 +2525,7 @@ vector unsigned char test_ldrmb16(char *ptr) { return __vec_ldrmb(ptr, 16); }
// LE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// LE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// LE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// LE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]])
// LE-PWR9-NEXT: ret void
//
// BE32-PWR9-LABEL: @test_strmb16(

@@ -14,7 +14,7 @@
// CHECK-LE-NEXT: entry:
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]]) #[[ATTR3:[0-9]+]]
// CHECK-LE-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]])
// CHECK-LE-NEXT: [[TMP3:%.*]] = bitcast <1 x i128> [[TMP2]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP3]]
//
@@ -22,7 +22,7 @@
// CHECK-AIX-NEXT: entry:
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]]) #[[ATTR3:[0-9]+]]
// CHECK-AIX-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]])
// CHECK-AIX-NEXT: [[TMP3:%.*]] = bitcast <1 x i128> [[TMP2]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP3]]
//
@@ -34,7 +34,7 @@ vector unsigned char test_subc(vector unsigned char a, vector unsigned char b) {
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-LE-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -43,7 +43,7 @@ vector unsigned char test_subc(vector unsigned char a, vector unsigned char b) {
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-AIX-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -56,7 +56,7 @@ vector unsigned char test_subec(vector unsigned char a, vector unsigned char b,
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-LE-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -65,7 +65,7 @@ vector unsigned char test_subec(vector unsigned char a, vector unsigned char b,
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vsubeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-AIX-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -101,7 +101,7 @@ vector unsigned char test_sub(vector unsigned char a, vector unsigned char b,
// CHECK-LE-NEXT: entry:
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]]) #[[ATTR3]]
// CHECK-LE-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]])
// CHECK-LE-NEXT: [[TMP3:%.*]] = bitcast <1 x i128> [[TMP2]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP3]]
//
@@ -109,7 +109,7 @@ vector unsigned char test_sub(vector unsigned char a, vector unsigned char b,
// CHECK-AIX-NEXT: entry:
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]]) #[[ATTR3]]
// CHECK-AIX-NEXT: [[TMP2:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddcuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]])
// CHECK-AIX-NEXT: [[TMP3:%.*]] = bitcast <1 x i128> [[TMP2]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP3]]
//
@@ -121,7 +121,7 @@ vector unsigned char test_addc(vector unsigned char a, vector unsigned char b) {
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-LE-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -130,7 +130,7 @@ vector unsigned char test_addc(vector unsigned char a, vector unsigned char b) {
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddecuq(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-AIX-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -143,7 +143,7 @@ vector unsigned char test_addec(vector unsigned char a, vector unsigned char b,
// CHECK-LE-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
// CHECK-LE-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-LE-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-LE-NEXT: ret <16 x i8> [[TMP4]]
//
@@ -152,7 +152,7 @@ vector unsigned char test_addec(vector unsigned char a, vector unsigned char b,
// CHECK-AIX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[A:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[B:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[C:%.*]] to <1 x i128>
// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]]) #[[ATTR3]]
// CHECK-AIX-NEXT: [[TMP3:%.*]] = tail call <1 x i128> @llvm.ppc.altivec.vaddeuqm(<1 x i128> [[TMP0]], <1 x i128> [[TMP1]], <1 x i128> [[TMP2]])
// CHECK-AIX-NEXT: [[TMP4:%.*]] = bitcast <1 x i128> [[TMP3]] to <16 x i8>
// CHECK-AIX-NEXT: ret <16 x i8> [[TMP4]]
//

@@ -30,7 +30,7 @@
// CHECK64-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP7:%.*]] = load i32, i32* [[__CTL_ADDR_I]], align 4
// CHECK64-NEXT: call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]]) #[[ATTR1:[0-9]+]]
// CHECK64-NEXT: call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]])
// CHECK64-NEXT: ret void
//
// CHECK32-LABEL: @test_loadiwkey(
@@ -59,7 +59,7 @@
// CHECK32-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_LO_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__ENKEY_HI_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP7:%.*]] = load i32, i32* [[__CTL_ADDR_I]], align 4
// CHECK32-NEXT: call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]]) #[[ATTR1:[0-9]+]]
// CHECK32-NEXT: call void @llvm.x86.loadiwkey(<2 x i64> [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], i32 [[TMP7]])
// CHECK32-NEXT: ret void
//
void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i enkey_hi) {
@@ -86,7 +86,7 @@ void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i
// CHECK64-NEXT: [[TMP3:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
// CHECK64-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]]) #[[ATTR1]]
// CHECK64-NEXT: [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]])
// CHECK64-NEXT: [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
// CHECK64-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <2 x i64>*
// CHECK64-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
@@ -121,7 +121,7 @@ void test_loadiwkey(unsigned int ctl, __m128i intkey, __m128i enkey_lo, __m128i
// CHECK32-NEXT: [[TMP3:%.*]] = load i32, i32* [[__HTYPE_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]]) #[[ATTR1]]
// CHECK32-NEXT: [[TMP6:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey128(i32 [[TMP3]], <2 x i64> [[TMP4]])
// CHECK32-NEXT: [[TMP7:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP6]], 1
// CHECK32-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <2 x i64>*
// CHECK32-NEXT: store <2 x i64> [[TMP7]], <2 x i64>* [[TMP8]], align 1
@@ -166,7 +166,7 @@ unsigned int test_encodekey128_u32(unsigned int htype, __m128i key, void *h) {
// CHECK64-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP7:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) #[[ATTR1]]
// CHECK64-NEXT: [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
// CHECK64-NEXT: [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
// CHECK64-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i64>*
// CHECK64-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 1
@@ -211,7 +211,7 @@ unsigned int test_encodekey128_u32(unsigned int htype, __m128i key, void *h) {
// CHECK32-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_LO_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[__KEY_HI_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP7:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]]) #[[ATTR1]]
// CHECK32-NEXT: [[TMP8:%.*]] = call { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.encodekey256(i32 [[TMP4]], <2 x i64> [[TMP5]], <2 x i64> [[TMP6]])
// CHECK32-NEXT: [[TMP9:%.*]] = extractvalue { i32, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP8]], 1
// CHECK32-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <2 x i64>*
// CHECK32-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP10]], align 1
@@ -254,7 +254,7 @@ unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i k
// CHECK64-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK64-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -289,7 +289,7 @@ unsigned int test_encodekey256_u32(unsigned int htype, __m128i key_lo, __m128i k
// CHECK32-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK32-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -328,7 +328,7 @@ unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK64-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK64-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -363,7 +363,7 @@ unsigned char test_mm_aesenc256kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK32-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec256kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK32-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -402,7 +402,7 @@ unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK64-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK64-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -437,7 +437,7 @@ unsigned char test_mm_aesdec256kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK32-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesenc128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK32-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -476,7 +476,7 @@ unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK64-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK64-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 8
// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
// CHECK64-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK64-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK64-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK64-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -511,7 +511,7 @@ unsigned char test_mm_aesenc128kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK32-NEXT: [[TMP3:%.*]] = load <2 x i64>*, <2 x i64>** [[__ODATA_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[__IDATA_ADDR_I]], align 16
// CHECK32-NEXT: [[TMP5:%.*]] = load i8*, i8** [[__H_ADDR_I]], align 4
// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]]) #[[ATTR1]]
// CHECK32-NEXT: [[TMP6:%.*]] = call { i8, <2 x i64> } @llvm.x86.aesdec128kl(<2 x i64> [[TMP4]], i8* [[TMP5]])
// CHECK32-NEXT: [[TMP7:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 0
// CHECK32-NEXT: [[TMP8:%.*]] = trunc i8 [[TMP7]] to i1
// CHECK32-NEXT: [[TMP9:%.*]] = extractvalue { i8, <2 x i64> } [[TMP6]], 1
@@ -565,7 +565,7 @@ unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK64-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK64-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK64-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT: br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
@@ -658,7 +658,7 @@ unsigned char test_mm_aesdec128kl_u8(__m128i *odata, __m128i idata, const void *
// CHECK32-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK32-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK32-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT: br i1 [[TMP23]], label [[AESENCWIDE128KL_NO_ERROR_I:%.*]], label [[AESENCWIDE128KL_ERROR_I:%.*]]
@@ -755,7 +755,7 @@ unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[
// CHECK64-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK64-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK64-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT: br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
@@ -848,7 +848,7 @@ unsigned char test__mm_aesencwide128kl_u8(__m128i odata[8], const __m128i idata[
// CHECK32-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK32-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK32-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide128kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT: br i1 [[TMP23]], label [[AESDECWIDE128KL_NO_ERROR_I:%.*]], label [[AESDECWIDE128KL_ERROR_I:%.*]]
@@ -945,7 +945,7 @@ unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[
// CHECK64-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK64-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK64-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT: br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
@@ -1038,7 +1038,7 @@ unsigned char test__mm_aesdecwide128kl_u8(__m128i odata[8], const __m128i idata[
// CHECK32-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK32-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK32-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesencwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT: br i1 [[TMP23]], label [[AESENCWIDE256KL_NO_ERROR_I:%.*]], label [[AESENCWIDE256KL_ERROR_I:%.*]]
@@ -1135,7 +1135,7 @@ unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[
// CHECK64-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK64-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK64-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
// CHECK64-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK64-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK64-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK64-NEXT: br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]
@@ -1228,7 +1228,7 @@ unsigned char test__mm_aesencwide256kl_u8(__m128i odata[8], const __m128i idata[
// CHECK32-NEXT: [[TMP18:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 16
// CHECK32-NEXT: [[TMP19:%.*]] = getelementptr <2 x i64>, <2 x i64>* [[TMP4]], i32 7
// CHECK32-NEXT: [[TMP20:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 16
// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]]) #[[ATTR1]]
// CHECK32-NEXT: [[TMP21:%.*]] = call { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.x86.aesdecwide256kl(i8* [[TMP5]], <2 x i64> [[TMP6]], <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <2 x i64> [[TMP16]], <2 x i64> [[TMP18]], <2 x i64> [[TMP20]])
// CHECK32-NEXT: [[TMP22:%.*]] = extractvalue { i8, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[TMP21]], 0
// CHECK32-NEXT: [[TMP23:%.*]] = trunc i8 [[TMP22]] to i1
// CHECK32-NEXT: br i1 [[TMP23]], label [[AESDECWIDE256KL_NO_ERROR_I:%.*]], label [[AESDECWIDE256KL_ERROR_I:%.*]]

@@ -11,7 +11,7 @@
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
@@ -23,7 +23,7 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
@@ -46,7 +46,7 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <8 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -69,7 +69,7 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <16 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -92,7 +92,7 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <8 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -115,7 +115,7 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <16 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@@ -127,7 +127,7 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMMLAQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
//
@@ -140,7 +140,7 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@@ -153,7 +153,7 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
@@ -182,7 +182,7 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@ -211,7 +211,7 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@ -240,7 +240,7 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
@ -269,7 +269,7 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//


@ -26,7 +26,7 @@ uint64_t status;
// CHECK-C-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
// CHECK-C-NEXT: [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[TMP]], i32 0, i32 0
// CHECK-C-NEXT: [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
// CHECK-C-NEXT: [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]) #[[ATTR2:[0-9]+]], !noalias !6
// CHECK-C-NEXT: [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]), !noalias !6
// CHECK-C-NEXT: [[TMP3:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 0
// CHECK-C-NEXT: store i64 [[TMP3]], i64* [[ARRAYDECAY_I]], align 8, !alias.scope !6
// CHECK-C-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
@ -64,7 +64,7 @@ uint64_t status;
// CHECK-CXX-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
// CHECK-CXX-NEXT: [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[REF_TMP]], i32 0, i32 0
// CHECK-CXX-NEXT: [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
// CHECK-CXX-NEXT: [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]) #[[ATTR2:[0-9]+]], !noalias !6
// CHECK-CXX-NEXT: [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* [[TMP1]]), !noalias !6
// CHECK-CXX-NEXT: [[TMP3:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 0
// CHECK-CXX-NEXT: store i64 [[TMP3]], i64* [[ARRAYDECAY_I]], align 8, !alias.scope !6
// CHECK-CXX-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1
@ -123,7 +123,7 @@ EXTERN_C void test_ld64b(void)
// CHECK-C-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-C-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-C-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
// CHECK-C-NEXT: call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
// CHECK-C-NEXT: call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-C-NEXT: ret void
//
// CHECK-CXX-LABEL: @test_st64b(
@ -152,7 +152,7 @@ EXTERN_C void test_ld64b(void)
// CHECK-CXX-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-CXX-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-CXX-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
// CHECK-CXX-NEXT: call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
// CHECK-CXX-NEXT: call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-CXX-NEXT: ret void
//
EXTERN_C void test_st64b(void)
@ -186,7 +186,7 @@ EXTERN_C void test_st64b(void)
// CHECK-C-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-C-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-C-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
// CHECK-C-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
// CHECK-C-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-C-NEXT: store i64 [[TMP18]], i64* @status, align 8
// CHECK-C-NEXT: ret void
//
@ -216,7 +216,7 @@ EXTERN_C void test_st64b(void)
// CHECK-CXX-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-CXX-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-CXX-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
// CHECK-CXX-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
// CHECK-CXX-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-CXX-NEXT: store i64 [[TMP18]], i64* @status, align 8
// CHECK-CXX-NEXT: ret void
//
@ -251,7 +251,7 @@ EXTERN_C void test_st64bv(void)
// CHECK-C-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-C-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-C-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
// CHECK-C-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
// CHECK-C-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-C-NEXT: store i64 [[TMP18]], i64* @status, align 8
// CHECK-C-NEXT: ret void
//
@ -281,7 +281,7 @@ EXTERN_C void test_st64bv(void)
// CHECK-CXX-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8
// CHECK-CXX-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7
// CHECK-CXX-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8
// CHECK-CXX-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) #[[ATTR2]]
// CHECK-CXX-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]])
// CHECK-CXX-NEXT: store i64 [[TMP18]], i64* @status, align 8
// CHECK-CXX-NEXT: ret void
//

File diff suppressed because it is too large


@ -9,7 +9,7 @@
// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_s8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -20,7 +20,7 @@ int16_t test_vaddlv_s8(int8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: ret i32 [[VADDLV_I]]
//
int32_t test_vaddlv_s16(int16x4_t a) {
@ -30,7 +30,7 @@ int32_t test_vaddlv_s16(int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_u8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -41,7 +41,7 @@ uint16_t test_vaddlv_u8(uint8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlv_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: ret i32 [[VADDLV_I]]
//
uint32_t test_vaddlv_u16(uint16x4_t a) {
@ -51,7 +51,7 @@ uint32_t test_vaddlv_u16(uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -62,7 +62,7 @@ int16_t test_vaddlvq_s8(int8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: ret i32 [[VADDLV_I]]
//
int32_t test_vaddlvq_s16(int16x8_t a) {
@ -72,7 +72,7 @@ int32_t test_vaddlvq_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_s32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i64 [[VADDLVQ_S32_I]]
//
int64_t test_vaddlvq_s32(int32x4_t a) {
@ -82,7 +82,7 @@ int64_t test_vaddlvq_s32(int32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -93,7 +93,7 @@ uint16_t test_vaddlvq_u8(uint8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: ret i32 [[VADDLV_I]]
//
uint32_t test_vaddlvq_u16(uint16x8_t a) {
@ -103,7 +103,7 @@ uint32_t test_vaddlvq_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddlvq_u32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i64 [[VADDLVQ_U32_I]]
//
uint64_t test_vaddlvq_u32(uint32x4_t a) {
@ -113,7 +113,7 @@ uint64_t test_vaddlvq_u32(uint32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_s8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -124,7 +124,7 @@ int8_t test_vmaxv_s8(int8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -135,7 +135,7 @@ int16_t test_vmaxv_s16(int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_u8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -146,7 +146,7 @@ uint8_t test_vmaxv_u8(uint8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxv_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -157,7 +157,7 @@ uint16_t test_vmaxv_u16(uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -168,7 +168,7 @@ int8_t test_vmaxvq_s8(int8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -179,7 +179,7 @@ int16_t test_vmaxvq_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_s32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VMAXVQ_S32_I]]
//
int32_t test_vmaxvq_s32(int32x4_t a) {
@ -189,7 +189,7 @@ int32_t test_vmaxvq_s32(int32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -200,7 +200,7 @@ uint8_t test_vmaxvq_u8(uint8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -211,7 +211,7 @@ uint16_t test_vmaxvq_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_u32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VMAXVQ_U32_I]]
//
uint32_t test_vmaxvq_u32(uint32x4_t a) {
@ -221,7 +221,7 @@ uint32_t test_vmaxvq_u32(uint32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminv_s8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -232,7 +232,7 @@ int8_t test_vminv_s8(int8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminv_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -243,7 +243,7 @@ int16_t test_vminv_s16(int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminv_u8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -254,7 +254,7 @@ uint8_t test_vminv_u8(uint8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminv_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -265,7 +265,7 @@ uint16_t test_vminv_u16(uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -276,7 +276,7 @@ int8_t test_vminvq_s8(int8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -287,7 +287,7 @@ int16_t test_vminvq_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_s32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VMINVQ_S32_I]]
//
int32_t test_vminvq_s32(int32x4_t a) {
@ -297,7 +297,7 @@ int32_t test_vminvq_s32(int32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -308,7 +308,7 @@ uint8_t test_vminvq_u8(uint8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -319,7 +319,7 @@ uint16_t test_vminvq_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_u32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VMINVQ_U32_I]]
//
uint32_t test_vminvq_u32(uint32x4_t a) {
@ -329,7 +329,7 @@ uint32_t test_vminvq_u32(uint32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddv_s8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -340,7 +340,7 @@ int8_t test_vaddv_s8(int8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddv_s16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -351,7 +351,7 @@ int16_t test_vaddv_s16(int16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddv_u8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -362,7 +362,7 @@ uint8_t test_vaddv_u8(uint8x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddv_u16
// CHECK-SAME: (<4 x i16> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -373,7 +373,7 @@ uint16_t test_vaddv_u16(uint16x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -384,7 +384,7 @@ int8_t test_vaddvq_s8(int8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -395,7 +395,7 @@ int16_t test_vaddvq_s16(int16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_s32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VADDVQ_S32_I]]
//
int32_t test_vaddvq_s32(int32x4_t a) {
@ -405,7 +405,7 @@ int32_t test_vaddvq_s32(int32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
// CHECK-NEXT: ret i8 [[TMP0]]
//
@ -416,7 +416,7 @@ uint8_t test_vaddvq_u8(uint8x16_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u16
// CHECK-SAME: (<8 x i16> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[A]])
// CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i16
// CHECK-NEXT: ret i16 [[TMP0]]
//
@ -427,7 +427,7 @@ uint16_t test_vaddvq_u16(uint16x8_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vaddvq_u32
// CHECK-SAME: (<4 x i32> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[A]])
// CHECK-NEXT: ret i32 [[VADDVQ_U32_I]]
//
uint32_t test_vaddvq_u32(uint32x4_t a) {
@ -437,7 +437,7 @@ uint32_t test_vaddvq_u32(uint32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxvq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret float [[VMAXVQ_F32_I]]
//
float32_t test_vmaxvq_f32(float32x4_t a) {
@ -447,7 +447,7 @@ float32_t test_vmaxvq_f32(float32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminvq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret float [[VMINVQ_F32_I]]
//
float32_t test_vminvq_f32(float32x4_t a) {
@ -457,7 +457,7 @@ float32_t test_vminvq_f32(float32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vmaxnmvq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret float [[VMAXNMVQ_F32_I]]
//
float32_t test_vmaxnmvq_f32(float32x4_t a) {
@ -467,7 +467,7 @@ float32_t test_vmaxnmvq_f32(float32x4_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vminnmvq_f32
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret float [[VMINNMVQ_F32_I]]
//
float32_t test_vminnmvq_f32(float32x4_t a) {


@ -9,7 +9,7 @@
// CHECK-LABEL: define {{[^@]+}}@test_vcvtxd_f32_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double [[A]]) #[[ATTR2:[0-9]+]]
// CHECK-NEXT: [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double [[A]])
// CHECK-NEXT: ret float [[VCVTXD_F32_F64_I]]
//
float32_t test_vcvtxd_f32_f64(float64_t a) {
@ -19,7 +19,7 @@ float32_t test_vcvtxd_f32_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtas_s32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTAS_S32_F32_I]]
//
int32_t test_vcvtas_s32_f32(float32_t a) {
@ -29,7 +29,7 @@ int32_t test_vcvtas_s32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_test_vcvtad_s64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTAD_S64_F64_I]]
//
int64_t test_test_vcvtad_s64_f64(float64_t a) {
@ -39,7 +39,7 @@ int64_t test_test_vcvtad_s64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtas_u32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTAS_U32_F32_I]]
//
uint32_t test_vcvtas_u32_f32(float32_t a) {
@ -49,7 +49,7 @@ uint32_t test_vcvtas_u32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtad_u64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTAD_U64_F64_I]]
//
uint64_t test_vcvtad_u64_f64(float64_t a) {
@ -59,7 +59,7 @@ uint64_t test_vcvtad_u64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtms_s32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTMS_S32_F32_I]]
//
int32_t test_vcvtms_s32_f32(float32_t a) {
@ -69,7 +69,7 @@ int32_t test_vcvtms_s32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtmd_s64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTMD_S64_F64_I]]
//
int64_t test_vcvtmd_s64_f64(float64_t a) {
@ -79,7 +79,7 @@ int64_t test_vcvtmd_s64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtms_u32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTMS_U32_F32_I]]
//
uint32_t test_vcvtms_u32_f32(float32_t a) {
@ -89,7 +89,7 @@ uint32_t test_vcvtms_u32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtmd_u64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTMD_U64_F64_I]]
//
uint64_t test_vcvtmd_u64_f64(float64_t a) {
@ -99,7 +99,7 @@ uint64_t test_vcvtmd_u64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtns_s32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTNS_S32_F32_I]]
//
int32_t test_vcvtns_s32_f32(float32_t a) {
@ -109,7 +109,7 @@ int32_t test_vcvtns_s32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtnd_s64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTND_S64_F64_I]]
//
int64_t test_vcvtnd_s64_f64(float64_t a) {
@ -119,7 +119,7 @@ int64_t test_vcvtnd_s64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtns_u32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTNS_U32_F32_I]]
//
uint32_t test_vcvtns_u32_f32(float32_t a) {
@ -129,7 +129,7 @@ uint32_t test_vcvtns_u32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtnd_u64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTND_U64_F64_I]]
//
uint64_t test_vcvtnd_u64_f64(float64_t a) {
@ -139,7 +139,7 @@ uint64_t test_vcvtnd_u64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtps_s32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTPS_S32_F32_I]]
//
int32_t test_vcvtps_s32_f32(float32_t a) {
@ -149,7 +149,7 @@ int32_t test_vcvtps_s32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtpd_s64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTPD_S64_F64_I]]
//
int64_t test_vcvtpd_s64_f64(float64_t a) {
@ -159,7 +159,7 @@ int64_t test_vcvtpd_s64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtps_u32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTPS_U32_F32_I]]
//
uint32_t test_vcvtps_u32_f32(float32_t a) {
@ -169,7 +169,7 @@ uint32_t test_vcvtps_u32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtpd_u64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTPD_U64_F64_I]]
//
uint64_t test_vcvtpd_u64_f64(float64_t a) {
@ -179,7 +179,7 @@ uint64_t test_vcvtpd_u64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvts_s32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzs.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTS_S32_F32_I]]
//
int32_t test_vcvts_s32_f32(float32_t a) {
@ -189,7 +189,7 @@ int32_t test_vcvts_s32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtd_s64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzs.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTD_S64_F64_I]]
//
int64_t test_vcvtd_s64_f64(float64_t a) {
@ -199,7 +199,7 @@ int64_t test_vcvtd_s64_f64(float64_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvts_u32_f32
// CHECK-SAME: (float noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtzu.i32.f32(float [[A]])
// CHECK-NEXT: ret i32 [[VCVTS_U32_F32_I]]
//
uint32_t test_vcvts_u32_f32(float32_t a) {
@ -209,7 +209,7 @@ uint32_t test_vcvts_u32_f32(float32_t a) {
// CHECK-LABEL: define {{[^@]+}}@test_vcvtd_u64_f64
// CHECK-SAME: (double noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VCVTD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double [[A]]) #[[ATTR2]]
// CHECK-NEXT: [[VCVTD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtzu.i64.f64(double [[A]])
// CHECK-NEXT: ret i64 [[VCVTD_U64_F64_I]]
//
uint64_t test_vcvtd_u64_f64(float64_t a) {


@ -292,7 +292,7 @@ float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[B]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[B]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]])
// CHECK-NEXT: ret <2 x double> [[TMP3]]
//
float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
@ -308,7 +308,7 @@ float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[FNEG_I]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FNEG_I]], <2 x double> [[VECINIT1_I]], <2 x double> [[A]])
// CHECK-NEXT: ret <2 x double> [[TMP3]]
//
float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {


@ -15,7 +15,7 @@
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@ -27,7 +27,7 @@ float32x2_t test_vfmlal_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@ -39,7 +39,7 @@ float32x2_t test_vfmlsl_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@ -51,7 +51,7 @@ float32x2_t test_vfmlal_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C:%.*]] to <8 x i8>
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[C]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@ -63,7 +63,7 @@ float32x2_t test_vfmlsl_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@ -75,7 +75,7 @@ float32x4_t test_vfmlalq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@ -87,7 +87,7 @@ float32x4_t test_vfmlslq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@ -99,7 +99,7 @@ float32x4_t test_vfmlalq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C:%.*]] to <16 x i8>
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[C]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@ -153,7 +153,7 @@ float32x4_t test_vfmlslq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@ -205,7 +205,7 @@ float32x2_t test_vfmlal_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@ -297,7 +297,7 @@ float32x2_t test_vfmlal_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@ -389,7 +389,7 @@ float32x4_t test_vfmlalq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@ -441,7 +441,7 @@ float32x4_t test_vfmlalq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_LOW3_I]]
//
float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@ -493,7 +493,7 @@ float32x2_t test_vfmlal_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLAL_HIGH3_I]]
//
float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@ -585,7 +585,7 @@ float32x2_t test_vfmlal_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_LOW3_I]]
//
float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@ -677,7 +677,7 @@ float32x4_t test_vfmlalq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLAL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLAL_HIGH3_I]]
//
float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@ -729,7 +729,7 @@ float32x4_t test_vfmlalq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@ -781,7 +781,7 @@ float32x2_t test_vfmlsl_lane_low_f16(float32x2_t a, float16x4_t b, float16x4_t c
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t c) {
@ -873,7 +873,7 @@ float32x2_t test_vfmlsl_lane_high_f16(float32x2_t a, float16x4_t b, float16x4_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@ -965,7 +965,7 @@ float32x4_t test_vfmlslq_lane_low_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t c) {
@ -1017,7 +1017,7 @@ float32x4_t test_vfmlslq_lane_high_f16(float32x4_t a, float16x8_t b, float16x4_t
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_LOW3_I]]
//
float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@ -1069,7 +1069,7 @@ float32x2_t test_vfmlsl_laneq_low_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP18:%.*]] = bitcast <4 x half> [[VECINIT31]] to <8 x i8>
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> [[A]], <4 x half> [[B]], <4 x half> [[VECINIT31]])
// CHECK-NEXT: ret <2 x float> [[VFMLSL_HIGH3_I]]
//
float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t c) {
@ -1161,7 +1161,7 @@ float32x2_t test_vfmlsl_laneq_high_f16(float32x2_t a, float16x4_t b, float16x8_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_LOW3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_LOW3_I]]
//
float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t c) {
@ -1253,7 +1253,7 @@ float32x4_t test_vfmlslq_laneq_low_f16(float32x4_t a, float16x8_t b, float16x8_t
// CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP33:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x half> [[VECINIT71]] to <16 x i8>
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]]) #[[ATTR3]]
// CHECK-NEXT: [[VFMLSL_HIGH3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> [[A]], <8 x half> [[B]], <8 x half> [[VECINIT71]])
// CHECK-NEXT: ret <4 x float> [[VFMLSL_HIGH3_I]]
//
float32x4_t test_vfmlslq_laneq_high_f16(float32x4_t a, float16x8_t b, float16x8_t c) {


@ -10,7 +10,7 @@
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL11_I]]
//
int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
@ -20,7 +20,7 @@ int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
//
int8x8_t test_vqtbl1_s8(int8x16_t a, uint8x8_t b) {
@ -45,7 +45,7 @@ int8x8_t test_vqtbl1_s8(int8x16_t a, uint8x8_t b) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL13_I]]
//
int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
@ -69,7 +69,7 @@ int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
//
int8x8_t test_vqtbl2_s8(int8x16x2_t a, uint8x8_t b) {
@ -98,7 +98,7 @@ int8x8_t test_vqtbl2_s8(int8x16x2_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL26_I]]
//
int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
@ -125,7 +125,7 @@ int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
//
int8x8_t test_vqtbl3_s8(int8x16x3_t a, uint8x8_t b) {
@ -157,7 +157,7 @@ int8x8_t test_vqtbl3_s8(int8x16x3_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL28_I]]
//
int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
@ -187,7 +187,7 @@ int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
//
int8x8_t test_vqtbl4_s8(int8x16x4_t a, uint8x8_t b) {
@ -197,7 +197,7 @@ int8x8_t test_vqtbl4_s8(int8x16x4_t a, uint8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1q_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL1_I]]
//
int8x16_t test_vqtbl1q_s8(int8x16_t a, int8x16_t b) {
@ -221,7 +221,7 @@ int8x16_t test_vqtbl1q_s8(int8x16_t a, int8x16_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL2_I]]
//
int8x16_t test_vqtbl2q_s8(int8x16x2_t a, int8x16_t b) {
@ -248,7 +248,7 @@ int8x16_t test_vqtbl2q_s8(int8x16x2_t a, int8x16_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL3_I]]
//
int8x16_t test_vqtbl3q_s8(int8x16x3_t a, int8x16_t b) {
@ -278,7 +278,7 @@ int8x16_t test_vqtbl3q_s8(int8x16x3_t a, int8x16_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL4_I]]
//
int8x16_t test_vqtbl4q_s8(int8x16x4_t a, int8x16_t b) {
@ -289,7 +289,7 @@ int8x16_t test_vqtbl4q_s8(int8x16x4_t a, int8x16_t b) {
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]]
@ -320,7 +320,7 @@ int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX13_I]]
//
int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
@ -349,7 +349,7 @@ int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP4:%.*]] = icmp uge <8 x i8> [[C]], <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
// CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
// CHECK-NEXT: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], [[A]]
@ -387,7 +387,7 @@ int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX28_I]]
//
int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
@ -397,7 +397,7 @@ int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1_s8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
//
int8x8_t test_vqtbx1_s8(int8x8_t a, int8x16_t b, uint8x8_t c) {
@ -421,7 +421,7 @@ int8x8_t test_vqtbx1_s8(int8x8_t a, int8x16_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
//
int8x8_t test_vqtbx2_s8(int8x8_t a, int8x16x2_t b, uint8x8_t c) {
@ -448,7 +448,7 @@ int8x8_t test_vqtbx2_s8(int8x8_t a, int8x16x2_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
//
int8x8_t test_vqtbx3_s8(int8x8_t a, int8x16x3_t b, uint8x8_t c) {
@ -478,7 +478,7 @@ int8x8_t test_vqtbx3_s8(int8x8_t a, int8x16x3_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
//
int8x8_t test_vqtbx4_s8(int8x8_t a, int8x16x4_t b, uint8x8_t c) {
@ -488,7 +488,7 @@ int8x8_t test_vqtbx4_s8(int8x8_t a, int8x16x4_t b, uint8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1q_s8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX1_I]]
//
int8x16_t test_vqtbx1q_s8(int8x16_t a, int8x16_t b, uint8x16_t c) {
@ -512,7 +512,7 @@ int8x16_t test_vqtbx1q_s8(int8x16_t a, int8x16_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X2_T]], %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX2_I]]
//
int8x16_t test_vqtbx2q_s8(int8x16_t a, int8x16x2_t b, int8x16_t c) {
@ -539,7 +539,7 @@ int8x16_t test_vqtbx2q_s8(int8x16_t a, int8x16x2_t b, int8x16_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X3_T]], %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX3_I]]
//
int8x16_t test_vqtbx3q_s8(int8x16_t a, int8x16x3_t b, int8x16_t c) {
@ -569,7 +569,7 @@ int8x16_t test_vqtbx3q_s8(int8x16_t a, int8x16x3_t b, int8x16_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_INT8X16X4_T]], %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX4_I]]
//
int8x16_t test_vqtbx4q_s8(int8x16_t a, int8x16x4_t b, int8x16_t c) {
@ -580,7 +580,7 @@ int8x16_t test_vqtbx4q_s8(int8x16_t a, int8x16x4_t b, int8x16_t c) {
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL11_I]]
//
uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
@ -590,7 +590,7 @@ uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
//
uint8x8_t test_vqtbl1_u8(uint8x16_t a, uint8x8_t b) {
@ -615,7 +615,7 @@ uint8x8_t test_vqtbl1_u8(uint8x16_t a, uint8x8_t b) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL13_I]]
//
uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
@ -639,7 +639,7 @@ uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
//
uint8x8_t test_vqtbl2_u8(uint8x16x2_t a, uint8x8_t b) {
@ -668,7 +668,7 @@ uint8x8_t test_vqtbl2_u8(uint8x16x2_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL26_I]]
//
uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
@ -695,7 +695,7 @@ uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
//
uint8x8_t test_vqtbl3_u8(uint8x16x3_t a, uint8x8_t b) {
@ -727,7 +727,7 @@ uint8x8_t test_vqtbl3_u8(uint8x16x3_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL28_I]]
//
uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
@ -757,7 +757,7 @@ uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
//
uint8x8_t test_vqtbl4_u8(uint8x16x4_t a, uint8x8_t b) {
@ -767,7 +767,7 @@ uint8x8_t test_vqtbl4_u8(uint8x16x4_t a, uint8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1q_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL1_I]]
//
uint8x16_t test_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
@ -791,7 +791,7 @@ uint8x16_t test_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL2_I]]
//
uint8x16_t test_vqtbl2q_u8(uint8x16x2_t a, uint8x16_t b) {
@ -818,7 +818,7 @@ uint8x16_t test_vqtbl2q_u8(uint8x16x2_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL3_I]]
//
uint8x16_t test_vqtbl3q_u8(uint8x16x3_t a, uint8x16_t b) {
@ -848,7 +848,7 @@ uint8x16_t test_vqtbl3q_u8(uint8x16x3_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL4_I]]
//
uint8x16_t test_vqtbl4q_u8(uint8x16x4_t a, uint8x16_t b) {
@ -859,7 +859,7 @@ uint8x16_t test_vqtbl4q_u8(uint8x16x4_t a, uint8x16_t b) {
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]]
@ -890,7 +890,7 @@ uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX13_I]]
//
uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
@ -919,7 +919,7 @@ uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP4:%.*]] = icmp uge <8 x i8> [[C]], <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
// CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
// CHECK-NEXT: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], [[A]]
@ -957,7 +957,7 @@ uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX28_I]]
//
uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
@ -967,7 +967,7 @@ uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1_u8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
//
uint8x8_t test_vqtbx1_u8(uint8x8_t a, uint8x16_t b, uint8x8_t c) {
@ -991,7 +991,7 @@ uint8x8_t test_vqtbx1_u8(uint8x8_t a, uint8x16_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
//
uint8x8_t test_vqtbx2_u8(uint8x8_t a, uint8x16x2_t b, uint8x8_t c) {
@ -1018,7 +1018,7 @@ uint8x8_t test_vqtbx2_u8(uint8x8_t a, uint8x16x2_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
//
uint8x8_t test_vqtbx3_u8(uint8x8_t a, uint8x16x3_t b, uint8x8_t c) {
@ -1048,7 +1048,7 @@ uint8x8_t test_vqtbx3_u8(uint8x8_t a, uint8x16x3_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
//
uint8x8_t test_vqtbx4_u8(uint8x8_t a, uint8x16x4_t b, uint8x8_t c) {
@ -1058,7 +1058,7 @@ uint8x8_t test_vqtbx4_u8(uint8x8_t a, uint8x16x4_t b, uint8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1q_u8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX1_I]]
//
uint8x16_t test_vqtbx1q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
@ -1082,7 +1082,7 @@ uint8x16_t test_vqtbx1q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X2_T]], %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX2_I]]
//
uint8x16_t test_vqtbx2q_u8(uint8x16_t a, uint8x16x2_t b, uint8x16_t c) {
@ -1109,7 +1109,7 @@ uint8x16_t test_vqtbx2q_u8(uint8x16_t a, uint8x16x2_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X3_T]], %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX3_I]]
//
uint8x16_t test_vqtbx3q_u8(uint8x16_t a, uint8x16x3_t b, uint8x16_t c) {
@ -1139,7 +1139,7 @@ uint8x16_t test_vqtbx3q_u8(uint8x16_t a, uint8x16x3_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_UINT8X16X4_T]], %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX4_I]]
//
uint8x16_t test_vqtbx4q_u8(uint8x16_t a, uint8x16x4_t b, uint8x16_t c) {
@ -1150,7 +1150,7 @@ uint8x16_t test_vqtbx4q_u8(uint8x16_t a, uint8x16x4_t b, uint8x16_t c) {
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL11_I]]
//
poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
@ -1160,7 +1160,7 @@ poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1_p8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]]
//
poly8x8_t test_vqtbl1_p8(poly8x16_t a, uint8x8_t b) {
@ -1185,7 +1185,7 @@ poly8x8_t test_vqtbl1_p8(poly8x16_t a, uint8x8_t b) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL13_I]]
//
poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
@ -1209,7 +1209,7 @@ poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]]
//
poly8x8_t test_vqtbl2_p8(poly8x16x2_t a, uint8x8_t b) {
@ -1238,7 +1238,7 @@ poly8x8_t test_vqtbl2_p8(poly8x16x2_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL26_I]]
//
poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
@ -1265,7 +1265,7 @@ poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]]
//
poly8x8_t test_vqtbl3_p8(poly8x16x3_t a, uint8x8_t b) {
@ -1297,7 +1297,7 @@ poly8x8_t test_vqtbl3_p8(poly8x16x3_t a, uint8x8_t b) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL28_I]]
//
poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
@ -1327,7 +1327,7 @@ poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[B]])
// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]]
//
poly8x8_t test_vqtbl4_p8(poly8x16x4_t a, uint8x8_t b) {
@ -1337,7 +1337,7 @@ poly8x8_t test_vqtbl4_p8(poly8x16x4_t a, uint8x8_t b) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbl1q_p8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL1_I]]
//
poly8x16_t test_vqtbl1q_p8(poly8x16_t a, uint8x16_t b) {
@ -1361,7 +1361,7 @@ poly8x16_t test_vqtbl1q_p8(poly8x16_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL2_I]]
//
poly8x16_t test_vqtbl2q_p8(poly8x16x2_t a, uint8x16_t b) {
@ -1388,7 +1388,7 @@ poly8x16_t test_vqtbl2q_p8(poly8x16x2_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL3_I]]
//
poly8x16_t test_vqtbl3q_p8(poly8x16x3_t a, uint8x16_t b) {
@ -1418,7 +1418,7 @@ poly8x16_t test_vqtbl3q_p8(poly8x16x3_t a, uint8x16_t b) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[B]])
// CHECK-NEXT: ret <16 x i8> [[VTBL4_I]]
//
poly8x16_t test_vqtbl4q_p8(poly8x16x4_t a, uint8x16_t b) {
@ -1429,7 +1429,7 @@ poly8x16_t test_vqtbl4q_p8(poly8x16x4_t a, uint8x16_t b) {
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]]
@ -1460,7 +1460,7 @@ poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
// CHECK-NEXT: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX13_I]]
//
poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
@ -1489,7 +1489,7 @@ poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
// CHECK-NEXT: [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]])
// CHECK-NEXT: [[TMP4:%.*]] = icmp uge <8 x i8> [[C]], <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
// CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
// CHECK-NEXT: [[TMP6:%.*]] = and <8 x i8> [[TMP5]], [[A]]
@ -1527,7 +1527,7 @@ poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
// CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
// CHECK-NEXT: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX28_I]]
//
poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
@ -1537,7 +1537,7 @@ poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1_p8
// CHECK-SAME: (<8 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]]
//
poly8x8_t test_vqtbx1_p8(poly8x8_t a, uint8x16_t b, uint8x8_t c) {
@ -1561,7 +1561,7 @@ poly8x8_t test_vqtbx1_p8(poly8x8_t a, uint8x16_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]]
//
poly8x8_t test_vqtbx2_p8(poly8x8_t a, poly8x16x2_t b, uint8x8_t c) {
@ -1588,7 +1588,7 @@ poly8x8_t test_vqtbx2_p8(poly8x8_t a, poly8x16x2_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]]
//
poly8x8_t test_vqtbx3_p8(poly8x8_t a, poly8x16x3_t b, uint8x8_t c) {
@ -1618,7 +1618,7 @@ poly8x8_t test_vqtbx3_p8(poly8x8_t a, poly8x16x3_t b, uint8x8_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> [[C]])
// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]]
//
poly8x8_t test_vqtbx4_p8(poly8x8_t a, poly8x16x4_t b, uint8x8_t c) {
@ -1628,7 +1628,7 @@ poly8x8_t test_vqtbx4_p8(poly8x8_t a, poly8x16x4_t b, uint8x8_t c) {
// CHECK-LABEL: define {{[^@]+}}@test_vqtbx1q_p8
// CHECK-SAME: (<16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX1_I]]
//
poly8x16_t test_vqtbx1q_p8(poly8x16_t a, uint8x16_t b, uint8x16_t c) {
@ -1652,7 +1652,7 @@ poly8x16_t test_vqtbx1q_p8(poly8x16_t a, uint8x16_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL1_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X2_T]], %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
// CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX2_I]]
//
poly8x16_t test_vqtbx2q_p8(poly8x16_t a, poly8x16x2_t b, uint8x16_t c) {
@ -1679,7 +1679,7 @@ poly8x16_t test_vqtbx2q_p8(poly8x16_t a, poly8x16x2_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL3_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X3_T]], %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX3_I]]
//
poly8x16_t test_vqtbx3q_p8(poly8x16_t a, poly8x16x3_t b, uint8x16_t c) {
@ -1709,7 +1709,7 @@ poly8x16_t test_vqtbx3q_p8(poly8x16_t a, poly8x16x3_t b, uint8x16_t c) {
// CHECK-NEXT: [[VAL5_I:%.*]] = getelementptr inbounds [[STRUCT_POLY8X16X4_T]], %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
// CHECK-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]]) #[[ATTR3]]
// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[C]])
// CHECK-NEXT: ret <16 x i8> [[VTBX4_I]]
//
poly8x16_t test_vqtbx4q_p8(poly8x16_t a, poly8x16x4_t b, uint8x16_t c) {


@ -60,7 +60,7 @@ void test_ld_st_p128(poly128_t * ptr) {
// CHECK-LABEL: define {{[^@]+}}@test_vmull_p64
// CHECK-SAME: (i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[A]], i64 [[B]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[A]], i64 [[B]])
// CHECK-NEXT: [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128
// CHECK-NEXT: ret i128 [[VMULL_P641_I]]
//
@ -75,7 +75,7 @@ poly128_t test_vmull_p64(poly64_t a, poly64_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I5]] to i64
// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[B]], <2 x i64> [[B]], <1 x i32> <i32 1>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I]] to i64
// CHECK-NEXT: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) #[[ATTR3]]
// CHECK-NEXT: [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]])
// CHECK-NEXT: [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
// CHECK-NEXT: ret i128 [[VMULL_P641_I_I]]
//


@ -11,7 +11,7 @@
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4:[0-9]+]]
// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
@ -23,7 +23,7 @@ int16x4_t test_vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
@ -35,7 +35,7 @@ int32x2_t test_vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
@ -47,7 +47,7 @@ int16x8_t test_vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
@ -59,7 +59,7 @@ int32x4_t test_vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0
// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@ -69,7 +69,7 @@ int16_t test_vqrdmlahh_s16(int16_t a, int16_t b, int16_t c) {
// CHECK-LABEL: @test_vqrdmlahs_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
int32_t test_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) {
@ -82,7 +82,7 @@ int32_t test_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@ -93,7 +93,7 @@ int16_t test_vqrdmlahh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
// CHECK-LABEL: @test_vqrdmlahs_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
int32_t test_vqrdmlahs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
@ -106,7 +106,7 @@ int32_t test_vqrdmlahs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLAHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLAHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@ -117,7 +117,7 @@ int16_t test_vqrdmlahh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
// CHECK-LABEL: @test_vqrdmlahs_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLAHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLAHS_S32_I]]
//
int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
@ -129,7 +129,7 @@ int32_t test_vqrdmlahs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
@ -141,7 +141,7 @@ int16x4_t test_vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
@ -153,7 +153,7 @@ int32x2_t test_vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
@ -165,7 +165,7 @@ int16x8_t test_vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
@ -177,7 +177,7 @@ int32x4_t test_vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[C:%.*]], i64 0
// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@ -187,7 +187,7 @@ int16_t test_vqrdmlshh_s16(int16_t a, int16_t b, int16_t c) {
// CHECK-LABEL: @test_vqrdmlshs_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]])
// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
int32_t test_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) {
@ -200,7 +200,7 @@ int32_t test_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@ -211,7 +211,7 @@ int16_t test_vqrdmlshh_lane_s16(int16_t a, int16_t b, int16x4_t c) {
// CHECK-LABEL: @test_vqrdmlshs_lane_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <2 x i32> [[C:%.*]], i32 1
// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGET_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
int32_t test_vqrdmlshs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
@ -224,7 +224,7 @@ int32_t test_vqrdmlshs_lane_s32(int32_t a, int32_t b, int32x2_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[A:%.*]], i64 0
// CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 [[B:%.*]], i64 0
// CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLSHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <4 x i16> [[TMP2]])
// CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[VQRDMLSHH_S16_I]], i64 0
// CHECK-NEXT: ret i16 [[TMP3]]
//
@ -235,7 +235,7 @@ int16_t test_vqrdmlshh_laneq_s16(int16_t a, int16_t b, int16x8_t c) {
// CHECK-LABEL: @test_vqrdmlshs_laneq_s32(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 3
// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VQRDMLSHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[VGETQ_LANE]])
// CHECK-NEXT: ret i32 [[VQRDMLSHS_S32_I]]
//
int32_t test_vqrdmlshs_laneq_s32(int32_t a, int32_t b, int32x4_t c) {


@ -12,7 +12,7 @@
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[A]]) #[[ATTR4:[0-9]+]]
// CHECK-NEXT: [[VABS1_I:%.*]] = call <4 x half> @llvm.fabs.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VABS1_I]]
//
float16x4_t test_vabs_f16(float16x4_t a) {
@ -23,7 +23,7 @@ float16x4_t test_vabs_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VABS1_I:%.*]] = call <8 x half> @llvm.fabs.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VABS1_I]]
//
float16x8_t test_vabsq_f16(float16x8_t a) {
@ -198,7 +198,7 @@ float16x8_t test_vcvtq_f16_u16 (uint16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]]
//
int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
@ -209,7 +209,7 @@ int16x4_t test_vcvt_s16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzs.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]]
//
int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
@ -220,7 +220,7 @@ int16x8_t test_vcvtq_s16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTZ1_I]]
//
uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
@ -231,7 +231,7 @@ uint16x4_t test_vcvt_u16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTZ1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtzu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTZ1_I]]
//
uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
@ -242,7 +242,7 @@ uint16x8_t test_vcvtq_u16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTA1_I]]
//
int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
@ -253,7 +253,7 @@ int16x4_t test_vcvta_s16_f16 (float16x4_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTA1_I]]
//
uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
@ -264,7 +264,7 @@ uint16x4_t test_vcvta_u16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTA1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtas.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTA1_I]]
//
int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
@ -275,7 +275,7 @@ int16x8_t test_vcvtaq_s16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTM1_I]]
//
int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
@ -286,7 +286,7 @@ int16x4_t test_vcvtm_s16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtms.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTM1_I]]
//
int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
@ -297,7 +297,7 @@ int16x8_t test_vcvtmq_s16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTM1_I]]
//
uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
@ -308,7 +308,7 @@ uint16x4_t test_vcvtm_u16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTM1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtmu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTM1_I]]
//
uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
@ -319,7 +319,7 @@ uint16x8_t test_vcvtmq_u16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTN1_I]]
//
int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
@ -330,7 +330,7 @@ int16x4_t test_vcvtn_s16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtns.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTN1_I]]
//
int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
@ -341,7 +341,7 @@ int16x8_t test_vcvtnq_s16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTN1_I]]
//
uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
@ -352,7 +352,7 @@ uint16x4_t test_vcvtn_u16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTN1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtnu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTN1_I]]
//
uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
@ -363,7 +363,7 @@ uint16x8_t test_vcvtnq_u16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTP1_I]]
//
int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
@ -374,7 +374,7 @@ int16x4_t test_vcvtp_s16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtps.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTP1_I]]
//
int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
@ -385,7 +385,7 @@ int16x8_t test_vcvtpq_s16_f16 (float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCVTP1_I]]
//
uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
@ -396,7 +396,7 @@ uint16x4_t test_vcvtp_u16_f16 (float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCVTP1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.fcvtpu.v8i16.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCVTP1_I]]
//
uint16x8_t test_vcvtpq_u16_f16 (float16x8_t a) {
@ -428,7 +428,7 @@ float16x8_t test_vnegq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRECPE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRECPE_V1_I]]
//
float16x4_t test_vrecpe_f16(float16x4_t a) {
@ -439,7 +439,7 @@ float16x4_t test_vrecpe_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRECPEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRECPEQ_V1_I]]
//
float16x8_t test_vrecpeq_f16(float16x8_t a) {
@ -450,7 +450,7 @@ float16x8_t test_vrecpeq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <4 x half> @llvm.trunc.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <4 x half> @llvm.trunc.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDZ1_I]]
//
float16x4_t test_vrnd_f16(float16x4_t a) {
@ -461,7 +461,7 @@ float16x4_t test_vrnd_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <8 x half> @llvm.trunc.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDZ1_I:%.*]] = call <8 x half> @llvm.trunc.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDZ1_I]]
//
float16x8_t test_vrndq_f16(float16x8_t a) {
@ -472,7 +472,7 @@ float16x8_t test_vrndq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <4 x half> @llvm.round.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <4 x half> @llvm.round.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDA1_I]]
//
float16x4_t test_vrnda_f16(float16x4_t a) {
@ -483,7 +483,7 @@ float16x4_t test_vrnda_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <8 x half> @llvm.round.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDA1_I:%.*]] = call <8 x half> @llvm.round.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDA1_I]]
//
float16x8_t test_vrndaq_f16(float16x8_t a) {
@ -494,7 +494,7 @@ float16x8_t test_vrndaq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDI_V1_I:%.*]] = call <4 x half> @llvm.nearbyint.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDI_V1_I]]
//
float16x4_t test_vrndi_f16(float16x4_t a) {
@ -505,7 +505,7 @@ float16x4_t test_vrndi_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDIQ_V1_I:%.*]] = call <8 x half> @llvm.nearbyint.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDIQ_V1_I]]
//
float16x8_t test_vrndiq_f16(float16x8_t a) {
@ -516,7 +516,7 @@ float16x8_t test_vrndiq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <4 x half> @llvm.floor.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <4 x half> @llvm.floor.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDM1_I]]
//
float16x4_t test_vrndm_f16(float16x4_t a) {
@ -527,7 +527,7 @@ float16x4_t test_vrndm_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <8 x half> @llvm.floor.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDM1_I:%.*]] = call <8 x half> @llvm.floor.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDM1_I]]
//
float16x8_t test_vrndmq_f16(float16x8_t a) {
@ -538,7 +538,7 @@ float16x8_t test_vrndmq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <4 x half> @llvm.roundeven.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <4 x half> @llvm.roundeven.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDN1_I]]
//
float16x4_t test_vrndn_f16(float16x4_t a) {
@ -549,7 +549,7 @@ float16x4_t test_vrndn_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <8 x half> @llvm.roundeven.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDN1_I:%.*]] = call <8 x half> @llvm.roundeven.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDN1_I]]
//
float16x8_t test_vrndnq_f16(float16x8_t a) {
@ -560,7 +560,7 @@ float16x8_t test_vrndnq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <4 x half> @llvm.ceil.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <4 x half> @llvm.ceil.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDP1_I]]
//
float16x4_t test_vrndp_f16(float16x4_t a) {
@ -571,7 +571,7 @@ float16x4_t test_vrndp_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <8 x half> @llvm.ceil.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDP1_I:%.*]] = call <8 x half> @llvm.ceil.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDP1_I]]
//
float16x8_t test_vrndpq_f16(float16x8_t a) {
@ -582,7 +582,7 @@ float16x8_t test_vrndpq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <4 x half> @llvm.rint.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <4 x half> @llvm.rint.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRNDX1_I]]
//
float16x4_t test_vrndx_f16(float16x4_t a) {
@ -593,7 +593,7 @@ float16x4_t test_vrndx_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <8 x half> @llvm.rint.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRNDX1_I:%.*]] = call <8 x half> @llvm.rint.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRNDX1_I]]
//
float16x8_t test_vrndxq_f16(float16x8_t a) {
@ -604,7 +604,7 @@ float16x8_t test_vrndxq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRSQRTE_V1_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VRSQRTE_V1_I]]
//
float16x4_t test_vrsqrte_f16(float16x4_t a) {
@ -615,7 +615,7 @@ float16x4_t test_vrsqrte_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VRSQRTEQ_V1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VRSQRTEQ_V1_I]]
//
float16x8_t test_vrsqrteq_f16(float16x8_t a) {
@ -626,7 +626,7 @@ float16x8_t test_vrsqrteq_f16(float16x8_t a) {
// CHECK-SAME: (<4 x half> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VSQRT_I:%.*]] = call <4 x half> @llvm.sqrt.v4f16(<4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[VSQRT_I]]
//
float16x4_t test_vsqrt_f16(float16x4_t a) {
@ -637,7 +637,7 @@ float16x4_t test_vsqrt_f16(float16x4_t a) {
// CHECK-SAME: (<8 x half> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VSQRT_I:%.*]] = call <8 x half> @llvm.sqrt.v8f16(<8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[VSQRT_I]]
//
float16x8_t test_vsqrtq_f16(float16x8_t a) {
@ -669,7 +669,7 @@ float16x8_t test_vaddq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VABD2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fabd.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VABD2_I]]
//
float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) {
@ -681,7 +681,7 @@ float16x4_t test_vabd_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VABD2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fabd.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VABD2_I]]
//
float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) {
@ -693,7 +693,7 @@ float16x8_t test_vabdq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VCAGE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x i16> [[VCAGE_V2_I]]
//
uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) {
@ -705,7 +705,7 @@ uint16x4_t test_vcage_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VCAGEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x i16> [[VCAGEQ_V2_I]]
//
uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) {
@ -717,7 +717,7 @@ uint16x8_t test_vcageq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VCAGT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x i16> [[VCAGT_V2_I]]
//
uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) {
@ -729,7 +729,7 @@ uint16x4_t test_vcagt_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VCAGTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x i16> [[VCAGTQ_V2_I]]
//
uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) {
@ -741,7 +741,7 @@ uint16x8_t test_vcagtq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCALE_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facge.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCALE_V2_I]]
//
uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) {
@ -753,7 +753,7 @@ uint16x4_t test_vcale_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCALEQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facge.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCALEQ_V2_I]]
//
uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) {
@ -765,7 +765,7 @@ uint16x8_t test_vcaleq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCALT_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.facgt.v4i16.v4f16(<4 x half> [[B]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x i16> [[VCALT_V2_I]]
//
uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) {
@ -777,7 +777,7 @@ uint16x4_t test_vcalt_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[VCALTQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.facgt.v8i16.v8f16(<8 x half> [[B]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x i16> [[VCALTQ_V2_I]]
//
uint16x8_t test_vcaltq_f16(float16x8_t a, float16x8_t b) {
@ -1015,7 +1015,7 @@ float16x8_t test_vdivq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmax.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMAX2_I]]
//
float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) {
@ -1027,7 +1027,7 @@ float16x4_t test_vmax_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmax.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMAX2_I]]
//
float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) {
@ -1039,7 +1039,7 @@ float16x8_t test_vmaxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnm.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMAXNM2_I]]
//
float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) {
@ -1051,7 +1051,7 @@ float16x4_t test_vmaxnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnm.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMAXNM2_I]]
//
float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) {
@ -1063,7 +1063,7 @@ float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmin.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMIN2_I]]
//
float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) {
@ -1075,7 +1075,7 @@ float16x4_t test_vmin_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmin.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMIN2_I]]
//
float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) {
@ -1087,7 +1087,7 @@ float16x8_t test_vminq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnm.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMINNM2_I]]
//
float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) {
@ -1099,7 +1099,7 @@ float16x4_t test_vminnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnm.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMINNM2_I]]
//
float16x8_t test_vminnmq_f16(float16x8_t a, float16x8_t b) {
@ -1131,7 +1131,7 @@ float16x8_t test_vmulq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
float16x4_t test_vmulx_f16(float16x4_t a, float16x4_t b) {
@ -1143,7 +1143,7 @@ float16x4_t test_vmulx_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) {
@ -1155,7 +1155,7 @@ float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VPADD_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: [[VPADD_V3_I:%.*]] = bitcast <4 x half> [[VPADD_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VPADD_V2_I]]
//
@ -1168,7 +1168,7 @@ float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VPADDQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: [[VPADDQ_V3_I:%.*]] = bitcast <8 x half> [[VPADDQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VPADDQ_V2_I]]
//
@ -1181,7 +1181,7 @@ float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMAX2_I]]
//
float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) {
@ -1193,7 +1193,7 @@ float16x4_t test_vpmax_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VPMAX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMAX2_I]]
//
float16x8_t test_vpmaxq_f16(float16x8_t a, float16x8_t b) {
@ -1205,7 +1205,7 @@ float16x8_t test_vpmaxq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmaxnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMAXNM2_I]]
//
float16x4_t test_vpmaxnm_f16(float16x4_t a, float16x4_t b) {
@ -1217,7 +1217,7 @@ float16x4_t test_vpmaxnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VPMAXNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmaxnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMAXNM2_I]]
//
float16x8_t test_vpmaxnmq_f16(float16x8_t a, float16x8_t b) {
@ -1229,7 +1229,7 @@ float16x8_t test_vpmaxnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMIN2_I]]
//
float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) {
@ -1241,7 +1241,7 @@ float16x4_t test_vpmin_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VPMIN2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMIN2_I]]
//
float16x8_t test_vpminq_f16(float16x8_t a, float16x8_t b) {
@ -1253,7 +1253,7 @@ float16x8_t test_vpminq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fminnmp.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: ret <4 x half> [[VPMINNM2_I]]
//
float16x4_t test_vpminnm_f16(float16x4_t a, float16x4_t b) {
@ -1265,7 +1265,7 @@ float16x4_t test_vpminnm_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VPMINNM2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fminnmp.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: ret <8 x half> [[VPMINNM2_I]]
//
float16x8_t test_vpminnmq_f16(float16x8_t a, float16x8_t b) {
@ -1277,7 +1277,7 @@ float16x8_t test_vpminnmq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VRECPS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: [[VRECPS_V3_I:%.*]] = bitcast <4 x half> [[VRECPS_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VRECPS_V2_I]]
//
@ -1290,7 +1290,7 @@ float16x4_t test_vrecps_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VRECPSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: [[VRECPSQ_V3_I:%.*]] = bitcast <8 x half> [[VRECPSQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VRECPSQ_V2_I]]
//
@ -1303,7 +1303,7 @@ float16x8_t test_vrecpsq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> [[A]], <4 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VRSQRTS_V2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> [[A]], <4 x half> [[B]])
// CHECK-NEXT: [[VRSQRTS_V3_I:%.*]] = bitcast <4 x half> [[VRSQRTS_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <4 x half> [[VRSQRTS_V2_I]]
//
@ -1316,7 +1316,7 @@ float16x4_t test_vrsqrts_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> [[A]], <8 x half> [[B]]) #[[ATTR4]]
// CHECK-NEXT: [[VRSQRTSQ_V2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> [[A]], <8 x half> [[B]])
// CHECK-NEXT: [[VRSQRTSQ_V3_I:%.*]] = bitcast <8 x half> [[VRSQRTSQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <8 x half> [[VRSQRTSQ_V2_I]]
//
@ -1350,7 +1350,7 @@ float16x8_t test_vsubq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[C]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[TMP3]]
//
float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
@ -1363,7 +1363,7 @@ float16x4_t test_vfma_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[C]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[TMP3]]
//
float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
@ -1377,7 +1377,7 @@ float16x8_t test_vfmaq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG_I]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[C]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG_I]], <4 x half> [[C]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[TMP3]]
//
float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
@ -1391,7 +1391,7 @@ float16x4_t test_vfms_f16(float16x4_t a, float16x4_t b, float16x4_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG_I]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[C]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG_I]], <8 x half> [[C]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[TMP3]]
//
float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
@ -1476,7 +1476,7 @@ float16x8_t test_vfmaq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[B]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[B]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[TMP3]]
//
float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
@ -1497,7 +1497,7 @@ float16x4_t test_vfma_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[B]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[TMP3]]
//
float16x8_t test_vfmaq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
@ -1609,7 +1609,7 @@ float16x8_t test_vfmsq_laneq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[FNEG]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x half> @llvm.fma.v4f16(<4 x half> [[FNEG]], <4 x half> [[VECINIT3]], <4 x half> [[A]])
// CHECK-NEXT: ret <4 x half> [[TMP3]]
//
float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
@ -1631,7 +1631,7 @@ float16x4_t test_vfms_n_f16(float16x4_t a, float16x4_t b, float16_t c) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[FNEG]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]]) #[[ATTR4]]
// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.fma.v8f16(<8 x half> [[FNEG]], <8 x half> [[VECINIT7]], <8 x half> [[A]])
// CHECK-NEXT: ret <8 x half> [[TMP3]]
//
float16x8_t test_vfmsq_n_f16(float16x8_t a, float16x8_t b, float16_t c) {
@ -1803,7 +1803,7 @@ float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) {
@ -1818,7 +1818,7 @@ float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) {
@ -1833,7 +1833,7 @@ float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) {
@ -1848,7 +1848,7 @@ float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) {
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #[[ATTR4]]
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) {
@ -1864,7 +1864,7 @@ float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) {
// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[B]], i32 3
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VECINIT3]] to <8 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[VECINIT3]]) #[[ATTR4]]
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[VECINIT3]])
// CHECK-NEXT: ret <4 x half> [[VMULX2_I]]
//
float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) {
@ -1884,7 +1884,7 @@ float16x4_t test_vmulx_n_f16(float16x4_t a, float16_t b) {
// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[B]], i32 7
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VECINIT7]] to <16 x i8>
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[VECINIT7]]) #[[ATTR4]]
// CHECK-NEXT: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[VECINIT7]])
// CHECK-NEXT: ret <8 x half> [[VMULX2_I]]
//
float16x8_t test_vmulxq_n_f16(float16x8_t a, float16_t b) {
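
All of the fp16 hunks above make the same change: the inlined NEON intrinsic call site loses its explicit #[[ATTR4]] group, since the intrinsic is itself nounwind and the attribute added no information. A minimal sketch of one such pattern (the repro_ name and the compile flags are assumptions, not taken from the test file):

// Plausible invocation:
//   clang --target=aarch64-linux-gnu -march=armv8.2-a+fp16 -O1 -S -emit-llvm repro_fp16.c
#include <arm_neon.h>

// Per the CHECK lines above, this lowers to a bare call to
// @llvm.aarch64.neon.fminnm.v8f16 with no trailing attribute group.
float16x8_t repro_vminnmq(float16x8_t a, float16x8_t b) {
  return vminnmq_f16(a, b);
}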


@ -262,7 +262,7 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]]) #[[ATTR3:[0-9]+]]
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V2_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]] to <16 x i8>
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
@ -270,7 +270,7 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) #[[ATTR3:[0-9]+]]
// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
// CHECK-A32-HARDFP-NEXT: ret <4 x bfloat> [[VCVTFP2BF1_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvt_bf16_f32(
@ -281,7 +281,7 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <4 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <4 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) #[[ATTR3:[0-9]+]]
// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], <4 x bfloat>* [[RETVAL_I1]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat>* [[RETVAL_I1]] to <2 x i32>*
@ -307,14 +307,14 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_V2_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]] to <16 x i8>
// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_V1_I]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x bfloat> zeroinitializer, <4 x bfloat> [[VCVTFP2BF1_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I]]
//
@ -332,7 +332,7 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
// CHECK-A32-SOFTFP-NEXT: [[RETVAL:%.*]] = alloca <8 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[COERCE:%.*]] = alloca <8 x bfloat>, align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
// CHECK-A32-SOFTFP-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP1]], <4 x bfloat>* [[RETVAL_I1]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat>* [[RETVAL_I1]] to <2 x i32>*
@ -378,14 +378,14 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_V2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]]) #[[ATTR3]]
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_V2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_V3_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_V2_I]] to <16 x i8>
// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_V2_I]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-A32-HARDFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x bfloat> @llvm.arm.neon.vcvtfp2bf.v4bf16(<4 x float> [[A]])
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[INACTIVE:%.*]], <8 x bfloat> [[INACTIVE]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-A32-HARDFP-NEXT: [[SHUFFLE_I8:%.*]] = shufflevector <4 x bfloat> [[VCVTFP2BF1_I]], <4 x bfloat> [[SHUFFLE_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK-A32-HARDFP-NEXT: ret <8 x bfloat> [[SHUFFLE_I8]]
@ -420,7 +420,7 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
// CHECK-A32-SOFTFP-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 8
// CHECK-A32-SOFTFP-NEXT: [[__P01_I:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[__P0_I]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP4:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-A32-SOFTFP-NEXT: [[VCVTFP2BF1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2bf.v4i16(<4 x float> [[A]])
// CHECK-A32-SOFTFP-NEXT: [[TMP5:%.*]] = bitcast <4 x i16> [[VCVTFP2BF1_I]] to <4 x bfloat>
// CHECK-A32-SOFTFP-NEXT: store <4 x bfloat> [[TMP5]], <4 x bfloat>* [[RETVAL_I8]], align 8
// CHECK-A32-SOFTFP-NEXT: [[TMP6:%.*]] = bitcast <4 x bfloat>* [[RETVAL_I8]] to <2 x i32>*
@ -477,17 +477,17 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
// CHECK-A64-LABEL: @test_vcvth_bf16_f32(
// CHECK-A64-NEXT: entry:
// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]]) #[[ATTR3]]
// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
//
// CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
// CHECK-A32-HARDFP-NEXT: entry:
// CHECK-A32-HARDFP-NEXT: [[VCVTBFP2BF_I:%.*]] = call bfloat @llvm.arm.neon.vcvtbfp2bf(float [[A:%.*]]) #[[ATTR3]]
// CHECK-A32-HARDFP-NEXT: [[VCVTBFP2BF_I:%.*]] = call bfloat @llvm.arm.neon.vcvtbfp2bf(float [[A:%.*]])
// CHECK-A32-HARDFP-NEXT: ret bfloat [[VCVTBFP2BF_I]]
//
// CHECK-A32-SOFTFP-LABEL: @test_vcvth_bf16_f32(
// CHECK-A32-SOFTFP-NEXT: entry:
// CHECK-A32-SOFTFP-NEXT: [[VCVTBFP2BF_I:%.*]] = call bfloat @llvm.arm.neon.vcvtbfp2bf(float [[A:%.*]]) #[[ATTR3]]
// CHECK-A32-SOFTFP-NEXT: [[VCVTBFP2BF_I:%.*]] = call bfloat @llvm.arm.neon.vcvtbfp2bf(float [[A:%.*]])
// CHECK-A32-SOFTFP-NEXT: ret bfloat [[VCVTBFP2BF_I]]
//
bfloat16_t test_vcvth_bf16_f32(float32_t a) {
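
The bfloat16 conversion tests are updated across the A64, A32 hard-float, and A32 soft-float check prefixes alike. A sketch of the scalar case (name and flags assumed; the real test drives several targets from a single file):

// Plausible invocation:
//   clang --target=aarch64-linux-gnu -march=armv8.6-a+bf16 -O1 -S -emit-llvm repro_bf16.c
#include <arm_neon.h>

// On AArch64 this becomes a call to @llvm.aarch64.neon.bfcvt; the A32
// configurations check @llvm.arm.neon.vcvtbfp2bf instead.
bfloat16_t repro_cvth(float32_t a) {
  return vcvth_bf16_f32(a);
}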


@ -17,7 +17,7 @@
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <8 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[B]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
@ -29,7 +29,7 @@ float32x2_t test_vbfdot_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
@ -52,7 +52,7 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <8 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@ -75,7 +75,7 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <16 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@ -98,7 +98,7 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x float> [[R:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x bfloat> [[A:%.*]] to <8 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x bfloat> [[TMP5]] to <8 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.arm.neon.bfdot.v2f32.v4bf16(<2 x float> [[R]], <4 x bfloat> [[A]], <4 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
//
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@ -121,7 +121,7 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
// CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <16 x i8>
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfdot.v4f32.v8bf16(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[TMP5]])
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
//
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {
@ -133,7 +133,7 @@ float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b)
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMMLAQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmmla(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMMLAQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMMLAQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMMLAQ_V3_I]]
//
@ -146,7 +146,7 @@ float32x4_t test_vbfmmlaq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@ -159,7 +159,7 @@ float32x4_t test_vbfmlalbq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[B]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
@ -188,7 +188,7 @@ float32x4_t test_vbfmlaltq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@ -217,7 +217,7 @@ float32x4_t test_vbfmlalbq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALBQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalb(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALBQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALBQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALBQ_V3_I]]
//
@ -246,7 +246,7 @@ float32x4_t test_vbfmlalbq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
@ -275,7 +275,7 @@ float32x4_t test_vbfmlaltq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[R:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[VECINIT35]] to <16 x i8>
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]]) #[[ATTR3]]
// CHECK-NEXT: [[VBFMLALTQ_V3_I:%.*]] = call <4 x float> @llvm.arm.neon.bfmlalt(<4 x float> [[R]], <8 x bfloat> [[A]], <8 x bfloat> [[VECINIT35]])
// CHECK-NEXT: [[VBFMLALTQ_V4_I:%.*]] = bitcast <4 x float> [[VBFMLALTQ_V3_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VBFMLALTQ_V3_I]]
//
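
The bfloat16 dot-product and matrix-multiply tests change in the same mechanical way. A sketch of the plain (non-lane) dot product (name is an assumption; AArch32 bf16 option spellings vary by toolchain, so no invocation is suggested here):

#include <arm_neon.h>

// Checked above as @llvm.arm.neon.bfdot.v2f32.v4bf16, now without a
// redundant call-site attribute group.
float32x2_t repro_bfdot(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b) {
  return vbfdot_f32(r, a, b);
}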


@ -16,7 +16,7 @@
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[ACCUM]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[LHS]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[RHS]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[RHS]], <2 x float> [[ACCUM]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LHS]], <2 x float> [[RHS]], <2 x float> [[ACCUM]])
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) {
@ -29,7 +29,7 @@ float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs)
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACCUM]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[LHS]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[RHS]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[RHS]], <4 x float> [[ACCUM]]) #[[ATTR3]]
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LHS]], <4 x float> [[RHS]], <4 x float> [[ACCUM]])
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) {
@ -44,7 +44,7 @@ float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs)
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[TMP3:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[B]], <2 x float> [[VECINIT1_I]], <2 x float> [[A]])
// CHECK-NEXT: ret <2 x float> [[TMP3]]
//
float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
@ -61,7 +61,7 @@ float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[B]], <4 x float> [[VECINIT3_I]], <4 x float> [[A]])
// CHECK-NEXT: ret <4 x float> [[TMP3]]
//
float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
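
This file pins the operand order of the fused multiply-add lowering; only the call-site attribute changes here. A sketch (name and flags assumed):

// Plausible invocation:
//   clang --target=arm-none-eabi -march=armv7-a -mfpu=neon-vfpv4 -mfloat-abi=hard -O1 -S -emit-llvm repro_fma.c
#include <arm_neon.h>

// vfma_f32(accum, lhs, rhs) computes accum + lhs*rhs; the CHECK lines above
// spell that as @llvm.fma.v2f32(lhs, rhs, accum), accumulator last.
float32x2_t repro_fma(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) {
  return vfma_f32(accum, lhs, rhs);
}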


@ -10,7 +10,7 @@
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[A]], <2 x float> [[B]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[A]], <2 x float> [[B]])
// CHECK-NEXT: [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VMAXNM_V2_I]]
//
@ -23,7 +23,7 @@ float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[A]], <4 x float> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[A]], <4 x float> [[B]])
// CHECK-NEXT: [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VMAXNMQ_V2_I]]
//
@ -36,7 +36,7 @@ float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[B]] to <8 x i8>
// CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[A]], <2 x float> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[A]], <2 x float> [[B]])
// CHECK-NEXT: [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8>
// CHECK-NEXT: ret <2 x float> [[VMINNM_V2_I]]
//
@ -49,7 +49,7 @@ float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B]] to <16 x i8>
// CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[A]], <4 x float> [[B]]) #[[ATTR3]]
// CHECK-NEXT: [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[A]], <4 x float> [[B]])
// CHECK-NEXT: [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8>
// CHECK-NEXT: ret <4 x float> [[VMINNMQ_V2_I]]
//
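
The same attribute-group removal applies to the IEEE 754-2008 maxNum/minNum intrinsics. A sketch (name and flags assumed; the test targets AArch32 NEON):

// Plausible invocation:
//   clang --target=arm-none-eabi -march=armv8-a -mfpu=neon-fp-armv8 -mfloat-abi=hard -O1 -S -emit-llvm repro_maxnm.c
#include <arm_neon.h>

// Lowers to @llvm.arm.neon.vmaxnm.v2f32 per the CHECK lines; vminnm_f32
// and the 128-bit q-forms are symmetric.
float32x2_t repro_maxnm(float32x2_t a, float32x2_t b) {
  return vmaxnm_f32(a, b);
}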


@ -9,7 +9,7 @@
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTA_S32_V1_I]]
//
int32x2_t test_vcvta_s32_f32(float32x2_t a) {
@ -20,7 +20,7 @@ int32x2_t test_vcvta_s32_f32(float32x2_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTA_U32_V1_I]]
//
uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
@ -31,7 +31,7 @@ uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTAQ_S32_V1_I]]
//
int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
@ -42,7 +42,7 @@ int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTAQ_U32_V1_I]]
//
uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
@ -53,7 +53,7 @@ uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTN_S32_V1_I]]
//
int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
@ -64,7 +64,7 @@ int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTN_U32_V1_I]]
//
uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
@ -75,7 +75,7 @@ uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTNQ_S32_V1_I]]
//
int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
@ -86,7 +86,7 @@ int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTNQ_U32_V1_I]]
//
uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
@ -97,7 +97,7 @@ uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTP_S32_V1_I]]
//
int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
@ -108,7 +108,7 @@ int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTP_U32_V1_I]]
//
uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
@ -119,7 +119,7 @@ uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTPQ_S32_V1_I]]
//
int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
@ -130,7 +130,7 @@ int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTPQ_U32_V1_I]]
//
uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
@ -141,7 +141,7 @@ uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTM_S32_V1_I]]
//
int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
@ -152,7 +152,7 @@ int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
// CHECK-SAME: (<2 x float> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A]] to <8 x i8>
// CHECK-NEXT: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[A]])
// CHECK-NEXT: ret <2 x i32> [[VCVTM_U32_V1_I]]
//
uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
@ -163,7 +163,7 @@ uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTMQ_S32_V1_I]]
//
int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
@ -174,7 +174,7 @@ int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A]] to <16 x i8>
// CHECK-NEXT: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[A]]) #[[ATTR3]]
// CHECK-NEXT: [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[A]])
// CHECK-NEXT: ret <4 x i32> [[VCVTMQ_U32_V1_I]]
//
uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
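
The rounding-mode converts (vcvta/vcvtn/vcvtp/vcvtm, signed and unsigned, in both 64- and 128-bit forms) are all updated the same way. One representative sketch (name assumed; same AArch32 setup as above):

#include <arm_neon.h>

// Round-to-nearest-ties-away float-to-int convert; checked above as
// @llvm.arm.neon.vcvtas.v2i32.v2f32 with no explicit attribute group.
int32x2_t repro_cvta(float32x2_t a) {
  return vcvta_s32_f32(a);
}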


@ -13,12 +13,12 @@
// CHECK-ARM-LABEL: @test_vqrdmlah_s16(
// CHECK-ARM-NEXT: entry:
// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3:[0-9]+]]
// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_s16(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3:[0-9]+]]
// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@ -28,12 +28,12 @@ int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlah_s32(
// CHECK-ARM-NEXT: entry:
// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_s32(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@ -43,12 +43,12 @@ int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlahq_s16(
// CHECK-ARM-NEXT: entry:
// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_s16(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
@ -58,12 +58,12 @@ int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlahq_s32(
// CHECK-ARM-NEXT: entry:
// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_s32(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
@ -76,7 +76,7 @@ int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s16(
@ -84,7 +84,7 @@ int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLAH_V3_I]]
//
int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@ -97,7 +97,7 @@ int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlah_lane_s32(
@ -105,7 +105,7 @@ int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLAH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLAH_V3_I]]
//
int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@ -118,7 +118,7 @@ int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s16(
@ -126,7 +126,7 @@ int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlah.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLAHQ_V3_I]]
//
int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
@ -139,7 +139,7 @@ int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlahq_lane_s32(
@ -147,7 +147,7 @@ int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLAHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlah.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLAHQ_V3_I]]
//
int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
@ -157,12 +157,12 @@ int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_s16(
// CHECK-ARM-NEXT: entry:
// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_s16(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@ -172,12 +172,12 @@ int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlsh_s32(
// CHECK-ARM-NEXT: entry:
// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_s32(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@ -187,12 +187,12 @@ int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_s16(
// CHECK-ARM-NEXT: entry:
// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_s16(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
@ -202,12 +202,12 @@ int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
// CHECK-ARM-LABEL: @test_vqrdmlshq_s32(
// CHECK-ARM-NEXT: entry:
// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_s32(
// CHECK-AARCH64-NEXT: entry:
// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]])
// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
@ -220,7 +220,7 @@ int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-ARM-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s16(
@ -228,7 +228,7 @@ int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[LANE]])
// CHECK-AARCH64-NEXT: ret <4 x i16> [[VQRDMLSH_V3_I]]
//
int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
@ -241,7 +241,7 @@ int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-ARM-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlsh_lane_s32(
@ -249,7 +249,7 @@ int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLSH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <2 x i32> [[LANE]])
// CHECK-AARCH64-NEXT: ret <2 x i32> [[VQRDMLSH_V3_I]]
//
int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
@ -262,7 +262,7 @@ int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-ARM-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s16(
@ -270,7 +270,7 @@ int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmlsh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[LANE]])
// CHECK-AARCH64-NEXT: ret <8 x i16> [[VQRDMLSHQ_V3_I]]
//
int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
@ -283,7 +283,7 @@ int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-ARM-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-ARM-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-ARM-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
// CHECK-ARM-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-ARM-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
// CHECK-AARCH64-LABEL: @test_vqrdmlshq_lane_s32(
@ -291,7 +291,7 @@ int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
// CHECK-AARCH64-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8>
// CHECK-AARCH64-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
// CHECK-AARCH64-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]]) #[[ATTR3]]
// CHECK-AARCH64-NEXT: [[VQRDMLSHQ_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmlsh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[LANE]])
// CHECK-AARCH64-NEXT: ret <4 x i32> [[VQRDMLSHQ_V3_I]]
//
int32x4_t test_vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
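
All of the vqrdmlah/vqrdmlsh updates above are mechanical: each checked intrinsic call, inlined from the static always_inline wrappers in <arm_neon.h>, simply loses its explicit attribute-group reference. For orientation, a hedged sketch (not taken from the test file; the function name twice_rdmlah is illustrative) of the kind of source these generated checks correspond to, assuming a target with the ARMv8.1-A QRDMLAH extension:

#include <arm_neon.h>

// vqrdmlahq_s32 is a tiny static inline wrapper in <arm_neon.h>, so after
// inlining each call becomes a direct @llvm.arm.neon.vqrdmlah.v4i32 (or
// @llvm.aarch64.neon.sqrdmlah.v4i32) call site like the ones checked above.
int32x4_t twice_rdmlah(int32x4_t a, int32x4_t b, int32x4_t c) {
  int32x4_t x = vqrdmlahq_s32(a, b, c);
  int32x4_t y = vqrdmlahq_s32(a, b, c);
  return vaddq_s32(x, y);
}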

(diff of one file suppressed by the viewer because it is too large)

@ -36,7 +36,7 @@ AVAILABLE_EXTERNALLY void *memcpy(void *a, const void *b, size_t c) {
// CHECK-NEXT: [[TMP3:%.*]] = load i8*, i8** [[A_ADDR_I]], align 8
// CHECK-NEXT: [[TMP4:%.*]] = load i8*, i8** [[B_ADDR_I]], align 8
// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[C_ADDR_I]], align 8
// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP3]], i8* align 1 [[TMP4]], i64 [[TMP5]], i1 false) #[[ATTR3]]
// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP3]], i8* align 1 [[TMP4]], i64 [[TMP5]], i1 false)
// CHECK-NEXT: ret i8* [[TMP3]]
//
void *foo(void *a, const void *b, size_t c) {
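
The hunk above exercises an available_externally definition of memcpy inlined into foo, where the resulting @llvm.memcpy call site is now left untagged. A rough equivalent of that pattern (the wrapper name my_memcpy is hypothetical; the real test drives the linkage through its AVAILABLE_EXTERNALLY macro):

#include <stddef.h>

// A C99 'inline' definition without 'extern' is emitted with
// available_externally linkage, so the inliner can use the body even though
// this translation unit provides no external definition.
inline void *my_memcpy(void *dst, const void *src, size_t n) {
  return __builtin_memcpy(dst, src, n); // lowers to @llvm.memcpy
}

void *copy(void *dst, const void *src, size_t n) {
  return my_memcpy(dst, src, n);
}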

@ -1522,7 +1522,7 @@ v128_t test_v128_andnot(v128_t a, v128_t b) {
// CHECK-LABEL: @test_v128_any_true(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR7:[0-9]+]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> [[TMP0]])
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
//
@ -1532,7 +1532,7 @@ bool test_v128_any_true(v128_t a) {
// CHECK-LABEL: @test_v128_bitselect(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[MASK:%.*]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[MASK:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_v128_bitselect(v128_t a, v128_t b, v128_t mask) {
@ -1542,7 +1542,7 @@ v128_t test_v128_bitselect(v128_t a, v128_t b, v128_t mask) {
// CHECK-LABEL: @test_i8x16_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP0]], i1 false) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.abs.v16i8(<16 x i8> [[TMP0]], i1 false)
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -1564,7 +1564,7 @@ v128_t test_i8x16_neg(v128_t a) {
// CHECK-LABEL: @test_i8x16_all_true(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> [[TMP0]])
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
//
@ -1575,7 +1575,7 @@ bool test_i8x16_all_true(v128_t a) {
// CHECK-LABEL: @test_i8x16_bitmask(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> [[TMP0]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
uint32_t test_i8x16_bitmask(v128_t a) {
@ -1585,7 +1585,7 @@ uint32_t test_i8x16_bitmask(v128_t a) {
// CHECK-LABEL: @test_i8x16_popcnt(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -1651,7 +1651,7 @@ v128_t test_i8x16_add(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1663,7 +1663,7 @@ v128_t test_i8x16_add_sat(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1687,7 +1687,7 @@ v128_t test_i8x16_sub(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.signed.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.signed.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1699,7 +1699,7 @@ v128_t test_i8x16_sub_sat(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1711,7 +1711,7 @@ v128_t test_u8x16_sub_sat(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1723,7 +1723,7 @@ v128_t test_i8x16_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1735,7 +1735,7 @@ v128_t test_u8x16_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1747,7 +1747,7 @@ v128_t test_i8x16_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1759,7 +1759,7 @@ v128_t test_u8x16_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1770,7 +1770,7 @@ v128_t test_u8x16_avgr(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i16x8_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP0]], i1 false) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.abs.v8i16(<8 x i16> [[TMP0]], i1 false)
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -1792,7 +1792,7 @@ v128_t test_i16x8_neg(v128_t a) {
// CHECK-LABEL: @test_i16x8_all_true(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> [[TMP0]])
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
//
@ -1803,7 +1803,7 @@ bool test_i16x8_all_true(v128_t a) {
// CHECK-LABEL: @test_i16x8_bitmask(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> [[TMP0]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
uint32_t test_i16x8_bitmask(v128_t a) {
@ -1868,7 +1868,7 @@ v128_t test_i16x8_add(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1880,7 +1880,7 @@ v128_t test_i16x8_add_sat(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1904,7 +1904,7 @@ v128_t test_i16x8_sub(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.signed.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.signed.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1916,7 +1916,7 @@ v128_t test_i16x8_sub_sat(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1940,7 +1940,7 @@ v128_t test_i16x8_mul(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1952,7 +1952,7 @@ v128_t test_i16x8_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1964,7 +1964,7 @@ v128_t test_u16x8_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1976,7 +1976,7 @@ v128_t test_i16x8_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1988,7 +1988,7 @@ v128_t test_u16x8_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -1998,7 +1998,7 @@ v128_t test_u16x8_avgr(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i32x4_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[A:%.*]], i1 false) #[[ATTR7]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[A:%.*]], i1 false)
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_i32x4_abs(v128_t a) {
@ -2016,7 +2016,7 @@ v128_t test_i32x4_neg(v128_t a) {
// CHECK-LABEL: @test_i32x4_all_true(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> [[A:%.*]])
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP0]], 0
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
//
@ -2026,7 +2026,7 @@ bool test_i32x4_all_true(v128_t a) {
// CHECK-LABEL: @test_i32x4_bitmask(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> [[A:%.*]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> [[A:%.*]])
// CHECK-NEXT: ret i32 [[TMP0]]
//
uint32_t test_i32x4_bitmask(v128_t a) {
@ -2095,7 +2095,7 @@ v128_t test_i32x4_mul(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i32x4_min(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_i32x4_min(v128_t a, v128_t b) {
@ -2104,7 +2104,7 @@ v128_t test_i32x4_min(v128_t a, v128_t b) {
// CHECK-LABEL: @test_u32x4_min(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_u32x4_min(v128_t a, v128_t b) {
@ -2113,7 +2113,7 @@ v128_t test_u32x4_min(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i32x4_max(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_i32x4_max(v128_t a, v128_t b) {
@ -2122,7 +2122,7 @@ v128_t test_i32x4_max(v128_t a, v128_t b) {
// CHECK-LABEL: @test_u32x4_max(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: ret <4 x i32> [[TMP0]]
//
v128_t test_u32x4_max(v128_t a, v128_t b) {
@ -2133,7 +2133,7 @@ v128_t test_u32x4_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.dot(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.wasm.dot(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
v128_t test_i32x4_dot_i16x8(v128_t a, v128_t b) {
@ -2143,7 +2143,7 @@ v128_t test_i32x4_dot_i16x8(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i64x2_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP0]], i1 false) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i64> @llvm.abs.v2i64(<2 x i64> [[TMP0]], i1 false)
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2165,7 +2165,7 @@ v128_t test_i64x2_neg(v128_t a) {
// CHECK-LABEL: @test_i64x2_all_true(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64>
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> [[TMP0]])
// CHECK-NEXT: [[TOBOOL_I:%.*]] = icmp ne i32 [[TMP1]], 0
// CHECK-NEXT: ret i1 [[TOBOOL_I]]
//
@ -2176,7 +2176,7 @@ bool test_i64x2_all_true(v128_t a) {
// CHECK-LABEL: @test_i64x2_bitmask(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x i64>
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v2i64(<2 x i64> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.wasm.bitmask.v2i64(<2 x i64> [[TMP0]])
// CHECK-NEXT: ret i32 [[TMP1]]
//
uint32_t test_i64x2_bitmask(v128_t a) {
@ -2264,7 +2264,7 @@ v128_t test_i64x2_mul(v128_t a, v128_t b) {
// CHECK-LABEL: @test_f32x4_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2286,7 +2286,7 @@ v128_t test_f32x4_neg(v128_t a) {
// CHECK-LABEL: @test_f32x4_sqrt(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2297,7 +2297,7 @@ v128_t test_f32x4_sqrt(v128_t a) {
// CHECK-LABEL: @test_f32x4_ceil(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2308,7 +2308,7 @@ v128_t test_f32x4_ceil(v128_t a) {
// CHECK-LABEL: @test_f32x4_floor(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.floor.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2319,7 +2319,7 @@ v128_t test_f32x4_floor(v128_t a) {
// CHECK-LABEL: @test_f32x4_trunc(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.trunc.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2330,7 +2330,7 @@ v128_t test_f32x4_trunc(v128_t a) {
// CHECK-LABEL: @test_f32x4_nearest(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2390,7 +2390,7 @@ v128_t test_f32x4_div(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minimum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.minimum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2402,7 +2402,7 @@ v128_t test_f32x4_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maximum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.maximum.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2414,7 +2414,7 @@ v128_t test_f32x4_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2426,7 +2426,7 @@ v128_t test_f32x4_pmin(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2437,7 +2437,7 @@ v128_t test_f32x4_pmax(v128_t a, v128_t b) {
// CHECK-LABEL: @test_f64x2_abs(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.fabs.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2459,7 +2459,7 @@ v128_t test_f64x2_neg(v128_t a) {
// CHECK-LABEL: @test_f64x2_sqrt(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2470,7 +2470,7 @@ v128_t test_f64x2_sqrt(v128_t a) {
// CHECK-LABEL: @test_f64x2_ceil(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.ceil.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2481,7 +2481,7 @@ v128_t test_f64x2_ceil(v128_t a) {
// CHECK-LABEL: @test_f64x2_floor(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.floor.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2492,7 +2492,7 @@ v128_t test_f64x2_floor(v128_t a) {
// CHECK-LABEL: @test_f64x2_trunc(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.trunc.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2503,7 +2503,7 @@ v128_t test_f64x2_trunc(v128_t a) {
// CHECK-LABEL: @test_f64x2_nearest(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2563,7 +2563,7 @@ v128_t test_f64x2_div(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.minimum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.minimum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2575,7 +2575,7 @@ v128_t test_f64x2_min(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.maximum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.maximum.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2587,7 +2587,7 @@ v128_t test_f64x2_max(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2599,7 +2599,7 @@ v128_t test_f64x2_pmin(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2610,7 +2610,7 @@ v128_t test_f64x2_pmax(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i32x4_trunc_sat_f32x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptosi.sat.v4i32.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
v128_t test_i32x4_trunc_sat_f32x4(v128_t a) {
@ -2620,7 +2620,7 @@ v128_t test_i32x4_trunc_sat_f32x4(v128_t a) {
// CHECK-LABEL: @test_u32x4_trunc_sat_f32x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <4 x float>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.fptoui.sat.v4i32.v4f32(<4 x float> [[TMP0]])
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
v128_t test_u32x4_trunc_sat_f32x4(v128_t a) {
@ -2672,7 +2672,7 @@ v128_t test_f64x2_convert_low_u32x4(v128_t a) {
// CHECK-LABEL: @test_i32x4_trunc_sat_f64x2_zero(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptosi.sat.v2i32.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2683,7 +2683,7 @@ v128_t test_i32x4_trunc_sat_f64x2_zero(v128_t a) {
// CHECK-LABEL: @test_u32x4_trunc_sat_f64x2_zero(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <2 x double>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x i32> @llvm.fptoui.sat.v2i32.v2f64(<2 x double> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2767,7 +2767,7 @@ v128_t test_i64x2_shuffle(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.swizzle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.swizzle(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2779,7 +2779,7 @@ v128_t test_i8x16_swizzle(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2791,7 +2791,7 @@ v128_t test_i8x16_narrow_i16x8(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
@ -2801,7 +2801,7 @@ v128_t test_u8x16_narrow_i16x8(v128_t a, v128_t b) {
// CHECK-LABEL: @test_i16x8_narrow_i32x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
@ -2811,7 +2811,7 @@ v128_t test_i16x8_narrow_i32x4(v128_t a, v128_t b) {
// CHECK-LABEL: @test_u16x8_narrow_i32x4(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]])
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
@ -2958,7 +2958,7 @@ v128_t test_u64x2_extend_high_u32x4(v128_t a) {
// CHECK-LABEL: @test_i16x8_extadd_pairwise_i8x16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2969,7 +2969,7 @@ v128_t test_i16x8_extadd_pairwise_i8x16(v128_t a) {
// CHECK-LABEL: @test_u16x8_extadd_pairwise_u8x16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8> [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP2]]
//
@ -2980,7 +2980,7 @@ v128_t test_u16x8_extadd_pairwise_u8x16(v128_t a) {
// CHECK-LABEL: @test_i32x4_extadd_pairwise_i16x8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> [[TMP0]])
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
v128_t test_i32x4_extadd_pairwise_i16x8(v128_t a) {
@ -2990,7 +2990,7 @@ v128_t test_i32x4_extadd_pairwise_i16x8(v128_t a) {
// CHECK-LABEL: @test_u32x4_extadd_pairwise_u16x8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16> [[TMP0]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16> [[TMP0]])
// CHECK-NEXT: ret <4 x i32> [[TMP1]]
//
v128_t test_u32x4_extadd_pairwise_u16x8(v128_t a) {
@ -3181,7 +3181,7 @@ v128_t test_u64x2_extmul_high_u32x4(v128_t a, v128_t b) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[B:%.*]] to <8 x i16>
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]]) #[[ATTR7]]
// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
// CHECK-NEXT: ret <4 x i32> [[TMP3]]
//
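
The WebAssembly SIMD checks above follow the same pattern for the wrappers in <wasm_simd128.h>. As one concrete reference point, a hedged sketch of the source shape behind the add_sat checks (assumes a wasm32 target with -msimd128; not part of the test file):

#include <wasm_simd128.h>

// wasm_i8x16_add_sat is a static inline wrapper, so after inlining the body
// reduces to a direct @llvm.sadd.sat.v16i8 call with no call-site attributes.
v128_t saturating_add(v128_t a, v128_t b) {
  return wasm_i8x16_add_sat(a, b);
}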

@ -396,7 +396,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK1-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[TMP10:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4) #[[ATTR2:[0-9]+]]
// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4)
// CHECK1-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK1: .cancel.exit.i:
@ -1045,8 +1045,8 @@ for (int i = 0; i < argc; ++i) {
// CHECK3-NEXT: store i8* [[TMP9]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !14
// CHECK3-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK3-NEXT: [[TMP10:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]]) #[[ATTR2:[0-9]+]]
// CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i32 4) #[[ATTR2]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB12:[0-9]+]])
// CHECK3-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i32 4)
// CHECK3-NEXT: [[TMP12:%.*]] = icmp ne i32 [[TMP11]], 0
// CHECK3-NEXT: br i1 [[TMP12]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK3: .cancel.exit.i:
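
The __kmpc_cancel calls checked here are lowered inside an outlined task body; the i32 4 argument matches the kmp runtime's cancel-kind numbering for taskgroup (an inference from the runtime sources, not stated in this diff). A minimal sketch, assuming -fopenmp:

// Cancelling the enclosing taskgroup from inside a task produces a
// __kmpc_cancel call in the outlined task entry, followed by a branch to
// .cancel.exit.i / .cancel.continue.i as in the CHECK lines above.
void drain(int *work, int n) {
#pragma omp parallel
#pragma omp single
#pragma omp taskgroup
  for (int i = 0; i < n; ++i) {
#pragma omp task firstprivate(i)
    {
      if (work[i] < 0) {
#pragma omp cancel taskgroup
      }
      work[i] = 0;
    }
  }
}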

@ -390,14 +390,14 @@ for (int i = 0; i < argc; ++i) {
// CHECK1-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[TMP10:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4) #[[ATTR2:[0-9]+]]
// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4)
// CHECK1-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK1: .cancel.exit.i:
// CHECK1-NEXT: store i32 1, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !14
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: .cancel.continue.i:
// CHECK1-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4) #[[ATTR2]]
// CHECK1-NEXT: [[TMP14:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4)
// CHECK1-NEXT: [[TMP15:%.*]] = icmp ne i32 [[TMP14]], 0
// CHECK1-NEXT: br i1 [[TMP15]], label [[DOTCANCEL_EXIT1_I:%.*]], label [[DOTCANCEL_CONTINUE2_I:%.*]]
// CHECK1: .cancel.exit1.i:
@ -445,7 +445,7 @@ for (int i = 0; i < argc; ++i) {
// CHECK1-NEXT: store %struct.anon.0* [[TMP8]], %struct.anon.0** [[__CONTEXT_ADDR_I]], align 8, !noalias !24
// CHECK1-NEXT: [[TMP10:%.*]] = load %struct.anon.0*, %struct.anon.0** [[__CONTEXT_ADDR_I]], align 8, !noalias !24
// CHECK1-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !24
// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4) #[[ATTR2]]
// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP11]], i32 4)
// CHECK1-NEXT: [[TMP13:%.*]] = icmp ne i32 [[TMP12]], 0
// CHECK1-NEXT: br i1 [[TMP13]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK1: .cancel.exit.i:
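
This file's hunks pair a __kmpc_cancellationpoint call with a following __kmpc_cancel, i.e. a cancellation point polled before the task requests cancellation itself. A sketch under the same assumptions as the previous example:

// The cancellation point first checks for a pending taskgroup cancellation;
// if execution continues, the task may then issue its own cancel, giving
// the two consecutive runtime calls checked above.
void step(int *work) {
#pragma omp taskgroup
  {
#pragma omp task
    {
#pragma omp cancellation point taskgroup
      if (work[0] < 0) {
#pragma omp cancel taskgroup
      }
      work[0] = 0;
    }
  }
}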

@ -569,7 +569,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@ -592,7 +592,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@@ -478,7 +478,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -501,7 +501,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@@ -631,18 +631,18 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP32:%.*]] = load i8*, i8** [[TMP28]], align 8
// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP31]] to i8*
// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP35]] to i32*
// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP22]], i32 0, i32 2
// CHECK1-NEXT: [[TMP37:%.*]] = load i16*, i16** [[TMP36]], align 8
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP24]], 2
// CHECK1-NEXT: [[TMP39:%.*]] = udiv exact i64 [[TMP38]], ptrtoint (i16* getelementptr (i16, i16* null, i32 1) to i64)
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}}) #[[ATTR3]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}})
// CHECK1-NEXT: [[TMP41:%.*]] = bitcast i8* [[TMP40]] to i64*
// CHECK1-NEXT: store i64 [[TMP39]], i64* [[TMP41]], align 8
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP29]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = bitcast i16* [[TMP37]] to i8*
// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]])
// CHECK1-NEXT: [[CONV2_I:%.*]] = bitcast i8* [[TMP44]] to i16*
// CHECK1-NEXT: [[TMP45:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[CONV3_I:%.*]] = trunc i64 [[TMP45]] to i32

@@ -631,18 +631,18 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP32:%.*]] = load i8*, i8** [[TMP28]], align 8
// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP31]] to i8*
// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP35]] to i32*
// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP22]], i32 0, i32 2
// CHECK1-NEXT: [[TMP37:%.*]] = load i16*, i16** [[TMP36]], align 8
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP24]], 2
// CHECK1-NEXT: [[TMP39:%.*]] = udiv exact i64 [[TMP38]], ptrtoint (i16* getelementptr (i16, i16* null, i32 1) to i64)
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}}) #[[ATTR3]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}})
// CHECK1-NEXT: [[TMP41:%.*]] = bitcast i8* [[TMP40]] to i64*
// CHECK1-NEXT: store i64 [[TMP39]], i64* [[TMP41]], align 8
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP29]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = bitcast i16* [[TMP37]] to i8*
// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]])
// CHECK1-NEXT: [[CONV2_I:%.*]] = bitcast i8* [[TMP44]] to i16*
// CHECK1-NEXT: [[TMP45:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[CONV3_I:%.*]] = trunc i64 [[TMP45]] to i32

@@ -473,7 +473,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -496,7 +496,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@@ -428,7 +428,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -451,7 +451,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@@ -721,7 +721,7 @@ struct S {
// CHECK1-NEXT: [[TMP24:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !52
// CHECK1-NEXT: store i32 [[TMP24]], i32* [[I_I]], align 4, !noalias !52
// CHECK1-NEXT: [[TMP25:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !52
// CHECK1-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]], i32 4) #[[ATTR2]]
// CHECK1-NEXT: [[TMP26:%.*]] = call i32 @__kmpc_cancel(%struct.ident_t* @[[GLOB1]], i32 [[TMP25]], i32 4)
// CHECK1-NEXT: [[TMP27:%.*]] = icmp ne i32 [[TMP26]], 0
// CHECK1-NEXT: br i1 [[TMP27]], label [[DOTCANCEL_EXIT_I:%.*]], label [[DOTCANCEL_CONTINUE_I:%.*]]
// CHECK1: .cancel.exit.i:
@@ -729,7 +729,7 @@ struct S {
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__9_EXIT:%.*]]
// CHECK1: .cancel.continue.i:
// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !52
// CHECK1-NEXT: [[TMP29:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i32 4) #[[ATTR2]]
// CHECK1-NEXT: [[TMP29:%.*]] = call i32 @__kmpc_cancellationpoint(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i32 4)
// CHECK1-NEXT: [[TMP30:%.*]] = icmp ne i32 [[TMP29]], 0
// CHECK1-NEXT: br i1 [[TMP30]], label [[DOTCANCEL_EXIT2_I:%.*]], label [[DOTCANCEL_CONTINUE3_I:%.*]]
// CHECK1: .cancel.exit2.i:

@@ -479,7 +479,7 @@ void loop() {
// CHECK1-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP42]], i64 0, i64 0
// CHECK1-NEXT: [[TMP50:%.*]] = bitcast %struct.S* [[ARRAYIDX6_I]] to i8*
// CHECK1-NEXT: [[TMP51:%.*]] = bitcast %struct.S* [[TMP40]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP50]], i8* align 8 [[TMP51]], i64 8, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP50]], i8* align 8 [[TMP51]], i64 8, i1 false)
// CHECK1-NEXT: store i32 33, i32* [[TMP44]], align 4
// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14
// CHECK1-NEXT: [[ADD7_I:%.*]] = add nsw i32 [[TMP52]], 1
@@ -492,7 +492,7 @@ void loop() {
// CHECK1: .omp.lastprivate.then.i:
// CHECK1-NEXT: [[TMP55:%.*]] = bitcast %struct.S* [[TMP27]] to i8*
// CHECK1-NEXT: [[TMP56:%.*]] = bitcast %struct.S* [[TMP40]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 8, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 8, i1 false)
// CHECK1-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP41]], align 4
// CHECK1-NEXT: store i32 [[TMP57]], i32* [[TMP29]], align 4
// CHECK1-NEXT: [[ARRAY_BEGIN_I:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP31]], i32 0, i32 0
@@ -504,7 +504,7 @@ void loop() {
// CHECK1-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST_I:%.*]] = phi %struct.S* [ [[ARRAY_BEGIN_I]], [[DOTOMP_LASTPRIVATE_THEN_I]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT_I:%.*]], [[OMP_ARRAYCPY_BODY_I]] ]
// CHECK1-NEXT: [[TMP60:%.*]] = bitcast %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]] to i8*
// CHECK1-NEXT: [[TMP61:%.*]] = bitcast %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP60]], i8* align 8 [[TMP61]], i64 8, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP60]], i8* align 8 [[TMP61]], i64 8, i1 false)
// CHECK1-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT_I]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT_I]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_DONE_I:%.*]] = icmp eq %struct.S* [[OMP_ARRAYCPY_DEST_ELEMENT_I]], [[TMP59]]
@@ -512,7 +512,7 @@ void loop() {
// CHECK1: omp.arraycpy.done8.i:
// CHECK1-NEXT: [[TMP62:%.*]] = bitcast [2 x i32]* [[TMP33]] to i8*
// CHECK1-NEXT: [[TMP63:%.*]] = bitcast [2 x i32]* [[TMP43]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 8, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 8, i1 false)
// CHECK1-NEXT: [[TMP64:%.*]] = load i32, i32* [[TMP44]], align 4
// CHECK1-NEXT: store i32 [[TMP64]], i32* [[TMP39]], align 4
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__1_EXIT]]
@@ -891,7 +891,7 @@ void loop() {
// CHECK1-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP40]], i64 0, i64 0
// CHECK1-NEXT: [[TMP47:%.*]] = bitcast %struct.S.0* [[ARRAYIDX5_I]] to i8*
// CHECK1-NEXT: [[TMP48:%.*]] = bitcast %struct.S.0* [[TMP41]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 4, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 4, i1 false)
// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !28
// CHECK1-NEXT: [[ADD6_I:%.*]] = add nsw i32 [[TMP49]], 1
// CHECK1-NEXT: store i32 [[ADD6_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !28
@@ -905,7 +905,7 @@ void loop() {
// CHECK1-NEXT: store i32 [[TMP52]], i32* [[TMP27]], align 128
// CHECK1-NEXT: [[TMP53:%.*]] = bitcast [2 x i32]* [[TMP29]] to i8*
// CHECK1-NEXT: [[TMP54:%.*]] = bitcast [2 x i32]* [[TMP39]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP53]], i8* align 4 [[TMP54]], i64 8, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP53]], i8* align 4 [[TMP54]], i64 8, i1 false)
// CHECK1-NEXT: [[ARRAY_BEGIN_I:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP31]], i32 0, i32 0
// CHECK1-NEXT: [[TMP55:%.*]] = bitcast [2 x %struct.S.0]* [[TMP40]] to %struct.S.0*
// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN_I]], i64 2
@@ -915,7 +915,7 @@ void loop() {
// CHECK1-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST_I:%.*]] = phi %struct.S.0* [ [[ARRAY_BEGIN_I]], [[DOTOMP_LASTPRIVATE_THEN_I]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT_I:%.*]], [[OMP_ARRAYCPY_BODY_I]] ]
// CHECK1-NEXT: [[TMP57:%.*]] = bitcast %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]] to i8*
// CHECK1-NEXT: [[TMP58:%.*]] = bitcast %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 4, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 4, i1 false)
// CHECK1-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT_I]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT_I]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_DONE_I:%.*]] = icmp eq %struct.S.0* [[OMP_ARRAYCPY_DEST_ELEMENT_I]], [[TMP56]]
@@ -923,7 +923,7 @@ void loop() {
// CHECK1: omp.arraycpy.done7.i:
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast %struct.S.0* [[TMP35]] to i8*
// CHECK1-NEXT: [[TMP60:%.*]] = bitcast %struct.S.0* [[TMP41]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 4, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 4, i1 false)
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT]]
// CHECK1: .omp_outlined..3.exit:
// CHECK1-NEXT: ret i32 0
@@ -1195,7 +1195,7 @@ void loop() {
// CHECK3-NEXT: store double* [[TMP30]], double** [[TMP36]], align 8, !noalias !14
// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[REF_TMP_I]], i32 0, i32 1
// CHECK3-NEXT: store i32* [[TMP31]], i32** [[TMP37]], align 8, !noalias !14
// CHECK3-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 8 dereferenceable(16) [[REF_TMP_I]]) #[[ATTR3]]
// CHECK3-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 8 dereferenceable(16) [[REF_TMP_I]])
// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14
// CHECK3-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP38]], 1
// CHECK3-NEXT: store i32 [[ADD3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14

@@ -479,7 +479,7 @@ void loop() {
// CHECK1-NEXT: [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP42]], i64 0, i64 0
// CHECK1-NEXT: [[TMP50:%.*]] = bitcast %struct.S* [[ARRAYIDX6_I]] to i8*
// CHECK1-NEXT: [[TMP51:%.*]] = bitcast %struct.S* [[TMP40]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP50]], i8* align 8 [[TMP51]], i64 8, i1 false) #[[ATTR4]], !llvm.access.group [[ACC_GRP15]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[TMP50]], i8* align 8 [[TMP51]], i64 8, i1 false), !llvm.access.group [[ACC_GRP15]]
// CHECK1-NEXT: store i32 33, i32* [[TMP44]], align 4, !llvm.access.group [[ACC_GRP15]]
// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
// CHECK1-NEXT: [[ADD7_I:%.*]] = add nsw i32 [[TMP52]], 1
@@ -492,7 +492,7 @@ void loop() {
// CHECK1: .omp.lastprivate.then.i:
// CHECK1-NEXT: [[TMP55:%.*]] = bitcast %struct.S* [[TMP27]] to i8*
// CHECK1-NEXT: [[TMP56:%.*]] = bitcast %struct.S* [[TMP40]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 8, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP55]], i8* align 8 [[TMP56]], i64 8, i1 false)
// CHECK1-NEXT: [[TMP57:%.*]] = load i32, i32* [[TMP41]], align 4
// CHECK1-NEXT: store i32 [[TMP57]], i32* [[TMP29]], align 4
// CHECK1-NEXT: [[ARRAY_BEGIN_I:%.*]] = getelementptr inbounds [2 x %struct.S], [2 x %struct.S]* [[TMP31]], i32 0, i32 0
@@ -504,7 +504,7 @@ void loop() {
// CHECK1-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST_I:%.*]] = phi %struct.S* [ [[ARRAY_BEGIN_I]], [[DOTOMP_LASTPRIVATE_THEN_I]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT_I:%.*]], [[OMP_ARRAYCPY_BODY_I]] ]
// CHECK1-NEXT: [[TMP60:%.*]] = bitcast %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]] to i8*
// CHECK1-NEXT: [[TMP61:%.*]] = bitcast %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP60]], i8* align 8 [[TMP61]], i64 8, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP60]], i8* align 8 [[TMP61]], i64 8, i1 false)
// CHECK1-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT_I]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT_I]] = getelementptr [[STRUCT_S]], %struct.S* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_DONE_I:%.*]] = icmp eq %struct.S* [[OMP_ARRAYCPY_DEST_ELEMENT_I]], [[TMP59]]
@@ -512,7 +512,7 @@ void loop() {
// CHECK1: omp.arraycpy.done8.i:
// CHECK1-NEXT: [[TMP62:%.*]] = bitcast [2 x i32]* [[TMP33]] to i8*
// CHECK1-NEXT: [[TMP63:%.*]] = bitcast [2 x i32]* [[TMP43]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 8, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP62]], i8* align 4 [[TMP63]], i64 8, i1 false)
// CHECK1-NEXT: [[TMP64:%.*]] = load i32, i32* [[TMP44]], align 4
// CHECK1-NEXT: store i32 [[TMP64]], i32* [[TMP39]], align 4
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__1_EXIT]]
@@ -891,7 +891,7 @@ void loop() {
// CHECK1-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP40]], i64 0, i64 0
// CHECK1-NEXT: [[TMP47:%.*]] = bitcast %struct.S.0* [[ARRAYIDX5_I]] to i8*
// CHECK1-NEXT: [[TMP48:%.*]] = bitcast %struct.S.0* [[TMP41]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 4, i1 false) #[[ATTR4]], !llvm.access.group [[ACC_GRP33]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP47]], i8* align 4 [[TMP48]], i64 4, i1 false), !llvm.access.group [[ACC_GRP33]]
// CHECK1-NEXT: [[TMP49:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !32, !llvm.access.group [[ACC_GRP33]]
// CHECK1-NEXT: [[ADD6_I:%.*]] = add nsw i32 [[TMP49]], 1
// CHECK1-NEXT: store i32 [[ADD6_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !32, !llvm.access.group [[ACC_GRP33]]
@@ -905,7 +905,7 @@ void loop() {
// CHECK1-NEXT: store i32 [[TMP52]], i32* [[TMP27]], align 128
// CHECK1-NEXT: [[TMP53:%.*]] = bitcast [2 x i32]* [[TMP29]] to i8*
// CHECK1-NEXT: [[TMP54:%.*]] = bitcast [2 x i32]* [[TMP39]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP53]], i8* align 4 [[TMP54]], i64 8, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP53]], i8* align 4 [[TMP54]], i64 8, i1 false)
// CHECK1-NEXT: [[ARRAY_BEGIN_I:%.*]] = getelementptr inbounds [2 x %struct.S.0], [2 x %struct.S.0]* [[TMP31]], i32 0, i32 0
// CHECK1-NEXT: [[TMP55:%.*]] = bitcast [2 x %struct.S.0]* [[TMP40]] to %struct.S.0*
// CHECK1-NEXT: [[TMP56:%.*]] = getelementptr [[STRUCT_S_0:%.*]], %struct.S.0* [[ARRAY_BEGIN_I]], i64 2
@@ -915,7 +915,7 @@ void loop() {
// CHECK1-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST_I:%.*]] = phi %struct.S.0* [ [[ARRAY_BEGIN_I]], [[DOTOMP_LASTPRIVATE_THEN_I]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT_I:%.*]], [[OMP_ARRAYCPY_BODY_I]] ]
// CHECK1-NEXT: [[TMP57:%.*]] = bitcast %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]] to i8*
// CHECK1-NEXT: [[TMP58:%.*]] = bitcast %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 4, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP57]], i8* align 4 [[TMP58]], i64 4, i1 false)
// CHECK1-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT_I]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_DESTELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT_I]] = getelementptr [[STRUCT_S_0]], %struct.S.0* [[OMP_ARRAYCPY_SRCELEMENTPAST_I]], i32 1
// CHECK1-NEXT: [[OMP_ARRAYCPY_DONE_I:%.*]] = icmp eq %struct.S.0* [[OMP_ARRAYCPY_DEST_ELEMENT_I]], [[TMP56]]
@@ -923,7 +923,7 @@ void loop() {
// CHECK1: omp.arraycpy.done7.i:
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast %struct.S.0* [[TMP35]] to i8*
// CHECK1-NEXT: [[TMP60:%.*]] = bitcast %struct.S.0* [[TMP41]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 4, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP59]], i8* align 4 [[TMP60]], i64 4, i1 false)
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT]]
// CHECK1: .omp_outlined..3.exit:
// CHECK1-NEXT: ret i32 0
@@ -1195,7 +1195,7 @@ void loop() {
// CHECK3-NEXT: store double* [[TMP30]], double** [[TMP36]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
// CHECK3-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[CLASS_ANON_0]], %class.anon.0* [[REF_TMP_I]], i32 0, i32 1
// CHECK3-NEXT: store i32* [[TMP31]], i32** [[TMP37]], align 8, !noalias !14, !llvm.access.group [[ACC_GRP15]]
// CHECK3-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 8 dereferenceable(16) [[REF_TMP_I]]) #[[ATTR3]], !llvm.access.group [[ACC_GRP15]]
// CHECK3-NEXT: call void @"_ZZZ4mainENK3$_0clEvENKUlvE_clEv"(%class.anon.0* noundef nonnull align 8 dereferenceable(16) [[REF_TMP_I]]), !llvm.access.group [[ACC_GRP15]]
// CHECK3-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]
// CHECK3-NEXT: [[ADD3_I:%.*]] = add nsw i32 [[TMP38]], 1
// CHECK3-NEXT: store i32 [[ADD3_I]], i32* [[DOTOMP_IV_I]], align 4, !noalias !14, !llvm.access.group [[ACC_GRP15]]

@@ -419,7 +419,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -442,7 +442,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@@ -461,7 +461,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -484,7 +484,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@@ -466,7 +466,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -489,7 +489,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@@ -612,7 +612,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP18]], align 8
// CHECK1-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP20]] to i8*
// CHECK1-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP22]], i8* [[TMP21]], i8* [[TMP23]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP22]], i8* [[TMP21]], i8* [[TMP23]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP24]] to i32*
// CHECK1-NEXT: [[TMP25:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 0
// CHECK1-NEXT: [[TMP26:%.*]] = load i32*, i32** [[TMP25]], align 8

@@ -641,7 +641,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP18]], align 8, !noalias !21
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !21
// CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -1783,7 +1783,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP18]], align 4, !noalias !22
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !22
// CHECK3-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK3-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l100.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:

@@ -1117,7 +1117,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP32]], align 8, !noalias !24
// CHECK1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 0, i64* [[TMP33]], align 8, !noalias !24
// CHECK1-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK1-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
// CHECK1-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -2847,7 +2847,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP32]], align 4, !noalias !25
// CHECK3-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 0, i64* [[TMP33]], align 8, !noalias !25
// CHECK3-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK3-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
// CHECK3-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:
@@ -6159,7 +6159,7 @@ int bar(int n){
// CHECK17-NEXT: store i8** null, i8*** [[TMP32]], align 8, !noalias !24
// CHECK17-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK17-NEXT: store i64 0, i64* [[TMP33]], align 8, !noalias !24
// CHECK17-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK17-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK17-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
// CHECK17-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK17: omp_offload.failed.i:
@@ -7889,7 +7889,7 @@ int bar(int n){
// CHECK19-NEXT: store i8** null, i8*** [[TMP32]], align 4, !noalias !25
// CHECK19-NEXT: [[TMP33:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK19-NEXT: store i64 0, i64* [[TMP33]], align 8, !noalias !25
// CHECK19-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK19-NEXT: [[TMP34:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l138.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK19-NEXT: [[TMP35:%.*]] = icmp ne i32 [[TMP34]], 0
// CHECK19-NEXT: br i1 [[TMP35]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK19: omp_offload.failed.i:

@@ -486,7 +486,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -509,7 +509,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@@ -757,7 +757,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP18]], align 8, !noalias !25
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !25
// CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK1-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK1-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -2438,7 +2438,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP18]], align 4, !noalias !26
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !26
// CHECK3-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK3-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK3-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:
@@ -4097,7 +4097,7 @@ int bar(int n){
// CHECK5-NEXT: store i8** null, i8*** [[TMP18]], align 8, !noalias !25
// CHECK5-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK5-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !25
// CHECK5-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK5-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK5-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK5-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK5: omp_offload.failed.i:
@@ -5890,7 +5890,7 @@ int bar(int n){
// CHECK7-NEXT: store i8** null, i8*** [[TMP18]], align 4, !noalias !26
// CHECK7-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK7-NEXT: store i64 0, i64* [[TMP19]], align 8, !noalias !26
// CHECK7-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK7-NEXT: [[TMP20:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 1, i32 0, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l96.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK7-NEXT: [[TMP21:%.*]] = icmp ne i32 [[TMP20]], 0
// CHECK7-NEXT: br i1 [[TMP21]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK7: omp_offload.failed.i:

@@ -432,7 +432,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -455,7 +455,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@@ -895,7 +895,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP34]], align 8, !noalias !24
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 0, i64* [[TMP35]], align 8, !noalias !24
// CHECK1-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR3]]
// CHECK1-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
// CHECK1-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -2441,7 +2441,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP34]], align 4, !noalias !25
// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 0, i64* [[TMP35]], align 8, !noalias !25
// CHECK3-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR3]]
// CHECK3-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB1]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l101.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
// CHECK3-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:

@@ -866,7 +866,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP34]], align 8, !noalias !21
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 10, i64* [[TMP35]], align 8, !noalias !21
// CHECK1-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR3]]
// CHECK1-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
// CHECK1-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -2684,7 +2684,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP34]], align 4, !noalias !22
// CHECK3-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 10, i64* [[TMP35]], align 8, !noalias !22
// CHECK3-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR3]]
// CHECK3-NEXT: [[TMP36:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 [[TMP26]], i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l103.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP37:%.*]] = icmp ne i32 [[TMP36]], 0
// CHECK3-NEXT: br i1 [[TMP37]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:

@@ -832,7 +832,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@@ -855,7 +855,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@@ -855,7 +855,7 @@ int bar(int n){
// CHECK1-NEXT: store i8** null, i8*** [[TMP33]], align 8, !noalias !26
// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK1-NEXT: store i64 10, i64* [[TMP34]], align 8, !noalias !26
// CHECK1-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK1-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK1-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0
// CHECK1-NEXT: br i1 [[TMP36]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK1: omp_offload.failed.i:
@@ -2695,7 +2695,7 @@ int bar(int n){
// CHECK3-NEXT: store i8** null, i8*** [[TMP33]], align 4, !noalias !27
// CHECK3-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK3-NEXT: store i64 10, i64* [[TMP34]], align 8, !noalias !27
// CHECK3-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK3-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK3-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0
// CHECK3-NEXT: br i1 [[TMP36]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK3: omp_offload.failed.i:
@@ -4515,7 +4515,7 @@ int bar(int n){
// CHECK5-NEXT: store i8** null, i8*** [[TMP33]], align 8, !noalias !26
// CHECK5-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK5-NEXT: store i64 10, i64* [[TMP34]], align 8, !noalias !26
// CHECK5-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK5-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK5-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0
// CHECK5-NEXT: br i1 [[TMP36]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK5: omp_offload.failed.i:
@@ -6430,7 +6430,7 @@ int bar(int n){
// CHECK7-NEXT: store i8** null, i8*** [[TMP33]], align 4, !noalias !27
// CHECK7-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT___TGT_KERNEL_ARGUMENTS]], %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i32 8
// CHECK7-NEXT: store i64 10, i64* [[TMP34]], align 8, !noalias !27
// CHECK7-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null) #[[ATTR4]]
// CHECK7-NEXT: [[TMP35:%.*]] = call i32 @__tgt_target_kernel_nowait(%struct.ident_t* @[[GLOB2]], i64 -1, i32 [[TMP25]], i32 1, i8* @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l97.region_id, %struct.__tgt_kernel_arguments* [[KERNEL_ARGS_I]], i32 0, i8* null, i32 0, i8* null)
// CHECK7-NEXT: [[TMP36:%.*]] = icmp ne i32 [[TMP35]], 0
// CHECK7-NEXT: br i1 [[TMP36]], label [[OMP_OFFLOAD_FAILED_I:%.*]], label [[DOTOMP_OUTLINED__1_EXIT:%.*]]
// CHECK7: omp_offload.failed.i:

@@ -658,13 +658,13 @@ void test_omp_all_memory()
// CHECK1-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
// CHECK1-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK1: .untied.jmp.1.i:
// CHECK1-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK1-NEXT: store i32 1, i32* @a, align 4, !noalias !32
// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK1-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK1-NEXT: br label [[CLEANUP_I]]
// CHECK1: cleanup.i:
@@ -721,7 +721,7 @@ void test_omp_all_memory()
// CHECK1-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !42
// CHECK1-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK1: .untied.jmp.1.i:
// CHECK1-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -781,7 +781,7 @@ void test_omp_all_memory()
// CHECK1-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK1-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !52
// CHECK1-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK1: .untied.jmp.1.i:
// CHECK1-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -1099,26 +1099,26 @@ void test_omp_all_memory()
// CHECK1-NEXT: store i32 1, i32* [[TMP22]], align 4
// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT:%.*]]
// CHECK1: .untied.jmp.2.i:
// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]])
// CHECK1-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
// CHECK1-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*))
// CHECK1-NEXT: [[DOTS2__ADDR_I:%.*]] = bitcast i8* [[DOTS2__VOID_ADDR_I]] to %struct.S*
// CHECK1-NEXT: store %struct.S* [[DOTS2__ADDR_I]], %struct.S** [[TMP18]], align 8
// CHECK1-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: store i32 2, i32* [[TMP27]], align 4
// CHECK1-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP29:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK1: .untied.jmp.3.i:
// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]])
// CHECK1-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK1-NEXT: store i32 0, i32* [[A_I]], align 4
// CHECK1-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*)) #[[ATTR4]]
// CHECK1-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*))
// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP32]] to %struct.kmp_task_t_with_privates.18*
// CHECK1-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18:%.*]], %struct.kmp_task_t_with_privates.18* [[TMP33]], i32 0, i32 0
// CHECK1-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18]], %struct.kmp_task_t_with_privates.18* [[TMP33]], i32 0, i32 2
@@ -1126,43 +1126,43 @@ void test_omp_all_memory()
// CHECK1-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK1-NEXT: store i32 [[TMP37]], i32* [[TMP36]], align 128
// CHECK1-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]])
// CHECK1-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: store i32 3, i32* [[TMP40]], align 4
// CHECK1-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK1: .untied.jmp.5.i:
// CHECK1-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0) #[[ATTR4]]
// CHECK1-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0)
// CHECK1-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: store i32 4, i32* [[TMP46]], align 4
// CHECK1-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP48:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK1: .untied.jmp.7.i:
// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK1-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK1-NEXT: [[TMP50:%.*]] = bitcast %struct.S* [[TMP17]] to i8*
// CHECK1-NEXT: [[TMP51:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false) #[[ATTR4]]
// CHECK1-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false)
// CHECK1-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK1-NEXT: [[A9_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK1-NEXT: store i32 10, i32* [[A9_I]], align 4
// CHECK1-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]])
// CHECK1-NEXT: [[TMP54:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: store i32 5, i32* [[TMP54]], align 4
// CHECK1-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP56:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK1-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]])
// CHECK1-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK1: .untied.jmp.10.i:
// CHECK1-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
// CHECK1-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK1-NEXT: [[TMP59:%.*]] = bitcast %struct.S* [[TMP19]] to i8*
// CHECK1-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
// CHECK1-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*))
// CHECK1-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
// CHECK1-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !112
// CHECK1-NEXT: br label [[CLEANUP_I]]
@@ -1850,13 +1850,13 @@ void test_omp_all_memory()
// CHECK2-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
// CHECK2-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK2: .untied.jmp.1.i:
// CHECK2-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK2-NEXT: store i32 1, i32* @a, align 4, !noalias !32
// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK2-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK2-NEXT: br label [[CLEANUP_I]]
// CHECK2: cleanup.i:
@@ -1913,7 +1913,7 @@ void test_omp_all_memory()
// CHECK2-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !42
// CHECK2-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK2: .untied.jmp.1.i:
// CHECK2-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -1973,7 +1973,7 @@ void test_omp_all_memory()
// CHECK2-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !52
// CHECK2-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK2: .untied.jmp.1.i:
// CHECK2-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -2291,26 +2291,26 @@ void test_omp_all_memory()
// CHECK2-NEXT: store i32 1, i32* [[TMP22]], align 4
// CHECK2-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT:%.*]]
// CHECK2: .untied.jmp.2.i:
// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]])
// CHECK2-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
// CHECK2-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*))
// CHECK2-NEXT: [[DOTS2__ADDR_I:%.*]] = bitcast i8* [[DOTS2__VOID_ADDR_I]] to %struct.S*
// CHECK2-NEXT: store %struct.S* [[DOTS2__ADDR_I]], %struct.S** [[TMP18]], align 8
// CHECK2-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: store i32 2, i32* [[TMP27]], align 4
// CHECK2-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP29:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK2: .untied.jmp.3.i:
// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]])
// CHECK2-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK2-NEXT: store i32 0, i32* [[A_I]], align 4
// CHECK2-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*)) #[[ATTR4]]
// CHECK2-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*))
// CHECK2-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP32]] to %struct.kmp_task_t_with_privates.18*
// CHECK2-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18:%.*]], %struct.kmp_task_t_with_privates.18* [[TMP33]], i32 0, i32 0
// CHECK2-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18]], %struct.kmp_task_t_with_privates.18* [[TMP33]], i32 0, i32 2
@@ -2318,43 +2318,43 @@ void test_omp_all_memory()
// CHECK2-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK2-NEXT: store i32 [[TMP37]], i32* [[TMP36]], align 128
// CHECK2-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]])
// CHECK2-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: store i32 3, i32* [[TMP40]], align 4
// CHECK2-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK2: .untied.jmp.5.i:
// CHECK2-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0) #[[ATTR4]]
// CHECK2-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0)
// CHECK2-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: store i32 4, i32* [[TMP46]], align 4
// CHECK2-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP48:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK2: .untied.jmp.7.i:
// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK2-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK2-NEXT: [[TMP50:%.*]] = bitcast %struct.S* [[TMP17]] to i8*
// CHECK2-NEXT: [[TMP51:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false) #[[ATTR4]]
// CHECK2-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false)
// CHECK2-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK2-NEXT: [[A9_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK2-NEXT: store i32 10, i32* [[A9_I]], align 4
// CHECK2-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]])
// CHECK2-NEXT: [[TMP54:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: store i32 5, i32* [[TMP54]], align 4
// CHECK2-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP56:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK2-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]])
// CHECK2-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK2: .untied.jmp.10.i:
// CHECK2-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
// CHECK2-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !112
// CHECK2-NEXT: [[TMP59:%.*]] = bitcast %struct.S* [[TMP19]] to i8*
// CHECK2-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
// CHECK2-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*))
// CHECK2-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
// CHECK2-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !112
// CHECK2-NEXT: br label [[CLEANUP_I]]
@@ -3092,13 +3092,13 @@ void test_omp_all_memory()
// CHECK2-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-51-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
// CHECK2-51-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK2-51: .untied.jmp.1.i:
// CHECK2-51-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !32
// CHECK2-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK2-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK2-51-NEXT: store i32 1, i32* @a, align 4, !noalias !32
// CHECK2-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK2-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP17]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK2-51-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK2-51-NEXT: br label [[CLEANUP_I]]
// CHECK2-51: cleanup.i:
@@ -3155,7 +3155,7 @@ void test_omp_all_memory()
// CHECK2-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-51-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !42
// CHECK2-51-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK2-51: .untied.jmp.1.i:
// CHECK2-51-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -3215,7 +3215,7 @@ void test_omp_all_memory()
// CHECK2-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK2-51-NEXT: [[TMP14:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !52
// CHECK2-51-NEXT: [[TMP15:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP14]], i8* [[TMP15]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK2-51: .untied.jmp.1.i:
// CHECK2-51-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -3569,26 +3569,26 @@ void test_omp_all_memory()
// CHECK2-51-NEXT: store i32 1, i32* [[TMP22]], align 4
// CHECK2-51-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP24:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP25:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* [[TMP24]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT:%.*]]
// CHECK2-51: .untied.jmp.2.i:
// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]])
// CHECK2-51-NEXT: [[TMP26:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
// CHECK2-51-NEXT: [[DOTS2__VOID_ADDR_I:%.*]] = call i8* @__kmpc_alloc(i32 [[TMP26]], i64 4, i8* inttoptr (i64 7 to i8*))
// CHECK2-51-NEXT: [[DOTS2__ADDR_I:%.*]] = bitcast i8* [[DOTS2__VOID_ADDR_I]] to %struct.S*
// CHECK2-51-NEXT: store %struct.S* [[DOTS2__ADDR_I]], %struct.S** [[TMP18]], align 8
// CHECK2-51-NEXT: [[TMP27:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: store i32 2, i32* [[TMP27]], align 4
// CHECK2-51-NEXT: [[TMP28:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP29:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP28]], i8* [[TMP29]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK2-51: .untied.jmp.3.i:
// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]])
// CHECK2-51-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK2-51-NEXT: store i32 0, i32* [[A_I]], align 4
// CHECK2-51-NEXT: [[TMP31:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.20*)* @.omp_task_entry..21 to i32 (i32, i8*)*)) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP32:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[TMP31]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.20*)* @.omp_task_entry..21 to i32 (i32, i8*)*))
// CHECK2-51-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP32]] to %struct.kmp_task_t_with_privates.20*
// CHECK2-51-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_20:%.*]], %struct.kmp_task_t_with_privates.20* [[TMP33]], i32 0, i32 0
// CHECK2-51-NEXT: [[TMP35:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_20]], %struct.kmp_task_t_with_privates.20* [[TMP33]], i32 0, i32 2
@@ -3596,43 +3596,43 @@ void test_omp_all_memory()
// CHECK2-51-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK2-51-NEXT: store i32 [[TMP37]], i32* [[TMP36]], align 128
// CHECK2-51-NEXT: [[TMP38:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP39:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP38]], i8* [[TMP32]])
// CHECK2-51-NEXT: [[TMP40:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: store i32 3, i32* [[TMP40]], align 4
// CHECK2-51-NEXT: [[TMP41:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP41]], i8* [[TMP42]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK2-51: .untied.jmp.5.i:
// CHECK2-51-NEXT: [[TMP44:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP45:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[TMP44]], i32 0)
// CHECK2-51-NEXT: [[TMP46:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: store i32 4, i32* [[TMP46]], align 4
// CHECK2-51-NEXT: [[TMP47:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP48:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP49:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP47]], i8* [[TMP48]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK2-51: .untied.jmp.7.i:
// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK2-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK2-51-NEXT: [[TMP50:%.*]] = bitcast %struct.S* [[TMP17]] to i8*
// CHECK2-51-NEXT: [[TMP51:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
// CHECK2-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false) #[[ATTR4]]
// CHECK2-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP50]], i8* align 4 [[TMP51]], i64 4, i1 false)
// CHECK2-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[A9_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP19]], i32 0, i32 0
// CHECK2-51-NEXT: store i32 10, i32* [[A9_I]], align 4
// CHECK2-51-NEXT: [[TMP52:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP53:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[TMP52]])
// CHECK2-51-NEXT: [[TMP54:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: store i32 5, i32* [[TMP54]], align 4
// CHECK2-51-NEXT: [[TMP55:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP56:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
// CHECK2-51-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP57:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[TMP55]], i8* [[TMP56]])
// CHECK2-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK2-51: .untied.jmp.10.i:
// CHECK2-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP19]]) #[[ATTR4]]
// CHECK2-51-NEXT: [[TMP58:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !122
// CHECK2-51-NEXT: [[TMP59:%.*]] = bitcast %struct.S* [[TMP19]] to i8*
// CHECK2-51-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*)) #[[ATTR4]]
// CHECK2-51-NEXT: call void @__kmpc_free(i32 [[TMP58]], i8* [[TMP59]], i8* inttoptr (i64 7 to i8*))
// CHECK2-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[TMP17]]) #[[ATTR4]]
// CHECK2-51-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !122
// CHECK2-51-NEXT: br label [[CLEANUP_I]]
@@ -4776,15 +4776,15 @@ void test_omp_all_memory()
// CHECK3: .untied.jmp..i:
// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !32
// CHECK3-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]]) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]])
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK3: .untied.jmp.1.i:
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK3-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK3-NEXT: store i32 1, i32* @a, align 4, !noalias !32
// CHECK3-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK3-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK3-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK3-NEXT: br label [[CLEANUP_I]]
// CHECK3: cleanup.i:
@@ -4839,9 +4839,9 @@ void test_omp_all_memory()
// CHECK3: .untied.jmp..i:
// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !42
// CHECK3-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]]) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]])
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK3: .untied.jmp.1.i:
// CHECK3-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -4899,9 +4899,9 @@ void test_omp_all_memory()
// CHECK3: .untied.jmp..i:
// CHECK3-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !52
// CHECK3-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]]) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]])
// CHECK3-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK3: .untied.jmp.1.i:
// CHECK3-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -5217,55 +5217,55 @@ void test_omp_all_memory()
// CHECK3: .untied.jmp..i:
// CHECK3-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: store i32 1, i32* [[TMP21]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK3-NEXT: [[TMP22:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT:%.*]]
// CHECK3: .untied.jmp.2.i:
// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]]) #[[ATTR4]]
// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]])
// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]])
// CHECK3-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK3-NEXT: store i32 0, i32* [[A_I]], align 4, !noalias !112
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*)) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]])
// CHECK3-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*))
// CHECK3-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to %struct.kmp_task_t_with_privates.18*
// CHECK3-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18:%.*]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 0
// CHECK3-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 2
// CHECK3-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP27]], i32 0, i32 0
// CHECK3-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK3-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 128
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]]) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK3-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]])
// CHECK3-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: store i32 2, i32* [[TMP31]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK3-NEXT: [[TMP32:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK3: .untied.jmp.6.i:
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0)
// CHECK3-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: store i32 3, i32* [[TMP35]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK3-NEXT: [[TMP36:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK3: .untied.jmp.10.i:
// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK3-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK3-NEXT: [[TMP38:%.*]] = bitcast %struct.S* [[S1_I]] to i8*
// CHECK3-NEXT: [[TMP39:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false) #[[ATTR4]], !noalias !112
// CHECK3-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false), !noalias !112
// CHECK3-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK3-NEXT: [[A12_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK3-NEXT: store i32 10, i32* [[A12_I]], align 4, !noalias !112
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]]) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]])
// CHECK3-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: store i32 4, i32* [[TMP41]], align 4
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK3-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK3-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK3-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]]) #[[ATTR4]]
// CHECK3-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]])
// CHECK3-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK3: .untied.jmp.15.i:
// CHECK3-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
@@ -5855,15 +5855,15 @@ void test_omp_all_memory()
// CHECK4: .untied.jmp..i:
// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !32
// CHECK4-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]]) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]])
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK4: .untied.jmp.1.i:
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK4-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK4-NEXT: store i32 1, i32* @a, align 4, !noalias !32
// CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK4-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK4-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK4-NEXT: br label [[CLEANUP_I]]
// CHECK4: cleanup.i:
@@ -5918,9 +5918,9 @@ void test_omp_all_memory()
// CHECK4: .untied.jmp..i:
// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !42
// CHECK4-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]]) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]])
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK4: .untied.jmp.1.i:
// CHECK4-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@@ -5978,9 +5978,9 @@ void test_omp_all_memory()
// CHECK4: .untied.jmp..i:
// CHECK4-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !52
// CHECK4-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]]) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]])
// CHECK4-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK4: .untied.jmp.1.i:
// CHECK4-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@@ -6296,55 +6296,55 @@ void test_omp_all_memory()
// CHECK4: .untied.jmp..i:
// CHECK4-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: store i32 1, i32* [[TMP21]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-NEXT: [[TMP22:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT:%.*]]
// CHECK4: .untied.jmp.2.i:
// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]]) #[[ATTR4]]
// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]])
// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]])
// CHECK4-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK4-NEXT: store i32 0, i32* [[A_I]], align 4, !noalias !112
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*)) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]])
// CHECK4-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*))
// CHECK4-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to %struct.kmp_task_t_with_privates.18*
// CHECK4-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18:%.*]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 0
// CHECK4-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 2
// CHECK4-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP27]], i32 0, i32 0
// CHECK4-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK4-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 128
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]]) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK4-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]])
// CHECK4-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: store i32 2, i32* [[TMP31]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-NEXT: [[TMP32:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4: .untied.jmp.6.i:
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0)
// CHECK4-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: store i32 3, i32* [[TMP35]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-NEXT: [[TMP36:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4: .untied.jmp.10.i:
// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK4-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK4-NEXT: [[TMP38:%.*]] = bitcast %struct.S* [[S1_I]] to i8*
// CHECK4-NEXT: [[TMP39:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false) #[[ATTR4]], !noalias !112
// CHECK4-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false), !noalias !112
// CHECK4-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK4-NEXT: [[A12_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK4-NEXT: store i32 10, i32* [[A12_I]], align 4, !noalias !112
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]]) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]])
// CHECK4-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: store i32 4, i32* [[TMP41]], align 4
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK4-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK4-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]]) #[[ATTR4]]
// CHECK4-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]])
// CHECK4-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4: .untied.jmp.15.i:
// CHECK4-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
@@ -6986,15 +6986,15 @@ void test_omp_all_memory()
// CHECK3-51: .untied.jmp..i:
// CHECK3-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !32
// CHECK3-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]])
// CHECK3-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK3-51: .untied.jmp.1.i:
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK3-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK3-51-NEXT: store i32 1, i32* @a, align 4, !noalias !32
// CHECK3-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK3-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK3-51-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK3-51-NEXT: br label [[CLEANUP_I]]
// CHECK3-51: cleanup.i:
@ -7049,9 +7049,9 @@ void test_omp_all_memory()
// CHECK3-51: .untied.jmp..i:
// CHECK3-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !42
// CHECK3-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]])
// CHECK3-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK3-51: .untied.jmp.1.i:
// CHECK3-51-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@ -7109,9 +7109,9 @@ void test_omp_all_memory()
// CHECK3-51: .untied.jmp..i:
// CHECK3-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !52
// CHECK3-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]])
// CHECK3-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK3-51: .untied.jmp.1.i:
// CHECK3-51-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@ -7463,55 +7463,55 @@ void test_omp_all_memory()
// CHECK3-51: .untied.jmp..i:
// CHECK3-51-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: store i32 1, i32* [[TMP21]], align 4
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK3-51-NEXT: [[TMP22:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT:%.*]]
// CHECK3-51: .untied.jmp.2.i:
// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]]) #[[ATTR4]]
// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]])
// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]])
// CHECK3-51-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK3-51-NEXT: store i32 0, i32* [[A_I]], align 4, !noalias !122
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB25:[0-9]+]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.20*)* @.omp_task_entry..21 to i32 (i32, i8*)*)) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB25:[0-9]+]])
// CHECK3-51-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.20*)* @.omp_task_entry..21 to i32 (i32, i8*)*))
// CHECK3-51-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to %struct.kmp_task_t_with_privates.20*
// CHECK3-51-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_20:%.*]], %struct.kmp_task_t_with_privates.20* [[TMP25]], i32 0, i32 0
// CHECK3-51-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_20]], %struct.kmp_task_t_with_privates.20* [[TMP25]], i32 0, i32 2
// CHECK3-51-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP27]], i32 0, i32 0
// CHECK3-51-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK3-51-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 128
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB25]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB25]])
// CHECK3-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]])
// CHECK3-51-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: store i32 2, i32* [[TMP31]], align 4
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK3-51-NEXT: [[TMP32:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK3-51: .untied.jmp.6.i:
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-51-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0)
// CHECK3-51-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: store i32 3, i32* [[TMP35]], align 4
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK3-51-NEXT: [[TMP36:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK3-51: .untied.jmp.10.i:
// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK3-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK3-51-NEXT: [[TMP38:%.*]] = bitcast %struct.S* [[S1_I]] to i8*
// CHECK3-51-NEXT: [[TMP39:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
// CHECK3-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false) #[[ATTR4]], !noalias !122
// CHECK3-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false), !noalias !122
// CHECK3-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[A12_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK3-51-NEXT: store i32 10, i32* [[A12_I]], align 4, !noalias !122
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-51-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]])
// CHECK3-51-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: store i32 4, i32* [[TMP41]], align 4
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK3-51-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !122
// CHECK3-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]]) #[[ATTR4]]
// CHECK3-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]])
// CHECK3-51-NEXT: br label [[DOTOMP_OUTLINED__19_EXIT]]
// CHECK3-51: .untied.jmp.15.i:
// CHECK3-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
@ -8549,15 +8549,15 @@ void test_omp_all_memory()
// CHECK4-51: .untied.jmp..i:
// CHECK4-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !32
// CHECK4-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB7]])
// CHECK4-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !32
// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__3_EXIT:%.*]]
// CHECK4-51: .untied.jmp.1.i:
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK4-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM2_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-51-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK4-51-NEXT: store i32 1, i32* @a, align 4, !noalias !32
// CHECK4-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var) #[[ATTR4]]
// CHECK4-51-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM2_I]], [8 x i32]* @.gomp_critical_user_.var)
// CHECK4-51-NEXT: store i32 0, i32* [[CLEANUP_DEST_SLOT_I]], align 4, !noalias !32
// CHECK4-51-NEXT: br label [[CLEANUP_I]]
// CHECK4-51: cleanup.i:
@ -8612,9 +8612,9 @@ void test_omp_all_memory()
// CHECK4-51: .untied.jmp..i:
// CHECK4-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !42
// CHECK4-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB9]])
// CHECK4-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !42
// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__5_EXIT:%.*]]
// CHECK4-51: .untied.jmp.1.i:
// CHECK4-51-NEXT: store i32 1, i32* @a, align 4, !noalias !42
@ -8672,9 +8672,9 @@ void test_omp_all_memory()
// CHECK4-51: .untied.jmp..i:
// CHECK4-51-NEXT: [[TMP13:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !52
// CHECK4-51-NEXT: store i32 1, i32* [[TMP13]], align 4
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB11]])
// CHECK4-51-NEXT: [[TMP14:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !52
// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP15:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP14]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__7_EXIT:%.*]]
// CHECK4-51: .untied.jmp.1.i:
// CHECK4-51-NEXT: store i32 1, i32* @a, align 4, !noalias !52
@ -8990,55 +8990,55 @@ void test_omp_all_memory()
// CHECK4-51: .untied.jmp..i:
// CHECK4-51-NEXT: [[TMP21:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: store i32 1, i32* [[TMP21]], align 4
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-51-NEXT: [[TMP22:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM_I]], i8* [[TMP22]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT:%.*]]
// CHECK4-51: .untied.jmp.2.i:
// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]]) #[[ATTR4]]
// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]
// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S1_I]])
// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]])
// CHECK4-51-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK4-51-NEXT: store i32 0, i32* [[A_I]], align 4, !noalias !112
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*)) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM3_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23:[0-9]+]])
// CHECK4-51-NEXT: [[TMP24:%.*]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM3_I]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %struct.kmp_task_t_with_privates.18*)* @.omp_task_entry..19 to i32 (i32, i8*)*))
// CHECK4-51-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to %struct.kmp_task_t_with_privates.18*
// CHECK4-51-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18:%.*]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 0
// CHECK4-51-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES_18]], %struct.kmp_task_t_with_privates.18* [[TMP25]], i32 0, i32 2
// CHECK4-51-NEXT: [[TMP28:%.*]] = getelementptr inbounds [[STRUCT__KMP_PRIVATES_T:%.*]], %struct..kmp_privates.t* [[TMP27]], i32 0, i32 0
// CHECK4-51-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP16]], align 128
// CHECK4-51-NEXT: store i32 [[TMP29]], i32* [[TMP28]], align 128
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM4_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB23]])
// CHECK4-51-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM4_I]], i8* [[TMP24]])
// CHECK4-51-NEXT: [[TMP31:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: store i32 2, i32* [[TMP31]], align 4
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM5_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-51-NEXT: [[TMP32:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP33:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM5_I]], i8* [[TMP32]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4-51: .untied.jmp.6.i:
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM8_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-51-NEXT: [[TMP34:%.*]] = call i32 @__kmpc_omp_taskyield(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM8_I]], i32 0)
// CHECK4-51-NEXT: [[TMP35:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: store i32 3, i32* [[TMP35]], align 4
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM9_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-51-NEXT: [[TMP36:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP37:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM9_I]], i8* [[TMP36]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4-51: .untied.jmp.10.i:
// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK4-51-NEXT: call void @_ZN1SC1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]])
// CHECK4-51-NEXT: [[TMP38:%.*]] = bitcast %struct.S* [[S1_I]] to i8*
// CHECK4-51-NEXT: [[TMP39:%.*]] = bitcast %struct.S* [[REF_TMP_I]] to i8*
// CHECK4-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false) #[[ATTR4]], !noalias !112
// CHECK4-51-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP38]], i8* align 4 [[TMP39]], i64 4, i1 false), !noalias !112
// CHECK4-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[REF_TMP_I]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[A12_I:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[S2_I]], i32 0, i32 0
// CHECK4-51-NEXT: store i32 10, i32* [[A12_I]], align 4, !noalias !112
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM13_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK4-51-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_omp_taskwait(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM13_I]])
// CHECK4-51-NEXT: [[TMP41:%.*]] = load i32*, i32** [[DOTPART_ID__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: store i32 4, i32* [[TMP41]], align 4
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[OMP_GLOBAL_THREAD_NUM14_I:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB21]])
// CHECK4-51-NEXT: [[TMP42:%.*]] = load i8*, i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !112
// CHECK4-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]]) #[[ATTR4]]
// CHECK4-51-NEXT: [[TMP43:%.*]] = call i32 @__kmpc_omp_task(%struct.ident_t* @[[GLOB1]], i32 [[OMP_GLOBAL_THREAD_NUM14_I]], i8* [[TMP42]])
// CHECK4-51-NEXT: br label [[DOTOMP_OUTLINED__17_EXIT]]
// CHECK4-51: .untied.jmp.15.i:
// CHECK4-51-NEXT: call void @_ZN1SD1Ev(%struct.S* noundef nonnull align 4 dereferenceable(4) [[S2_I]]) #[[ATTR4]]

@ -145,7 +145,7 @@ int main() {
// CHECK1-NEXT: store i8* [[TMP9]], i8** [[DOTTASK_T__ADDR_I]], align 8, !noalias !12
// CHECK1-NEXT: store %struct.anon* [[TMP8]], %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !12
// CHECK1-NEXT: [[TMP10:%.*]] = load %struct.anon*, %struct.anon** [[__CONTEXT_ADDR_I]], align 8, !noalias !12
// CHECK1-NEXT: call void @_Z9gtid_testv() #[[ATTR3]]
// CHECK1-NEXT: call void @_Z9gtid_testv()
// CHECK1-NEXT: ret i32 0
//
//

@ -612,18 +612,18 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP18]], align 8
// CHECK1-NEXT: [[TMP23:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP21]] to i8*
// CHECK1-NEXT: [[TMP25:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP23]], i8* [[TMP22]], i8* [[TMP24]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP25:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP23]], i8* [[TMP22]], i8* [[TMP24]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP25]] to i32*
// CHECK1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP27:%.*]] = load i16*, i16** [[TMP26]], align 8
// CHECK1-NEXT: [[TMP28:%.*]] = mul nuw i64 [[TMP14]], 2
// CHECK1-NEXT: [[TMP29:%.*]] = udiv exact i64 [[TMP28]], ptrtoint (i16* getelementptr (i16, i16* null, i32 1) to i64)
// CHECK1-NEXT: [[TMP30:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* bitcast (i64* @{{reduction_size[.].+[.]}}) #[[ATTR3]]
// CHECK1-NEXT: [[TMP30:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP23]], i8* bitcast (i64* @{{reduction_size[.].+[.]}})
// CHECK1-NEXT: [[TMP31:%.*]] = bitcast i8* [[TMP30]] to i64*
// CHECK1-NEXT: store i64 [[TMP29]], i64* [[TMP31]], align 8
// CHECK1-NEXT: [[TMP32:%.*]] = load i8*, i8** [[TMP19]], align 8
// CHECK1-NEXT: [[TMP33:%.*]] = bitcast i16* [[TMP27]] to i8*
// CHECK1-NEXT: [[TMP34:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP23]], i8* [[TMP32]], i8* [[TMP33]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP34:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP23]], i8* [[TMP32]], i8* [[TMP33]])
// CHECK1-NEXT: [[CONV2_I:%.*]] = bitcast i8* [[TMP34]] to i16*
// CHECK1-NEXT: [[TMP35:%.*]] = load i32, i32* [[CONV_I]], align 4
// CHECK1-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[TMP35]] to i64
@ -672,7 +672,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP12:%.*]] = load i32*, i32** [[TMP11]], align 8
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !24
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP12]] to i8*
// CHECK1-NEXT: [[TMP15:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP13]], i8* null, i8* [[TMP14]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP15:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP13]], i8* null, i8* [[TMP14]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP15]] to i32*
// CHECK1-NEXT: [[TMP16:%.*]] = load i32, i32* [[CONV_I]], align 4
// CHECK1-NEXT: [[INC_I:%.*]] = add nsw i32 [[TMP16]], 1

@ -624,18 +624,18 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP32:%.*]] = load i8*, i8** [[TMP28]], align 8
// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP31]] to i8*
// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP35]] to i32*
// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP22]], i32 0, i32 2
// CHECK1-NEXT: [[TMP37:%.*]] = load i16*, i16** [[TMP36]], align 8
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP24]], 2
// CHECK1-NEXT: [[TMP39:%.*]] = udiv exact i64 [[TMP38]], ptrtoint (i16* getelementptr (i16, i16* null, i32 1) to i64)
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}}) #[[ATTR3]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}})
// CHECK1-NEXT: [[TMP41:%.*]] = bitcast i8* [[TMP40]] to i64*
// CHECK1-NEXT: store i64 [[TMP39]], i64* [[TMP41]], align 8
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP29]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = bitcast i16* [[TMP37]] to i8*
// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]])
// CHECK1-NEXT: [[CONV2_I:%.*]] = bitcast i8* [[TMP44]] to i16*
// CHECK1-NEXT: [[TMP45:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[CONV3_I:%.*]] = trunc i64 [[TMP45]] to i32

@ -624,18 +624,18 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP32:%.*]] = load i8*, i8** [[TMP28]], align 8
// CHECK1-NEXT: [[TMP33:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !14
// CHECK1-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP31]] to i8*
// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP35:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP32]], i8* [[TMP34]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP35]] to i32*
// CHECK1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP22]], i32 0, i32 2
// CHECK1-NEXT: [[TMP37:%.*]] = load i16*, i16** [[TMP36]], align 8
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP24]], 2
// CHECK1-NEXT: [[TMP39:%.*]] = udiv exact i64 [[TMP38]], ptrtoint (i16* getelementptr (i16, i16* null, i32 1) to i64)
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}}) #[[ATTR3]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_threadprivate_cached(%struct.ident_t* @[[GLOB1]], i32 [[TMP33]], i8* bitcast (i64* @{{reduction_size[.].+[.]}})
// CHECK1-NEXT: [[TMP41:%.*]] = bitcast i8* [[TMP40]] to i64*
// CHECK1-NEXT: store i64 [[TMP39]], i64* [[TMP41]], align 8
// CHECK1-NEXT: [[TMP42:%.*]] = load i8*, i8** [[TMP29]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = bitcast i16* [[TMP37]] to i8*
// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]]) #[[ATTR3]]
// CHECK1-NEXT: [[TMP44:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP33]], i8* [[TMP42]], i8* [[TMP43]])
// CHECK1-NEXT: [[CONV2_I:%.*]] = bitcast i8* [[TMP44]] to i16*
// CHECK1-NEXT: [[TMP45:%.*]] = load i64, i64* [[DOTLB__ADDR_I]], align 8, !noalias !14
// CHECK1-NEXT: [[CONV3_I:%.*]] = trunc i64 [[TMP45]] to i32

@ -838,7 +838,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP19:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[DOTGLOBAL_TID__ADDR_I]], align 4, !noalias !12
// CHECK1-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP18]] to i8*
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP22:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP19]], i8* [[TMP21]])
// CHECK1-NEXT: [[CONV_I:%.*]] = bitcast i8* [[TMP22]] to i32*
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP24:%.*]] = load i8**, i8*** [[TMP23]], align 8
@ -861,7 +861,7 @@ int main(int argc, char **argv) {
// CHECK1-NEXT: [[TMP38:%.*]] = mul nuw i64 [[TMP37]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK1-NEXT: store i64 [[TMP37]], i64* @{{reduction_size[.].+[.]}}, align 8, !noalias !12
// CHECK1-NEXT: [[TMP39:%.*]] = load i8*, i8** [[TMP16]], align 8
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]]) #[[ATTR5]]
// CHECK1-NEXT: [[TMP40:%.*]] = call i8* @__kmpc_task_reduction_get_th_data(i32 [[TMP20]], i8* [[TMP39]], i8* [[TMP25]])
// CHECK1-NEXT: [[TMP41:%.*]] = getelementptr inbounds [[STRUCT_ANON]], %struct.anon* [[TMP12]], i32 0, i32 2
// CHECK1-NEXT: [[TMP42:%.*]] = load i8**, i8*** [[TMP41]], align 8
// CHECK1-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8

@ -2194,9 +2194,11 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
CI->setTailCallKind(ChildTCK);
InlinedMustTailCalls |= CI->isMustTailCall();
// Calls inlined through a 'nounwind' call site should be marked
// 'nounwind'.
if (MarkNoUnwind)
// Call sites inlined through a 'nounwind' call site should be
// 'nounwind' as well. However, avoid marking call sites explicitly
// where possible. This helps expose more opportunities for CSE after
// inlining, commonly when the callee is an intrinsic.
if (MarkNoUnwind && !CI->doesNotThrow())
CI->setDoesNotThrow();
}
}
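
The guarded call to setDoesNotThrow above is the entire functional change: an inlined call now only receives an explicit attribute when it is not already known not to throw. A minimal IR sketch of the effect (the function @f, the value %x, and attribute group #0 are illustrative, not taken from the tests): suppose a nounwind function containing an @llvm.smax.i32 call is inlined into @f, which already contains an identical call. Before this change, the result looked like

  declare i32 @llvm.smax.i32(i32, i32)

  define i32 @f(i32 %x) {
    %pre = call i32 @llvm.smax.i32(i32 %x, i32 0)       ; call that already existed in @f
    %inl = call i32 @llvm.smax.i32(i32 %x, i32 0) #0    ; inlined copy, explicitly marked
    %sum = add i32 %pre, %inl
    ret i32 %sum
  }

  attributes #0 = { nounwind }

CSE sees two distinct instructions here because their attribute sets do not match. Since @llvm.smax.i32 is itself nounwind, the new check leaves the inlined copy unannotated, the two calls become identical, and CSE can replace %inl with %pre. This is also why the #[[ATTR...]] suffixes disappear from the regenerated CHECK lines in the test hunks above.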

@ -53,7 +53,7 @@ define void @test(i32* %array) {
; CHECK-NEXT: store i32* [[ARRAY:%.*]], i32** [[TMP0]], align 8
; CHECK-NEXT: [[LOAD_I:%.*]] = load i32, i32* [[ARRAY]], align 4
; CHECK-NEXT: [[LOAD_POS_I:%.*]] = icmp sgt i32 [[LOAD_I]], 0
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[LOAD_I]], i32 0) #[[ATTR0:[0-9]+]]
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[LOAD_I]], i32 0)
; CHECK-NEXT: call void @print(i32 [[TMP1]])
; CHECK-NEXT: [[CONT_CAST:%.*]] = select i1 [[LOAD_POS_I]], void (i8*, i1)* @f.resume.0, void (i8*, i1)* @f.resume.1
; CHECK-NEXT: call void [[CONT_CAST]](i8* nonnull [[DOTSUB]], i1 zeroext false)

@ -34,11 +34,11 @@ define void @foo(i8* nocapture %a, i8* nocapture readonly %c, i8* nocapture %b)
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A:%.*]], i8* align 16 [[B:%.*]], i64 16, i1 false) #[[ATTR6:[0-9]+]], !noalias !3
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C:%.*]], i64 16, i1 false) #[[ATTR6]], !noalias !0
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR6]], !alias.scope !5
; CHECK-NEXT: call void @hey() #[[ATTR6]], !noalias !5
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR6]], !noalias !0
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A:%.*]], i8* align 16 [[B:%.*]], i64 16, i1 false), !noalias !3
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C:%.*]], i64 16, i1 false), !noalias !0
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false), !alias.scope !5
; CHECK-NEXT: call void @hey(), !noalias !5
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false), !noalias !0
; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: ret void
;
@ -75,11 +75,11 @@ define void @foo_cs(i8* nocapture %a, i8* nocapture readonly %c, i8* nocapture %
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A:%.*]], i8* align 16 [[B:%.*]], i64 16, i1 false) #[[ATTR6]], !noalias !9
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C:%.*]], i64 16, i1 false) #[[ATTR6]], !noalias !6
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR6]], !alias.scope !11
; CHECK-NEXT: call void @hey() #[[ATTR6]], !noalias !11
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR6]], !noalias !6
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A:%.*]], i8* align 16 [[B:%.*]], i64 16, i1 false), !noalias !9
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C:%.*]], i64 16, i1 false), !noalias !6
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false), !alias.scope !11
; CHECK-NEXT: call void @hey(), !noalias !11
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false), !noalias !6
; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: ret void
;

@ -37,11 +37,11 @@ define void @foo(i8* nocapture %a, i8* nocapture readonly %c, i8* nocapture %b)
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[B]], i64 16, i1 false) #[[ATTR3]], !noalias !3
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !noalias !0
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !alias.scope !5
; CHECK-NEXT: call void @hey() #[[ATTR3]], !noalias !5
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !noalias !0
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[B]], i64 16, i1 false), !noalias !3
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C]], i64 16, i1 false), !noalias !0
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false), !alias.scope !5
; CHECK-NEXT: call void @hey(), !noalias !5
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false), !noalias !0
; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: ret void
;
@ -80,11 +80,11 @@ define void @foo_cs(i8* nocapture %a, i8* nocapture readonly %c, i8* nocapture %
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[B]], i64 16, i1 false) #[[ATTR3]], !noalias !9
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !noalias !6
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !alias.scope !11
; CHECK-NEXT: call void @hey() #[[ATTR3]], !noalias !11
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false) #[[ATTR3]], !noalias !6
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[B]], i64 16, i1 false), !noalias !9
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[B]], i8* align 16 [[C]], i64 16, i1 false), !noalias !6
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A]], i8* align 16 [[C]], i64 16, i1 false), !alias.scope !11
; CHECK-NEXT: call void @hey(), !noalias !11
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[L_I]], i8* align 16 [[C]], i64 16, i1 false), !noalias !6
; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 512, i8* [[L_I]])
; CHECK-NEXT: ret void
;

@ -65,8 +65,8 @@ define void @caller(float* nocapture %a, float* nocapture %b, float** nocapture
; CHECK-LABEL: @caller(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[C:%.*]] = load float*, float** [[C_PTR:%.*]], align 8, !alias.scope !6
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]]) #[[ATTR2:[0-9]+]], !noalias !6
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]) #[[ATTR2]], !noalias !6
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META9:![0-9]+]]), !noalias !6
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META12:![0-9]+]]), !noalias !6
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[C]], align 4, !noalias !14
; CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 5
; CHECK-NEXT: store float [[TMP0]], float* [[ARRAYIDX_I_I]], align 4, !alias.scope !9, !noalias !15
@ -75,8 +75,8 @@ define void @caller(float* nocapture %a, float* nocapture %b, float** nocapture
; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[C]], align 4, !noalias !6
; CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds float, float* [[A]], i64 7
; CHECK-NEXT: store float [[TMP1]], float* [[ARRAYIDX_I]], align 4, !noalias !6
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META17:![0-9]+]]) #[[ATTR2]], !alias.scope !6
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]) #[[ATTR2]], !alias.scope !6
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META17:![0-9]+]]), !alias.scope !6
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META20:![0-9]+]]), !alias.scope !6
; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[A]], align 4, !alias.scope !6, !noalias !22
; CHECK-NEXT: [[ARRAYIDX_I_I1:%.*]] = getelementptr inbounds float, float* [[B]], i64 5
; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX_I_I1]], align 4, !alias.scope !23, !noalias !20

@ -71,8 +71,8 @@ define void @foo2(float* nocapture %a, float* nocapture %b, float* nocapture rea
; CHECK-NEXT: entry:
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META5:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]]) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]]) #[[ATTR2]]
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]])
; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[C]], align 4, !alias.scope !15, !noalias !16
; CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds float, float* [[A]], i64 5
; CHECK-NEXT: store float [[TMP0]], float* [[ARRAYIDX_I_I]], align 4, !alias.scope !16, !noalias !15

@ -65,7 +65,7 @@ define void @arm_mult_q15(i16* %pSrcA, i16* %pSrcB, i16 * noalias %pDst, i32 %bl
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP11]] to i32
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]]
; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], 15
; CHECK-NEXT: [[TMP12:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767) #[[ATTR2:[0-9]+]]
; CHECK-NEXT: [[TMP12:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767)
; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[TMP12]] to i16
; CHECK-NEXT: [[INCDEC_PTR4]] = getelementptr inbounds i16, i16* [[PDST_ADDR_05]], i32 1
; CHECK-NEXT: store i16 [[CONV3]], i16* [[PDST_ADDR_05]], align 2

@ -120,7 +120,7 @@ define dso_local zeroext i1 @is_not_empty_variant3(%struct.node* %p) {
; O1-NEXT: [[SIZE_06_I:%.*]] = phi i64 [ [[INC_I:%.*]], [[WHILE_BODY_I]] ], [ 0, [[ENTRY:%.*]] ]
; O1-NEXT: [[P_ADDR_05_I:%.*]] = phi %struct.node* [ [[TMP0:%.*]], [[WHILE_BODY_I]] ], [ [[P]], [[ENTRY]] ]
; O1-NEXT: [[CMP_I:%.*]] = icmp ne i64 [[SIZE_06_I]], -1
; O1-NEXT: call void @llvm.assume(i1 [[CMP_I]]) #[[ATTR3:[0-9]+]]
; O1-NEXT: call void @llvm.assume(i1 [[CMP_I]])
; O1-NEXT: [[NEXT_I:%.*]] = getelementptr inbounds [[STRUCT_NODE:%.*]], %struct.node* [[P_ADDR_05_I]], i64 0, i32 0
; O1-NEXT: [[TMP0]] = load %struct.node*, %struct.node** [[NEXT_I]], align 8
; O1-NEXT: [[INC_I]] = add i64 [[SIZE_06_I]], 1