From d6f4be97e656b30a3257a900b58c78c3138aa706 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 21 Aug 2017 05:29:02 +0000
Subject: [PATCH] [AVX-512] Don't change which instructions we use for
 unmasked subvector broadcasts when AVX512DQ is enabled.

There's no functional difference between the AVX512DQ instructions and their
AVX512F equivalents if we're not masking.

This change unifies test checks and removes extra isel entries. Something
similar was done for subvector inserts and extracts recently.

llvm-svn: 311308
---
 llvm/lib/Target/X86/X86InstrAVX512.td         | 104 +++++-------
 .../test/CodeGen/X86/avx512-vbroadcasti128.ll |  44 ++---
 .../test/CodeGen/X86/avx512-vbroadcasti256.ll |  44 ++---
 llvm/test/CodeGen/X86/subvector-broadcast.ll  | 160 ++++--------
 4 files changed, 99 insertions(+), 253 deletions(-)
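As a concrete illustration of the change (a minimal sketch; the function name
is hypothetical, but the IR has the same shape as the tests updated below), an
unmasked 128-bit to 512-bit subvector broadcast such as:

    define <8 x i64> @bcast(<2 x i64>* %p) {
      %v = load <2 x i64>, <2 x i64>* %p
      %b = shufflevector <2 x i64> %v, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
      ret <8 x i64> %b
    }

now selects vbroadcasti32x4 on both AVX512F and AVX512DQ targets, where a DQ
target previously selected the bit-identical vbroadcasti64x2. Masked
broadcasts still select the DQ instructions, through the new
avx512_subvec_broadcast_rm_dq multiclass introduced below.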
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 6e15cf4fa465..8ec7a4e08df1 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1115,6 +1115,19 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
                            AVX5128IBase, EVEX;
 }
 
+// This should be used for the AVX512DQ broadcast instructions. It disables
+// the unmasked patterns so that we only use the DQ instructions when masking
+// is requested.
+multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
+                                         X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
+  defm rm : AVX512_maskable_split<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+                           (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+                           (null_frag),
+                           (_Dst.VT (X86SubVBroadcast
+                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+                           AVX5128IBase, EVEX;
+}
+
 let Predicates = [HasAVX512] in {
 // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
 def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))),
@@ -1159,6 +1172,10 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
                                           EVEX_V512, EVEX_CD8<64, CD8VT4>;
 
 let Predicates = [HasAVX512] in {
+def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
+          (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
 def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
           (VBROADCASTI64X4rm addr:$src)>;
 def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
@@ -1169,9 +1186,15 @@ def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
 def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
           (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
                            (v4f64 VR256X:$src), 1)>;
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
 def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
           (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
                            (v4i64 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
 def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
           (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
                            (v16i16 VR256X:$src), 1)>;
@@ -1179,6 +1202,10 @@ def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
           (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
                            (v32i8 VR256X:$src), 1)>;
 
+def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+          (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+          (VBROADCASTI32X4rm addr:$src)>;
 def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4rm addr:$src)>;
 def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
@@ -1193,6 +1220,10 @@ defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
                            v8f32x_info, v4f32x_info>,
                            EVEX_V256, EVEX_CD8<32, CD8VT4>;
 
+def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+          (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+          (VBROADCASTI32X4Z256rm addr:$src)>;
 def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
 def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
@@ -1200,9 +1231,15 @@ def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
 
 // Provide fallback in case the load node that is used in the patterns above
 // is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+          (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2f64 VR128X:$src), 1)>;
 def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
           (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
                               (v4f32 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+          (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2i64 VR128X:$src), 1)>;
 def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
           (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
                               (v4i32 VR128X:$src), 1)>;
@@ -1215,82 +1252,27 @@ def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
 }
 
 let Predicates = [HasVLX, HasDQI] in {
-defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
                            v4i64x_info, v2i64x_info>, VEX_W,
                            EVEX_V256, EVEX_CD8<64, CD8VT2>;
-defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
                            v4f64x_info, v2f64x_info>, VEX_W,
                            EVEX_V256, EVEX_CD8<64, CD8VT2>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
-          (VINSERTF64x2Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
-                              (v2f64 VR128X:$src), 1)>;
-def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
-          (VINSERTI64x2Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
-                              (v2i64 VR128X:$src), 1)>;
-}
-
-let Predicates = [HasVLX, NoDQI] in {
-def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
-          (VBROADCASTF32X4Z256rm addr:$src)>;
-def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
-          (VBROADCASTI32X4Z256rm addr:$src)>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
-          (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
-                              (v2f64 VR128X:$src), 1)>;
-def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
-          (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
-                              (v2i64 VR128X:$src), 1)>;
-}
-
-let Predicates = [HasAVX512, NoDQI] in {
-def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
-          (VBROADCASTF32X4rm addr:$src)>;
-def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
-          (VBROADCASTI32X4rm addr:$src)>;
-
-def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
-          (VBROADCASTF64X4rm addr:$src)>;
-def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
-          (VBROADCASTI64X4rm addr:$src)>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
-          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
-                           (v8f32 VR256X:$src), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
-          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
-                           (v8i32 VR256X:$src), 1)>;
 }
 
 let Predicates = [HasDQI] in {
-defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
                        v8i64_info, v2i64x_info>, VEX_W,
                        EVEX_V512, EVEX_CD8<64, CD8VT2>;
-defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8",
+defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8",
                        v16i32_info, v8i32x_info>,
                        EVEX_V512, EVEX_CD8<32, CD8VT8>;
-defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
                        v8f64_info, v2f64x_info>, VEX_W,
                        EVEX_V512, EVEX_CD8<64, CD8VT2>;
-defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
+defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
                        v16f32_info, v8f32x_info>,
                        EVEX_V512, EVEX_CD8<32, CD8VT8>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
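The fallback patterns referenced by the comments above matter when the loaded
subvector has a second user, which prevents folding the load into the
broadcast. A minimal sketch of that situation (hypothetical IR, not part of
the test changes in this patch):

    define <4 x i64> @bcast_reuse(<2 x i64>* %p, <2 x i64>* %q) {
      %v = load <2 x i64>, <2 x i64>* %p
      store <2 x i64> %v, <2 x i64>* %q   ; second user of the load
      %b = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
      ret <4 x i64> %b
    }

Here the load stays a separate instruction and the v4i64 fallback pattern
builds the broadcast from the register with vinserti32x4; since no masking is
involved, there is again no reason to prefer the DQ vinserti64x2 form.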
-def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
-          (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
-                           (v8f32 VR256X:$src), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
-          (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
-                           (v8i32 VR256X:$src), 1)>;
 }
 
 multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll
index c5f7ec60700a..a88e25f62100 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll
@@ -84,23 +84,11 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
 ;
 
 define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512VL-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512VL-NEXT:    retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BWVL-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512BWVL-NEXT:    retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQVL-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512DQVL-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  %3 = fadd <8 x double> %2,
@@ -108,23 +96,11 @@ define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
 }
 
 define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512VL-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512VL-NEXT:    retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BWVL-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512BWVL-NEXT:    retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQVL-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512DQVL-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  %3 = add <8 x i64> %2,
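All three run configurations in this file now produce identical code for
these 512-bit tests, so the per-configuration check prefixes collapse into a
single shared one. The RUN lines themselves sit above these hunks and are not
shown; presumably they gain a common prefix along these lines (hypothetical
flags, shown only to illustrate the FileCheck mechanism):

    ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl \
    ; RUN:   | FileCheck %s --check-prefixes=X64-AVX512,X64-AVX512VL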
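The same unification applies to these 256-bit-to-512-bit broadcasts, and it
is worth spelling out why the two forms are interchangeable when unmasked:
the old check lines describe the same 512-bit result, merely decoded at
different element granularity (this is the shuffle-comment notation the
checks above match):

    vbroadcastf64x4: zmm0 = mem[0,1,2,3,0,1,2,3]                    four 64-bit elements, repeated
    vbroadcastf32x8: zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]    the same bytes as eight 32-bit elements

Only a mask makes the element width observable, which is why the DQ forms are
now reserved for masked selection.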
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index 4ddc79efb4d7..2756e42573c4 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -38,23 +38,11 @@ define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
 ; X32-AVX-NEXT:    vmovdqa %ymm0, %ymm1
 ; X32-AVX-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: test_broadcast_2f64_8f64:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_2f64_8f64:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_2f64_8f64:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: test_broadcast_2f64_8f64:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_2f64_8f64:
 ; X64-AVX:       # BB#0:
@@ -62,20 +50,10 @@ define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
 ; X64-AVX-NEXT:    vmovdqa %ymm0, %ymm1
 ; X64-AVX-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %2
@@ -152,23 +130,11 @@ define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
 ; X32-AVX2-NEXT:    vmovaps %ymm0, %ymm1
 ; X32-AVX2-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: test_broadcast_2i64_8i64:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX1-LABEL: test_broadcast_2i64_8i64:
 ; X64-AVX1:       # BB#0:
@@ -182,20 +148,10 @@ define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
 ; X64-AVX2-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX2-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i64> %2
@@ -283,23 +239,11 @@ define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
 ; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X32-AVX-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: test_broadcast_8f32_16f32:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_8f32_16f32:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: test_broadcast_8f32_16f32:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX:       # BB#0:
@@ -307,20 +251,10 @@ define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
 ; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %2
@@ -403,23 +337,11 @@ define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
 ; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X32-AVX-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: test_broadcast_8i32_16i32:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_8i32_16i32:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: test_broadcast_8i32_16i32:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX:       # BB#0:
@@ -427,20 +349,10 @@ define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
 ; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i32> %2
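For contrast with the unmasked tests above, a masked variant of the last test
is the case the new avx512_subvec_broadcast_rm_dq patterns serve (a
hypothetical sketch; the function name is invented, and masking is expressed
with a select, one common way to model it in IR):

    define <16 x i32> @bcast_masked(<8 x i32>* %p, <16 x i32> %passthru, i16 %m) {
      %v = load <8 x i32>, <8 x i32>* %p
      %b = shufflevector <8 x i32> %v, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
      %k = bitcast i16 %m to <16 x i1>
      %r = select <16 x i1> %k, <16 x i32> %b, <16 x i32> %passthru
      ret <16 x i32> %r
    }

On an AVX512DQ target this should still select the DQ instruction with its
mask operand, e.g. vbroadcasti32x8 (%rdi), %zmm0 {%k1}, because only the 32x8
form masks at 32-bit element granularity; a masked vbroadcasti64x4 would
merge whole 64-bit lanes.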