diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 6e15cf4fa465..8ec7a4e08df1 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -1115,6 +1115,19 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
                            AVX5128IBase, EVEX;
 }
 
+// This should be used for the AVX512DQ broadcast instructions. It disables
+// the unmasked patterns so that we only use the DQ instructions when masking
+// is requested.
+multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
+                                         X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
+  defm rm : AVX512_maskable_split<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+                           (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+                           (null_frag),
+                           (_Dst.VT (X86SubVBroadcast
+                             (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
+                           AVX5128IBase, EVEX;
+}
+
 let Predicates = [HasAVX512] in {
 // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
 def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))),
@@ -1159,6 +1172,10 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
                        EVEX_V512, EVEX_CD8<64, CD8VT4>;
 
 let Predicates = [HasAVX512] in {
+def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
+          (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
 def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
           (VBROADCASTI64X4rm addr:$src)>;
 def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
@@ -1169,9 +1186,15 @@ def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
 def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
           (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
                            (v4f64 VR256X:$src), 1)>;
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
 def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
           (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
                            (v4i64 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
 def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
           (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
                            (v16i16 VR256X:$src), 1)>;
@@ -1179,6 +1202,10 @@ def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
           (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
                            (v32i8 VR256X:$src), 1)>;
 
+def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+          (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+          (VBROADCASTI32X4rm addr:$src)>;
 def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4rm addr:$src)>;
 def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
@@ -1193,6 +1220,10 @@ defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
                            v8f32x_info, v4f32x_info>,
                            EVEX_V256, EVEX_CD8<32, CD8VT4>;
 
+def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+          (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+          (VBROADCASTI32X4Z256rm addr:$src)>;
 def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4Z256rm addr:$src)>;
 def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
@@ -1200,9 +1231,15 @@ def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
 
 // Provide fallback in case the load node that is used in the patterns above
 // is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+          (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2f64 VR128X:$src), 1)>;
 def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
           (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
                               (v4f32 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+          (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                              (v2i64 VR128X:$src), 1)>;
 def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
           (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
                               (v4i32 VR128X:$src), 1)>;
@@ -1215,82 +1252,27 @@ def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
 }
 
 let Predicates = [HasVLX, HasDQI] in {
-defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
                            v4i64x_info, v2i64x_info>, VEX_W,
                            EVEX_V256, EVEX_CD8<64, CD8VT2>;
-defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
                            v4f64x_info, v2f64x_info>, VEX_W,
                            EVEX_V256, EVEX_CD8<64, CD8VT2>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
-          (VINSERTF64x2Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
-                              (v2f64 VR128X:$src), 1)>;
-def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
-          (VINSERTI64x2Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
-                              (v2i64 VR128X:$src), 1)>;
-}
-
-let Predicates = [HasVLX, NoDQI] in {
-def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
-          (VBROADCASTF32X4Z256rm addr:$src)>;
-def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
-          (VBROADCASTI32X4Z256rm addr:$src)>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
-          (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
-                              (v2f64 VR128X:$src), 1)>;
-def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
-          (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
-                              (v2i64 VR128X:$src), 1)>;
-}
-
-let Predicates = [HasAVX512, NoDQI] in {
-def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
-          (VBROADCASTF32X4rm addr:$src)>;
-def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
-          (VBROADCASTI32X4rm addr:$src)>;
-
-def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
-          (VBROADCASTF64X4rm addr:$src)>;
-def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
-          (VBROADCASTI64X4rm addr:$src)>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
-          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
-                           (v8f32 VR256X:$src), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
-          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
-                           (v8i32 VR256X:$src), 1)>;
 }
 
 let Predicates = [HasDQI] in {
-defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
                        v8i64_info, v2i64x_info>, VEX_W,
                        EVEX_V512, EVEX_CD8<64, CD8VT2>;
-defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8",
+defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8",
                        v16i32_info, v8i32x_info>,
                        EVEX_V512, EVEX_CD8<32, CD8VT8>;
-defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
                        v8f64_info, v2f64x_info>, VEX_W,
                        EVEX_V512, EVEX_CD8<64, CD8VT2>;
-defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
+defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
                        v16f32_info, v8f32x_info>,
                        EVEX_V512, EVEX_CD8<32, CD8VT8>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
-          (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
-                           (v8f32 VR256X:$src), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
-          (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
-                           (v8i32 VR256X:$src), 1)>;
 }
 
 multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll
index c5f7ec60700a..a88e25f62100 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti128.ll
@@ -84,23 +84,11 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
 ;
 
 define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512VL-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512VL-NEXT:    retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BWVL-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512BWVL-NEXT:    retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQVL-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512DQVL-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  %3 = fadd <8 x double> %2, <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>
@@ -108,23 +96,11 @@ define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
 }
 
 define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512VL-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512VL-NEXT:    retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BWVL-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512BWVL-NEXT:    retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQVL-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512DQVL-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  %3 = add <8 x i64> %2, <i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8>
diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll b/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll
index 55fbc34732e1..1896bc714c21 100644
--- a/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll
+++ b/llvm/test/CodeGen/X86/avx512-vbroadcasti256.ll
@@ -28,23 +28,11 @@ define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
 }
 
 define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512VL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512VL-NEXT:    retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BWVL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512BWVL-NEXT:    retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X64-AVX512DQVL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512DQVL-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = fadd <16 x float> %2, <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>
@@ -52,23 +40,11 @@ define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
 }
 
 define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
-; X64-AVX512VL-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512VL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512VL-NEXT:    retq
-;
-; X64-AVX512BWVL-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BWVL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512BWVL-NEXT:    retq
-;
-; X64-AVX512DQVL-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X64-AVX512DQVL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
-; X64-AVX512DQVL-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
+; X64-AVX512:       ## BB#0:
+; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
+; X64-AVX512-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = add <16 x i32> %2, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll
index 4ddc79efb4d7..2756e42573c4 100644
--- a/llvm/test/CodeGen/X86/subvector-broadcast.ll
+++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll
@@ -38,23 +38,11 @@ define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
 ; X32-AVX-NEXT:    vmovdqa %ymm0, %ymm1
 ; X32-AVX-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: test_broadcast_2f64_8f64:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_2f64_8f64:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_2f64_8f64:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: test_broadcast_2f64_8f64:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_2f64_8f64:
 ; X64-AVX:       # BB#0:
@@ -62,20 +50,10 @@ define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
 ; X64-AVX-NEXT:    vmovdqa %ymm0, %ymm1
 ; X64-AVX-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_2f64_8f64:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_2f64_8f64:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %2
@@ -152,23 +130,11 @@ define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
 ; X32-AVX2-NEXT:    vmovaps %ymm0, %ymm1
 ; X32-AVX2-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_2i64_8i64:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: test_broadcast_2i64_8i64:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX1-LABEL: test_broadcast_2i64_8i64:
 ; X64-AVX1:       # BB#0:
@@ -182,20 +148,10 @@ define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
 ; X64-AVX1-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX1-NEXT:    retq
 ;
 ; X64-AVX2-LABEL: test_broadcast_2i64_8i64:
 ; X64-AVX2:       # BB#0:
 ; X64-AVX2-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX2-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_2i64_8i64:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_2i64_8i64:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i64> %2
@@ -283,23 +239,11 @@ define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
 ; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X32-AVX-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: test_broadcast_8f32_16f32:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_8f32_16f32:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: test_broadcast_8f32_16f32:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX:       # BB#0:
@@ -307,20 +251,10 @@ define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
 ; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_8f32_16f32:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %2
@@ -403,23 +337,11 @@ define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
 ; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X32-AVX-NEXT:    retl
 ;
-; X32-AVX512F-LABEL: test_broadcast_8i32_16i32:
-; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512F-NEXT:    retl
-;
-; X32-AVX512BW-LABEL: test_broadcast_8i32_16i32:
-; X32-AVX512BW:       # BB#0:
-; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512BW-NEXT:    retl
-;
-; X32-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
-; X32-AVX512DQ:       # BB#0:
-; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X32-AVX512DQ-NEXT:    retl
+; X32-AVX512-LABEL: test_broadcast_8i32_16i32:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX:       # BB#0:
@@ -427,20 +349,10 @@ define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
 ; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
 ; X64-AVX-NEXT:    retq
 ;
-; X64-AVX512F-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512F:       # BB#0:
-; X64-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512F-NEXT:    retq
-;
-; X64-AVX512BW-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512BW:       # BB#0:
-; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512BW-NEXT:    retq
-;
-; X64-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
-; X64-AVX512DQ:       # BB#0:
-; X64-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
-; X64-AVX512DQ-NEXT:    retq
+; X64-AVX512-LABEL: test_broadcast_8i32_16i32:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i32> %2
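Note on the intent of the patch: the unmasked DQ broadcasts are now never selected (the tests above all switch to the shared AVX512F encodings), but the masked patterns generated by avx512_subvec_broadcast_rm_dq are kept. A minimal IR sketch of the case the DQ instructions are still expected to cover is below; the function name is hypothetical and not part of this patch, and it assumes compilation with something like llc -mtriple=x86_64-unknown-unknown -mattr=+avx512dq. Here the broadcast feeds a select on a k-register mask, which the masked vbroadcastf64x2 pattern should be able to fold into a single merge-masked broadcast instead of an unmasked vbroadcastf32x4 plus a separate blend.

define <8 x double> @masked_broadcast_2f64_8f64(<2 x double>* %p, <8 x double> %passthru, i8 %mask) {
  ; 128-bit subvector loaded once, then repeated into all four 128-bit lanes
  %v = load <2 x double>, <2 x double>* %p
  %b = shufflevector <2 x double> %v, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ; the mask/select is what makes the DQ instruction worth using: without it,
  ; the element-size-agnostic vbroadcastf32x4 form is equivalent and preferred
  %m = bitcast i8 %mask to <8 x i1>
  %r = select <8 x i1> %m, <8 x double> %b, <8 x double> %passthru
  ret <8 x double> %r
}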