From 51a4c6125ca6f25cff39c82a62878556b430d7f1 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 12 Mar 2020 23:07:06 -0700
Subject: [PATCH] [X86] Add test cases for failures to form vbroadcastw due to
 isTypeDesirableForOp preventing load shrinking to i16.

These are based on existing test cases but use i64 instead of i32.
Some of these end up with i64 zextload/extloads from i16 that we
don't have isel patterns for.

Some of the other cases fail because isTypeDesirableForOp prevents
shrinking the (trunc (i64 (srl (load)))) directly. So we try to
shrink based on the (i64 (srl (load))) but we need 64 - shift_amount
to be a power of 2 to do that shrink.
---
Review notes (text between "---" and the first "diff --git" is not part of
the commit):
 * Every added function places a scalar in lane 0 or 1 of a <2 x i64>
   vector, bitcasts it to <8 x i16>, and splats one i16 lane with
   shufflevector.
 * insert_dup_mem_* splat lane 0; insert_dup_elt{1,3,7}_* splat lane 1, 3
   and 7; the *_sext_i16_i64 variants sign-extend an i16 load to i64 and
   splat lane 0.
 * The elt7 variants insert the i64 at index 1, so lane 7 is the top 16 bits
   of the loaded value (the checks read it with movzwl 6(%rdi)).
 * The functions added to vector-shuffle-512-v32.ll return <32 x i16>
   although their names say v16i16.
 * NOTE(review): the author e-mail address is missing from the From: header.

 .../test/CodeGen/X86/vector-shuffle-128-v8.ll | 250 ++++++++++++++++++
 .../CodeGen/X86/vector-shuffle-256-v16.ll     | 209 +++++++++++++++
 .../CodeGen/X86/vector-shuffle-512-v32.ll     | 105 ++++++++
 3 files changed, 564 insertions(+)

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 2688bde878ee..a09deb9a527a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -3226,3 +3226,253 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(i16* %ptr) {
   %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
   ret <8 x i16> %tmp4
 }
+
+define <8 x i16> @insert_dup_mem_v8i16_i64(i64* %ptr) {
+; SSE-LABEL: insert_dup_mem_v8i16_i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: insert_dup_mem_v8i16_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    retq
+;
+; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i64:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpbroadcastw (%rdi), %xmm0
+; AVX2OR512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: insert_dup_mem_v8i16_i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: insert_dup_mem_v8i16_i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
+; XOPAVX2-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> zeroinitializer
+  ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @insert_dup_elt1_mem_v8i16_i64(i64* %ptr) {
+; SSE-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq (%rdi), %rax
+; AVX2-NEXT:    shrq $16, %rax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movq (%rdi), %rax
+; AVX512VL-NEXT:    shrq $16, %rax
+; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    movq (%rdi), %rax
+; XOPAVX2-NEXT:    shrq $16, %rax
+; XOPAVX2-NEXT:    vmovd %eax, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(i64* %ptr) {
+; SSE-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl 6(%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
+; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
+; XOPAVX2-NEXT:    vmovd %eax, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(i64* %ptr) {
+; SSE2-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; SSSE3:       # %bb.0:
+; SSSE3-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; AVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl 6(%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
+; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; XOPAVX1-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: insert_dup_elt7_mem_v8i16_i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
+; XOPAVX2-NEXT:    vmovd %eax, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(i16* %ptr) {
+; SSE-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movzwl (%rdi), %eax
+; SSE-NEXT:    movq %rax, %xmm0
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl (%rdi), %eax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl (%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movzwl (%rdi), %eax
+; AVX512VL-NEXT:    vpbroadcastw %eax, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    movzwl (%rdi), %eax
+; XOPAVX1-NEXT:    vmovq %rax, %xmm0
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    movzwl (%rdi), %eax
+; XOPAVX2-NEXT:    vmovd %eax, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; XOPAVX2-NEXT:    retq
+  %tmp = load i16, i16* %ptr, align 2
+  %tmp1 = sext i16 %tmp to i64
+  %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %tmp1, i32 0
+  %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
+  %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <8 x i32> zeroinitializer
+  ret <8 x i16> %tmp4
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index d92db2e15c58..103de84f0c38 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -7458,6 +7458,215 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
   ret <16 x i16> %tmp3
 }
 
+define <16 x i16> @insert_dup_mem_v16i16_i64(i64* %ptr) {
+; AVX1-LABEL: insert_dup_mem_v16i16_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2OR512VL-LABEL: insert_dup_mem_v16i16_i64:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpbroadcastw (%rdi), %ymm0
+; AVX2OR512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: insert_dup_mem_v16i16_i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: insert_dup_mem_v16i16_i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
+; XOPAVX2-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> zeroinitializer
+  ret <16 x i16> %tmp3
+}
+
+define <16 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
+; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq (%rdi), %rax
+; AVX2-NEXT:    shrq $16, %rax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movq (%rdi), %rax
+; AVX512VL-NEXT:    shrq $16, %rax
+; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,1,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    movq (%rdi), %rax
+; XOPAVX2-NEXT:    shrq $16, %rax
+; XOPAVX2-NEXT:    vmovd %eax, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <16 x i16> %tmp3
+}
+
+define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
+; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl 6(%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
+; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
+; XOPAVX2-NEXT:    vmovd %eax, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <16 x i16> %tmp3
+}
+
+define <16 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
+; AVX1-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl 6(%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movzwl 6(%rdi), %eax
+; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; XOPAVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    movzwl 6(%rdi), %eax
+; XOPAVX2-NEXT:    vmovd %eax, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  ret <16 x i16> %tmp3
+}
+
+define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
+; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl (%rdi), %eax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movzwl (%rdi), %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movzwl (%rdi), %eax
+; AVX512VL-NEXT:    vpbroadcastw %eax, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; XOPAVX1-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; XOPAVX1:       # %bb.0:
+; XOPAVX1-NEXT:    movzwl (%rdi), %eax
+; XOPAVX1-NEXT:    vmovq %rax, %xmm0
+; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
+; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOPAVX1-NEXT:    retq
+;
+; XOPAVX2-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; XOPAVX2:       # %bb.0:
+; XOPAVX2-NEXT:    movzwl (%rdi), %eax
+; XOPAVX2-NEXT:    vmovd %eax, %xmm0
+; XOPAVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; XOPAVX2-NEXT:    retq
+  %tmp = load i16, i16* %ptr, align 2
+  %tmp1 = sext i16 %tmp to i64
+  %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %tmp1, i32 0
+  %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
+  %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <16 x i32> zeroinitializer
+  ret <16 x i16> %tmp4
+}
+
 define <16 x i16> @unpckh_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; AVX1-LABEL: unpckh_v16i16:
 ; AVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index 54266b12864f..11085c945914 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -289,6 +289,111 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 {
   ret <32 x i16> %tmp3
 }
 
+define <32 x i16> @insert_dup_mem_v16i16_i64(i64* %ptr) {
+; KNL-LABEL: insert_dup_mem_v16i16_i64:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpbroadcastw (%rdi), %ymm0
+; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_dup_mem_v16i16_i64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpbroadcastw (%rdi), %zmm0
+; SKX-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> zeroinitializer
+  ret <32 x i16> %tmp3
+}
+
+define <32 x i16> @insert_dup_elt1_mem_v16i16_i64(i64* %ptr) {
+; KNL-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    movq (%rdi), %rax
+; KNL-NEXT:    shrq $16, %rax
+; KNL-NEXT:    vmovd %eax, %xmm0
+; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_dup_elt1_mem_v16i16_i64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    movq (%rdi), %rax
+; SKX-NEXT:    shrq $16, %rax
+; SKX-NEXT:    vpbroadcastw %eax, %zmm0
+; SKX-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <32 x i16> %tmp3
+}
+
+define <32 x i16> @insert_dup_elt3_mem_v16i16_i64(i64* %ptr) {
+; KNL-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    movzwl 6(%rdi), %eax
+; KNL-NEXT:    vmovd %eax, %xmm0
+; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_dup_elt3_mem_v16i16_i64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    movzwl 6(%rdi), %eax
+; SKX-NEXT:    vpbroadcastw %eax, %zmm0
+; SKX-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 0
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+  ret <32 x i16> %tmp3
+}
+
+define <32 x i16> @insert_dup_elt7_mem_v16i16_i64(i64* %ptr) {
+; KNL-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    movzwl 6(%rdi), %eax
+; KNL-NEXT:    vmovd %eax, %xmm0
+; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_dup_elt7_mem_v16i16_i64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    movzwl 6(%rdi), %eax
+; SKX-NEXT:    vpbroadcastw %eax, %zmm0
+; SKX-NEXT:    retq
+  %tmp = load i64, i64* %ptr, align 4
+  %tmp1 = insertelement <2 x i64> zeroinitializer, i64 %tmp, i32 1
+  %tmp2 = bitcast <2 x i64> %tmp1 to <8 x i16>
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <32 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  ret <32 x i16> %tmp3
+}
+
+define <32 x i16> @insert_dup_mem_v16i16_sext_i16_i64(i16* %ptr) {
+; KNL-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    movzwl (%rdi), %eax
+; KNL-NEXT:    vmovd %eax, %xmm0
+; KNL-NEXT:    vpbroadcastw %xmm0, %ymm0
+; KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: insert_dup_mem_v16i16_sext_i16_i64:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    movzwl (%rdi), %eax
+; SKX-NEXT:    vpbroadcastw %eax, %zmm0
+; SKX-NEXT:    retq
+  %tmp = load i16, i16* %ptr, align 2
+  %tmp1 = sext i16 %tmp to i64
+  %tmp2 = insertelement <2 x i64> zeroinitializer, i64 %tmp1, i32 0
+  %tmp3 = bitcast <2 x i64> %tmp2 to <8 x i16>
+  %tmp4 = shufflevector <8 x i16> %tmp3, <8 x i16> undef, <32 x i32> zeroinitializer
+  ret <32 x i16> %tmp4
+}
+
 define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) {
 ; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz:
 ; KNL:       ## %bb.0: