From f9f401dba179038591ad066bfa063cd5f3d5b4d8 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 31 Mar 2020 22:40:33 +0100 Subject: [PATCH] [X86][AVX] Add additional 256/512-bit test cases for PACKSS/PACKUS shuffle patterns Also add lowerShuffleWithPACK call to lowerV32I16Shuffle - shuffle combining was catching it but we avoid a lot of temporary shuffle creations if we catch it at lowering first. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 11 ++- .../CodeGen/X86/vector-shuffle-256-v16.ll | 96 +++++++++++++++++++ .../CodeGen/X86/vector-shuffle-512-v32.ll | 52 ++++++++++ .../CodeGen/X86/vector-shuffle-512-v64.ll | 88 +++++++++++++++++ .../X86/vector-shuffle-combining-avx512bw.ll | 30 ++++++ 5 files changed, 274 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 45cdfa9450d0..2230babc4dcd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -17216,6 +17216,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef Mask, if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) return V; + // Use dedicated pack instructions for masks that match their pattern. + if (SDValue V = + lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget)) + return V; + // Try to use shift instructions. if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -17237,13 +17242,13 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef Mask, // As this is a single-input shuffle, the repeated mask should be // a strictly valid v8i16 mask that we can pass through to the v8i16 // lowering to handle even the v32 case. 
- return lowerV8I16GeneralSingleInputShuffle( - DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG); + return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1, + RepeatedMask, Subtarget, DAG); } } if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, - Zeroable, Subtarget, DAG)) + Zeroable, Subtarget, DAG)) return Blend; if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2, diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 8c12c0d2e9fc..7db96f42b1cb 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -6914,6 +6914,102 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2 ret <16 x i16> %4 } +define <16 x i16> @shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30(<8 x i32> %a0, <8 x i32> %a1) { +; AVX1-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrad $25, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrad $25, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $25, %xmm1, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsrad $25, %xmm1, %xmm1 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpsrad $25, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpsrad $25, %ymm1, %ymm1 +; AVX2OR512VL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vpsrad $25, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsrad $25, %xmm1, %xmm3 
+; XOPAVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOPAVX1-NEXT: vpsrad $25, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i16_ashr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrad $25, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpsrad $25, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq + %1 = ashr <8 x i32> %a0, + %2 = ashr <8 x i32> %a1, + %3 = bitcast <8 x i32> %1 to <16 x i16> + %4 = bitcast <8 x i32> %2 to <16 x i16> + %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <16 x i32> + ret <16 x i16> %5 +} + +define <16 x i16> @shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30(<8 x i32> %a0, <8 x i32> %a1) { +; AVX1-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrld $25, %xmm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpsrld $25, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $25, %xmm1, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpsrld $25, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; AVX1-NEXT: retq +; +; AVX2OR512VL-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpsrld $25, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: vpsrld $25, %ymm1, %ymm1 +; AVX2OR512VL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX2OR512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vpsrld $25, %xmm0, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; XOPAVX1-NEXT: vpsrld $25, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsrld $25, %xmm1, %xmm3 +; XOPAVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 +; 
XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; XOPAVX1-NEXT: vpsrld $25, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i16_lshr_00_02_04_06_16_18_20_22_08_10_12_14_24_26_28_30: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpsrld $25, %ymm0, %ymm0 +; XOPAVX2-NEXT: vpsrld $25, %ymm1, %ymm1 +; XOPAVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; XOPAVX2-NEXT: retq + %1 = lshr <8 x i32> %a0, + %2 = lshr <8 x i32> %a1, + %3 = bitcast <8 x i32> %1 to <16 x i16> + %4 = bitcast <8 x i32> %2 to <16 x i16> + %5 = shufflevector <16 x i16> %3, <16 x i16> %4, <16 x i32> + ret <16 x i16> %5 +} + define <16 x i16> @shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13(<16 x i16> %a) { ; AVX1-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index 0e79116884f4..2601c7d4172d 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -216,6 +216,58 @@ define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a ret <32 x i16> %shuffle } +define <32 x i16> @shuffle_v32i16_ashr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; KNL-LABEL: shuffle_v32i16_ashr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62: +; KNL: ## %bb.0: +; KNL-NEXT: vpsrad $25, %zmm0, %zmm0 +; KNL-NEXT: vpsrad $25, %zmm1, %zmm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; KNL-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: 
shuffle_v32i16_ashr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62: +; SKX: ## %bb.0: +; SKX-NEXT: vpsrad $25, %zmm0, %zmm0 +; SKX-NEXT: vpsrad $25, %zmm1, %zmm1 +; SKX-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; SKX-NEXT: retq + %1 = ashr <16 x i32> %a0, + %2 = ashr <16 x i32> %a1, + %3 = bitcast <16 x i32> %1 to <32 x i16> + %4 = bitcast <16 x i32> %2 to <32 x i16> + %5 = shufflevector <32 x i16> %3, <32 x i16> %4, <32 x i32> + ret <32 x i16> %5 +} + +define <32 x i16> @shuffle_v32i16_lshr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; KNL-LABEL: shuffle_v32i16_lshr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62: +; KNL: ## %bb.0: +; KNL-NEXT: vpsrld $25, %zmm0, %zmm0 +; KNL-NEXT: vpsrld $25, %zmm1, %zmm1 +; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; KNL-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; KNL-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_lshr_00_02_04_06_32_34_36_38_08_10_12_14_40_42_44_46_16_18_20_22_48_50_52_54_24_26_28_30_56_58_60_62: +; SKX: ## %bb.0: +; SKX-NEXT: vpsrld $25, %zmm0, %zmm0 +; SKX-NEXT: vpsrld $25, %zmm1, %zmm1 +; SKX-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 +; SKX-NEXT: retq + %1 = lshr <16 x i32> %a0, + %2 = lshr <16 x i32> %a1, + %3 = bitcast <16 x i32> %1 to <32 x i16> + %4 = bitcast <16 x i32> %2 to <32 x i16> + %5 = shufflevector <32 x i16> %3, <32 x i16> %4, <32 x i32> + ret <32 x i16> %5 +} + define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { ; KNL-LABEL: insert_dup_mem_v32i16_i32: ; KNL: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 3afb54a9d3bb..3c95f4ce400e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ 
b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -546,6 +546,94 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_ ret <64 x i8> %shuffle } +define <64 x i8> @shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; AVX512F-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrad $25, %zmm0, %zmm0 +; AVX512F-NEXT: vpsrad $25, %zmm1, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpackssdw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrad $25, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrad $25, %zmm1, %zmm1 +; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsrad $25, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsrad $25, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpackssdw %ymm3, 
%ymm2, %ymm2 +; AVX512DQ-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v64i8_ashr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vpsrad $25, %zmm0, %zmm0 +; AVX512VBMI-NEXT: vpsrad $25, %zmm1, %zmm1 +; AVX512VBMI-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; AVX512VBMI-NEXT: retq + %1 = ashr <16 x i32> %a0, + %2 = ashr <16 x i32> %a1, + %3 = bitcast <16 x i32> %1 to <64 x i8> + %4 = bitcast <16 x i32> %2 to <64 x i8> + %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> + ret <64 x i8> %5 +} + +define <64 x i8> @shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125(<16 x i32> %a0, <16 x i32> %a1) nounwind { +; AVX512F-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrld $25, %zmm0, %zmm0 +; AVX512F-NEXT: vpsrld $25, %zmm1, %zmm1 +; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512F-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrld 
$25, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrld $25, %zmm1, %zmm1 +; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsrld $25, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpsrld $25, %zmm1, %zmm1 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; AVX512DQ-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 +; AVX512DQ-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v64i8_lshr_00_01_04_05_08_09_12_13_64_65_68_69_72_73_76_77_16_17_20_21_24_25_28_29_80_81_84_85_88_89_92_93_32_33_36_37_40_41_44_45_96_97_100_101_104_105_108_109_48_49_52_53_56_57_60_61_112_113_116_117_120_121_124_125: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vpsrld $25, %zmm0, %zmm0 +; AVX512VBMI-NEXT: vpsrld $25, %zmm1, %zmm1 +; AVX512VBMI-NEXT: vpackusdw %zmm1, %zmm0, %zmm0 +; AVX512VBMI-NEXT: retq + %1 = lshr <16 x i32> %a0, + %2 = lshr <16 x i32> %a1, + %3 = bitcast <16 x i32> %1 to <64 x i8> + %4 = bitcast <16 x i32> %2 to <64 x i8> + %5 = shufflevector <64 x i8> %3, <64 x i8> %4, <64 x i32> + ret <64 x i8> %5 +} + define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126(<32 x i16> %a0, <32 x i16> %a1) { ; AVX512F-LABEL: shuffle_v64i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62_64_66_68_70_72_74_76_78_80_82_84_86_88_90_92_94_96_98_100_102_104_106_108_110_112_114_116_118_120_122_124_126: ; AVX512F: # %bb.0: diff --git 
a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 08923cab6ebb..17781eb922a6 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -159,6 +159,36 @@ define <32 x i16> @combine_pshufb_as_pshufhw(<32 x i16> %a0) {
   ret <32 x i16> %1
 }
 
+define <32 x i16> @combine_vpermi2var_as_packssdw(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; CHECK-LABEL: combine_vpermi2var_as_packssdw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsrad $25, %zmm0, %zmm0
+; CHECK-NEXT:    vpsrad $25, %zmm1, %zmm1
+; CHECK-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %1 = ashr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+  %2 = ashr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+  %3 = bitcast <16 x i32> %1 to <32 x i16>
+  %4 = bitcast <16 x i32> %2 to <32 x i16>
+  %5 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %3, <32 x i16> <i16 0, i16 2, i16 4, i16 6, i16 32, i16 34, i16 36, i16 38, i16 8, i16 10, i16 12, i16 14, i16 40, i16 42, i16 44, i16 46, i16 16, i16 18, i16 20, i16 22, i16 48, i16 50, i16 52, i16 54, i16 24, i16 26, i16 28, i16 30, i16 56, i16 58, i16 60, i16 62>, <32 x i16> %4, i32 -1)
+  ret <32 x i16> %5
+}
+
+define <32 x i16> @combine_vpermi2var_as_packusdw(<16 x i32> %a0, <16 x i32> %a1) nounwind {
+; CHECK-LABEL: combine_vpermi2var_as_packusdw:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsrld $25, %zmm0, %zmm0
+; CHECK-NEXT:    vpsrld $25, %zmm1, %zmm1
+; CHECK-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %1 = lshr <16 x i32> %a0, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+  %2 = lshr <16 x i32> %a1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+  %3 = bitcast <16 x i32> %1 to <32 x i16>
+  %4 = bitcast <16 x i32> %2 to <32 x i16>
+  %5 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %3, <32 x i16> <i16 0, i16 2, i16 4, i16 6, i16 32, i16 34, i16 36, i16 38, i16 8, i16 10, i16 12, i16 14, i16 40, i16 42, i16 44, i16 46, i16 16, i16 18, i16 20, i16 22, i16 48, i16 50, i16 52, i16 54, i16 24, i16 26, i16 28, i16 30, i16 56, i16 58, i16 60, i16 62>, <32 x i16> %4, i32 -1)
+  ret <32 x i16> %5
+}
+
 define <64 x i8> @combine_pshufb_as_packsswb(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; CHECK-LABEL: combine_pshufb_as_packsswb:
 ; CHECK:       # %bb.0: