diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 8da6bde6c9e3..f7aac458be88 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -8431,6 +8431,31 @@ let Predicates = [HasAVX2] in {
             (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
   def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
             (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+
+  def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
+            (VPBROADCASTBrr (COPY_TO_REGCLASS
+                             (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+                             VR128))>;
+  def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
+            (VPBROADCASTBYrr (COPY_TO_REGCLASS
+                              (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+                              VR128))>;
+
+  def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
+            (VPBROADCASTWrr (COPY_TO_REGCLASS
+                             (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+                             VR128))>;
+  def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
+            (VPBROADCASTWYrr (COPY_TO_REGCLASS
+                              (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+                              VR128))>;
+
+  // The patterns for VPBROADCASTD are not needed because they would match
+  // the exact same thing as the VBROADCASTSS patterns.
+
+  def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
+            (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+  // The v4i64 pattern is not needed because VBROADCASTSDYrr already matches.
 }
 }
diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
index 16e676214c4e..bac9c66c3aed 100644
--- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -413,3 +413,161 @@ define <4 x double> @splat_concat4(double %d) {
   %5 = shufflevector <2 x double> %2, <2 x double> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x double> %5
 }
+
+; Test cases for .
+; Instruction selection for the broadcast instruction fails if
+; the load cannot be folded into the broadcast.
+; This happens if the load initially has one use but other uses are
+; created later, or if the selection DAG cannot prove that folding the
+; load will not create a cycle in the DAG.
+; These test cases exercise the latter.
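+;
+; Note: when the load is not folded, the X86VBroadcast node is left with a
+; plain GR8/GR16/GR64 operand. The register-to-register patterns added to
+; X86InstrSSE.td above let instruction selection pick VPBROADCASTB/W/Qrr
+; (or the existing VBROADCASTSS/SD patterns for the dword/qword cases)
+; instead of failing to select.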
+
+; CHECK-LABEL: isel_crash_16b
+; CHECK: vpbroadcastb {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_16b(i8* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <2 x i64>, align 16
+  %__b.addr.i = alloca <2 x i64>, align 16
+  %vCr = alloca <2 x i64>, align 16
+  store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+  %tmp = load <2 x i64>* %vCr, align 16
+  %tmp2 = load i8* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <16 x i8> undef, i8 %tmp2, i32 0
+  %splat.splat = shufflevector <16 x i8> %splat.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+  %tmp3 = bitcast <16 x i8> %splat.splat to <2 x i64>
+  store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+  store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_32b
+; CHECK: vpbroadcastb {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_32b(i8* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <4 x i64>, align 16
+  %__b.addr.i = alloca <4 x i64>, align 16
+  %vCr = alloca <4 x i64>, align 16
+  store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+  %tmp = load <4 x i64>* %vCr, align 16
+  %tmp2 = load i8* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <32 x i8> undef, i8 %tmp2, i32 0
+  %splat.splat = shufflevector <32 x i8> %splat.splatinsert, <32 x i8> undef, <32 x i32> zeroinitializer
+  %tmp3 = bitcast <32 x i8> %splat.splat to <4 x i64>
+  store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+  store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_8w
+; CHECK: vpbroadcastw {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_8w(i16* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <2 x i64>, align 16
+  %__b.addr.i = alloca <2 x i64>, align 16
+  %vCr = alloca <2 x i64>, align 16
+  store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+  %tmp = load <2 x i64>* %vCr, align 16
+  %tmp2 = load i16* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <8 x i16> undef, i16 %tmp2, i32 0
+  %splat.splat = shufflevector <8 x i16> %splat.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %tmp3 = bitcast <8 x i16> %splat.splat to <2 x i64>
+  store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+  store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_16w
+; CHECK: vpbroadcastw {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_16w(i16* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <4 x i64>, align 16
+  %__b.addr.i = alloca <4 x i64>, align 16
+  %vCr = alloca <4 x i64>, align 16
+  store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+  %tmp = load <4 x i64>* %vCr, align 16
+  %tmp2 = load i16* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <16 x i16> undef, i16 %tmp2, i32 0
+  %splat.splat = shufflevector <16 x i16> %splat.splatinsert, <16 x i16> undef, <16 x i32> zeroinitializer
+  %tmp3 = bitcast <16 x i16> %splat.splat to <4 x i64>
+  store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+  store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_4d
+; CHECK: vbroadcastss {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_4d(i32* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <2 x i64>, align 16
+  %__b.addr.i = alloca <2 x i64>, align 16
+  %vCr = alloca <2 x i64>, align 16
+  store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+  %tmp = load <2 x i64>* %vCr, align 16
+  %tmp2 = load i32* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %tmp2, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp3 = bitcast <4 x i32> %splat.splat to <2 x i64>
+  store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+  store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_8d
+; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_8d(i32* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <4 x i64>, align 16
+  %__b.addr.i = alloca <4 x i64>, align 16
+  %vCr = alloca <4 x i64>, align 16
+  store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+  %tmp = load <4 x i64>* %vCr, align 16
+  %tmp2 = load i32* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <8 x i32> undef, i32 %tmp2, i32 0
+  %splat.splat = shufflevector <8 x i32> %splat.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+  %tmp3 = bitcast <8 x i32> %splat.splat to <4 x i64>
+  store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+  store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_2q
+; CHECK: vpbroadcastq {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_2q(i64* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <2 x i64>, align 16
+  %__b.addr.i = alloca <2 x i64>, align 16
+  %vCr = alloca <2 x i64>, align 16
+  store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+  %tmp = load <2 x i64>* %vCr, align 16
+  %tmp2 = load i64* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %tmp2, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+  store <2 x i64> %splat.splat, <2 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_4q
+; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_4q(i64* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <4 x i64>, align 16
+  %__b.addr.i = alloca <4 x i64>, align 16
+  %vCr = alloca <4 x i64>, align 16
+  store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+  %tmp = load <4 x i64>* %vCr, align 16
+  %tmp2 = load i64* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <4 x i64> undef, i64 %tmp2, i32 0
+  %splat.splat = shufflevector <4 x i64> %splat.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
+  store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+  store <4 x i64> %splat.splat, <4 x i64>* %__b.addr.i, align 16
+  ret void
+}
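
A minimal sketch of the case the new register-to-register patterns cover,
separate from the patch above: the scalar arrives as a function argument, so
there is no load to fold into the broadcast at all. The function name and the
RUN/CHECK lines are illustrative assumptions, not lines from the original
test file:

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s
; CHECK-LABEL: broadcast_i16_from_gpr
; CHECK: vpbroadcastw %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
define <8 x i16> @broadcast_i16_from_gpr(i16 %x) {
entry:
  ; %x lives in a GPR, so the splat lowers to (v8i16 (X86VBroadcast GR16:$src)).
  %ins = insertelement <8 x i16> undef, i16 %x, i32 0
  %splat = shufflevector <8 x i16> %ins, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %splat
}

With the v8i16 pattern added above, instruction selection should widen the
i16 to i32 with SUBREG_TO_REG, copy it into a VR128 (typically a vmovd), and
emit VPBROADCASTWrr, rather than failing to select the broadcast node.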