[X86][ISelDAG] Add missing fallback patterns for avx2 broadcast instructions.

Those patterns are used when the load cannot be folded into the related broadcast
during the select phase.
This happens when the load gets additional uses that were not anticipated during
the previous lowering phases (constant vector to constant load, then constant
load reused) or when selection DAG is not able to prove that folding the load
will not create a cycle in the DAG.

<rdar://problem/16074331>

llvm-svn: 204631
This commit is contained in:
Quentin Colombet 2014-03-24 17:54:19 +00:00
parent ad41d7b531
commit 2d5c156b96
2 changed files with 183 additions and 0 deletions

View File

@ -8431,6 +8431,31 @@ let Predicates = [HasAVX2] in {
(VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
(VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
// Fallback selection patterns for integer X86VBroadcast nodes whose scalar
// operand is a general-purpose register (GR8 / GR16 / GR64) instead of a load
// folded into the broadcast.
// For the byte and word forms the narrow register is first wrapped into an
// i32 value with SUBREG_TO_REG (sub_8bit / sub_16bit), and that value is then
// moved to VR128 with COPY_TO_REGCLASS so the register-register broadcast
// instruction can take it as its source.
// NOTE(review): SUBREG_TO_REG is given (i32 0) as its first operand; the
// broadcast presumably only reads the low element, so the contents of the
// upper bits should not matter -- confirm that this is the intended form.
def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
(VPBROADCASTBrr (COPY_TO_REGCLASS
(i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
VR128))>;
// 256-bit byte broadcast: same source preparation, YMM-producing instruction.
def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
(VPBROADCASTBYrr (COPY_TO_REGCLASS
(i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
VR128))>;
// 128-bit word broadcast from a 16-bit register.
def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
(VPBROADCASTWrr (COPY_TO_REGCLASS
(i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
VR128))>;
// 256-bit word broadcast from a 16-bit register.
def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
(VPBROADCASTWYrr (COPY_TO_REGCLASS
(i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
VR128))>;
// The patterns for VPBROADCASTD are not needed because they would match
// the exact same thing as VBROADCASTSS patterns.
// 128-bit quadword broadcast: the 64-bit register is copied to VR128 directly.
def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
(VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
// The v4i64 pattern is not needed because VBROADCASTSDYrr already matches.
}
}

View File

@ -413,3 +413,161 @@ define <4 x double> @splat_concat4(double %d) {
%5 = shufflevector <2 x double> %2, <2 x double> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x double> %5
}
; Test cases for <rdar://problem/16074331>.
; Instruction selection for broadcast instructions fails if
; the load cannot be folded into the broadcast.
; This happens if the load initially has a single use but other uses are
; created later, or if the selection DAG cannot prove that folding the
; load will not create a cycle in the DAG.
; These test cases exercise the latter.
; CHECK-LABEL: isel_crash_16b
; CHECK: vpbroadcastb {{[^,]+}}, %xmm{{[0-9]+}}
; CHECK: ret
; Splats one i8 loaded from %cV_R.addr across all 16 lanes of a <16 x i8>
; and stores the result (viewed as <2 x i64>) next to a reloaded zero vector.
; The instruction order is kept exactly as in the reduced reproducer; only
; local value names differ.
define void @isel_crash_16b(i8* %cV_R.addr) {
entry:
%lhs.slot = alloca <2 x i64>, align 16
%rhs.slot = alloca <2 x i64>, align 16
%zero.slot = alloca <2 x i64>, align 16
; Round-trip a zero vector through the stack before the scalar load.
store <2 x i64> zeroinitializer, <2 x i64>* %zero.slot, align 16
%zero.val = load <2 x i64>* %zero.slot, align 16
%scalar = load i8* %cV_R.addr, align 4
; insertelement into lane 0 + all-zero shuffle mask == broadcast of %scalar.
%lane0 = insertelement <16 x i8> undef, i8 %scalar, i32 0
%bcast = shufflevector <16 x i8> %lane0, <16 x i8> undef, <16 x i32> zeroinitializer
%bcast.q = bitcast <16 x i8> %bcast to <2 x i64>
store <2 x i64> %zero.val, <2 x i64>* %lhs.slot, align 16
store <2 x i64> %bcast.q, <2 x i64>* %rhs.slot, align 16
ret void
}
; CHECK-LABEL: isel_crash_32b
; CHECK: vpbroadcastb {{[^,]+}}, %ymm{{[0-9]+}}
; CHECK: ret
; Splats one i8 loaded from %cV_R.addr across all 32 lanes of a <32 x i8>
; and stores the result (viewed as <4 x i64>) next to a reloaded zero vector.
; The instruction order is kept exactly as in the reduced reproducer; only
; local value names differ.
define void @isel_crash_32b(i8* %cV_R.addr) {
entry:
%lhs.slot = alloca <4 x i64>, align 16
%rhs.slot = alloca <4 x i64>, align 16
%zero.slot = alloca <4 x i64>, align 16
; Round-trip a zero vector through the stack before the scalar load.
store <4 x i64> zeroinitializer, <4 x i64>* %zero.slot, align 16
%zero.val = load <4 x i64>* %zero.slot, align 16
%scalar = load i8* %cV_R.addr, align 4
; insertelement into lane 0 + all-zero shuffle mask == broadcast of %scalar.
%lane0 = insertelement <32 x i8> undef, i8 %scalar, i32 0
%bcast = shufflevector <32 x i8> %lane0, <32 x i8> undef, <32 x i32> zeroinitializer
%bcast.q = bitcast <32 x i8> %bcast to <4 x i64>
store <4 x i64> %zero.val, <4 x i64>* %lhs.slot, align 16
store <4 x i64> %bcast.q, <4 x i64>* %rhs.slot, align 16
ret void
}
; CHECK-LABEL: isel_crash_8w
; CHECK: vpbroadcastw {{[^,]+}}, %xmm{{[0-9]+}}
; CHECK: ret
; Splats one i16 loaded from %cV_R.addr across all 8 lanes of a <8 x i16>
; and stores the result (viewed as <2 x i64>) next to a reloaded zero vector.
; The instruction order is kept exactly as in the reduced reproducer; only
; local value names differ.
define void @isel_crash_8w(i16* %cV_R.addr) {
start:
%lhs.slot = alloca <2 x i64>, align 16
%rhs.slot = alloca <2 x i64>, align 16
%zero.slot = alloca <2 x i64>, align 16
; Round-trip a zero vector through the stack before the scalar load.
store <2 x i64> zeroinitializer, <2 x i64>* %zero.slot, align 16
%zero.val = load <2 x i64>* %zero.slot, align 16
%scalar = load i16* %cV_R.addr, align 4
; insertelement into lane 0 + all-zero shuffle mask == broadcast of %scalar.
%lane0 = insertelement <8 x i16> undef, i16 %scalar, i32 0
%bcast = shufflevector <8 x i16> %lane0, <8 x i16> undef, <8 x i32> zeroinitializer
%bcast.q = bitcast <8 x i16> %bcast to <2 x i64>
store <2 x i64> %zero.val, <2 x i64>* %lhs.slot, align 16
store <2 x i64> %bcast.q, <2 x i64>* %rhs.slot, align 16
ret void
}
; CHECK-LABEL: isel_crash_16w
; CHECK: vpbroadcastw {{[^,]+}}, %ymm{{[0-9]+}}
; CHECK: ret
; Splats one i16 loaded from %cV_R.addr across all 16 lanes of a <16 x i16>
; and stores the result (viewed as <4 x i64>) next to a reloaded zero vector.
; The instruction order is kept exactly as in the reduced reproducer; only
; local value names differ.
define void @isel_crash_16w(i16* %cV_R.addr) {
entry:
%lhs.slot = alloca <4 x i64>, align 16
%rhs.slot = alloca <4 x i64>, align 16
%zero.slot = alloca <4 x i64>, align 16
; Round-trip a zero vector through the stack before the scalar load.
store <4 x i64> zeroinitializer, <4 x i64>* %zero.slot, align 16
%zero.val = load <4 x i64>* %zero.slot, align 16
%scalar = load i16* %cV_R.addr, align 4
; insertelement into lane 0 + all-zero shuffle mask == broadcast of %scalar.
%lane0 = insertelement <16 x i16> undef, i16 %scalar, i32 0
%bcast = shufflevector <16 x i16> %lane0, <16 x i16> undef, <16 x i32> zeroinitializer
%bcast.q = bitcast <16 x i16> %bcast to <4 x i64>
store <4 x i64> %zero.val, <4 x i64>* %lhs.slot, align 16
store <4 x i64> %bcast.q, <4 x i64>* %rhs.slot, align 16
ret void
}
; CHECK-LABEL: isel_crash_4d
; CHECK: vbroadcastss {{[^,]+}}, %xmm{{[0-9]+}}
; CHECK: ret
; Splats one i32 loaded from %cV_R.addr across all 4 lanes of a <4 x i32>
; and stores the result (viewed as <2 x i64>) next to a reloaded zero vector.
; The instruction order is kept exactly as in the reduced reproducer; only
; local value names differ.
define void @isel_crash_4d(i32* %cV_R.addr) {
start:
%lhs.slot = alloca <2 x i64>, align 16
%rhs.slot = alloca <2 x i64>, align 16
%zero.slot = alloca <2 x i64>, align 16
; Round-trip a zero vector through the stack before the scalar load.
store <2 x i64> zeroinitializer, <2 x i64>* %zero.slot, align 16
%zero.val = load <2 x i64>* %zero.slot, align 16
%scalar = load i32* %cV_R.addr, align 4
; insertelement into lane 0 + all-zero shuffle mask == broadcast of %scalar.
%lane0 = insertelement <4 x i32> undef, i32 %scalar, i32 0
%bcast = shufflevector <4 x i32> %lane0, <4 x i32> undef, <4 x i32> zeroinitializer
%bcast.q = bitcast <4 x i32> %bcast to <2 x i64>
store <2 x i64> %zero.val, <2 x i64>* %lhs.slot, align 16
store <2 x i64> %bcast.q, <2 x i64>* %rhs.slot, align 16
ret void
}
; CHECK-LABEL: isel_crash_8d
; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}}
; CHECK: ret
; Splats one i32 loaded from %cV_R.addr across all 8 lanes of a <8 x i32>
; and stores the result (viewed as <4 x i64>) next to a reloaded zero vector.
; The instruction order is kept exactly as in the reduced reproducer; only
; local value names differ.
define void @isel_crash_8d(i32* %cV_R.addr) {
entry:
%lhs.slot = alloca <4 x i64>, align 16
%rhs.slot = alloca <4 x i64>, align 16
%zero.slot = alloca <4 x i64>, align 16
; Round-trip a zero vector through the stack before the scalar load.
store <4 x i64> zeroinitializer, <4 x i64>* %zero.slot, align 16
%zero.val = load <4 x i64>* %zero.slot, align 16
%scalar = load i32* %cV_R.addr, align 4
; insertelement into lane 0 + all-zero shuffle mask == broadcast of %scalar.
%lane0 = insertelement <8 x i32> undef, i32 %scalar, i32 0
%bcast = shufflevector <8 x i32> %lane0, <8 x i32> undef, <8 x i32> zeroinitializer
%bcast.q = bitcast <8 x i32> %bcast to <4 x i64>
store <4 x i64> %zero.val, <4 x i64>* %lhs.slot, align 16
store <4 x i64> %bcast.q, <4 x i64>* %rhs.slot, align 16
ret void
}
; CHECK-LABEL: isel_crash_2q
; CHECK: vpbroadcastq {{[^,]+}}, %xmm{{[0-9]+}}
; CHECK: ret
; Splats one i64 loaded from %cV_R.addr across both lanes of a <2 x i64>
; and stores the result next to a reloaded zero vector (no bitcast is needed
; because the splat already has the stored type).
; The instruction order is kept exactly as in the reduced reproducer; only
; local value names differ.
define void @isel_crash_2q(i64* %cV_R.addr) {
start:
%lhs.slot = alloca <2 x i64>, align 16
%rhs.slot = alloca <2 x i64>, align 16
%zero.slot = alloca <2 x i64>, align 16
; Round-trip a zero vector through the stack before the scalar load.
store <2 x i64> zeroinitializer, <2 x i64>* %zero.slot, align 16
%zero.val = load <2 x i64>* %zero.slot, align 16
%scalar = load i64* %cV_R.addr, align 4
; insertelement into lane 0 + all-zero shuffle mask == broadcast of %scalar.
%lane0 = insertelement <2 x i64> undef, i64 %scalar, i32 0
%bcast = shufflevector <2 x i64> %lane0, <2 x i64> undef, <2 x i32> zeroinitializer
store <2 x i64> %zero.val, <2 x i64>* %lhs.slot, align 16
store <2 x i64> %bcast, <2 x i64>* %rhs.slot, align 16
ret void
}
; CHECK-LABEL: isel_crash_4q
; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
; CHECK: ret
; Splats one i64 loaded from %cV_R.addr across all 4 lanes of a <4 x i64>
; and stores the result next to a reloaded zero vector (no bitcast is needed
; because the splat already has the stored type).
; The instruction order is kept exactly as in the reduced reproducer; only
; local value names differ.
define void @isel_crash_4q(i64* %cV_R.addr) {
entry:
%lhs.slot = alloca <4 x i64>, align 16
%rhs.slot = alloca <4 x i64>, align 16
%zero.slot = alloca <4 x i64>, align 16
; Round-trip a zero vector through the stack before the scalar load.
store <4 x i64> zeroinitializer, <4 x i64>* %zero.slot, align 16
%zero.val = load <4 x i64>* %zero.slot, align 16
%scalar = load i64* %cV_R.addr, align 4
; insertelement into lane 0 + all-zero shuffle mask == broadcast of %scalar.
%lane0 = insertelement <4 x i64> undef, i64 %scalar, i32 0
%bcast = shufflevector <4 x i64> %lane0, <4 x i64> undef, <4 x i32> zeroinitializer
store <4 x i64> %zero.val, <4 x i64>* %lhs.slot, align 16
store <4 x i64> %bcast, <4 x i64>* %rhs.slot, align 16
ret void
}