Review summary (added by reviewer; text before the first "diff --git" line is
ignored by patch tools and is not part of the original patch payload).

What the patch does:
  * X86InstrSSE.td: adds register-to-register selection patterns that map
    X86vsext onto the PMOVSX instruction family.
      - HasAVX2:  xmm sources (v16i8 / v8i16 / v4i32) select VPMOVSX*Yrr.
                  For ymm sources (v32i8 / v16i16 / v8i32) the pattern feeds
                  only the sub_xmm subregister of the source (EXTRACT_SUBREG)
                  to the same VPMOVSX*Yrr instruction.
      - UseSSE41: xmm sources select PMOVSX*rr.
      - HasAVX:   xmm sources select VPMOVSX*rr.
  * test/CodeGen/X86/pmovsx-inreg.ll (new, PR14887): twelve sext-of-load
    functions checked for penryn (SSE41), corei7-avx (AVX1) and core-avx2
    (AVX2). test6 (v16i8 -> v16i16) carries a FIXME and currently asserts
    that no pmovsx is emitted on AVX2.

NOTE(review): this text was recovered from a copy whose newlines and
indentation had been collapsed. Record boundaries were restored so that every
hunk matches the line counts in its @@ header (6/30, 6/15, 6/15, 0/176).
Whitespace inside lines, and the exact wrap points of the multi-line context
patterns in X86InstrSSE.td, are reconstructed guesses - confirm against the
tree, or apply with whitespace-insensitive matching, before relying on it.

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 3175324b4c95..23073a938784 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -5590,6 +5590,30 @@ defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
 defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
 
 let Predicates = [HasAVX2] in {
+  def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
+  def : Pat<(v8i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>;
+  def : Pat<(v4i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>;
+
+  def : Pat<(v8i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+  def : Pat<(v4i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>;
+
+  def : Pat<(v4i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+
+  def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))),
+            (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))),
+            (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))),
+            (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))),
+            (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+  def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))),
+            (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+  def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))),
+            (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
   def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
             (VPMOVSXWDYrm addr:$src)>;
   def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
@@ -5628,6 +5652,15 @@ let Predicates = [HasAVX] in {
 }
 
 let Predicates = [UseSSE41] in {
+  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+
   // Common patterns involving scalar load
   def : Pat<(int_x86_sse41_pmovsxbq
               (bitconvert (v4i32 (X86vzmovl
@@ -5727,6 +5760,15 @@ let Predicates = [HasAVX] in {
   def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert
                     (v2i64 (X86vzload addr:$src)))))),
             (VPMOVZXDQrm addr:$src)>;
+  def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
+  def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>;
+
+  def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+  def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>;
+
+  def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+
   def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
                     (scalar_to_vector (loadi64 addr:$src))))))),
             (VPMOVSXWDrm addr:$src)>;
diff --git a/llvm/test/CodeGen/X86/pmovsx-inreg.ll b/llvm/test/CodeGen/X86/pmovsx-inreg.ll
new file mode 100644
index 000000000000..d8c27f25043a
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pmovsx-inreg.ll
@@ -0,0 +1,176 @@
+; RUN: llc < %s -march=x86-64 -mcpu=penryn | FileCheck -check-prefix=SSE41 %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck -check-prefix=AVX1 %s
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck -check-prefix=AVX2 %s
+
+; PR14887
+; These tests inject a store into the chain to test the inreg versions of pmovsx
+
+define void @test1(<2 x i8>* %in, <2 x i64>* %out) nounwind {
+  %wide.load35 = load <2 x i8>* %in, align 1
+  %sext = sext <2 x i8> %wide.load35 to <2 x i64>
+  store <2 x i64> zeroinitializer, <2 x i64>* undef, align 8
+  store <2 x i64> %sext, <2 x i64>* %out, align 8
+  ret void
+
+; SSE41: test1:
+; SSE41: pmovsxbq
+
+; AVX1: test1:
+; AVX1: vpmovsxbq
+
+; AVX2: test1:
+; AVX2: vpmovsxbq
+}
+
+define void @test2(<4 x i8>* %in, <4 x i64>* %out) nounwind {
+  %wide.load35 = load <4 x i8>* %in, align 1
+  %sext = sext <4 x i8> %wide.load35 to <4 x i64>
+  store <4 x i64> zeroinitializer, <4 x i64>* undef, align 8
+  store <4 x i64> %sext, <4 x i64>* %out, align 8
+  ret void
+
+; AVX2: test2:
+; AVX2: vpmovsxbq
+}
+
+define void @test3(<4 x i8>* %in, <4 x i32>* %out) nounwind {
+  %wide.load35 = load <4 x i8>* %in, align 1
+  %sext = sext <4 x i8> %wide.load35 to <4 x i32>
+  store <4 x i32> zeroinitializer, <4 x i32>* undef, align 8
+  store <4 x i32> %sext, <4 x i32>* %out, align 8
+  ret void
+
+; SSE41: test3:
+; SSE41: pmovsxbd
+
+; AVX1: test3:
+; AVX1: vpmovsxbd
+
+; AVX2: test3:
+; AVX2: vpmovsxbd
+}
+
+define void @test4(<8 x i8>* %in, <8 x i32>* %out) nounwind {
+  %wide.load35 = load <8 x i8>* %in, align 1
+  %sext = sext <8 x i8> %wide.load35 to <8 x i32>
+  store <8 x i32> zeroinitializer, <8 x i32>* undef, align 8
+  store <8 x i32> %sext, <8 x i32>* %out, align 8
+  ret void
+
+; AVX2: test4:
+; AVX2: vpmovsxbd
+}
+
+define void @test5(<8 x i8>* %in, <8 x i16>* %out) nounwind {
+  %wide.load35 = load <8 x i8>* %in, align 1
+  %sext = sext <8 x i8> %wide.load35 to <8 x i16>
+  store <8 x i16> zeroinitializer, <8 x i16>* undef, align 8
+  store <8 x i16> %sext, <8 x i16>* %out, align 8
+  ret void
+
+; SSE41: test5:
+; SSE41: pmovsxbw
+
+; AVX1: test5:
+; AVX1: vpmovsxbw
+
+; AVX2: test5:
+; AVX2: vpmovsxbw
+}
+
+define void @test6(<16 x i8>* %in, <16 x i16>* %out) nounwind {
+  %wide.load35 = load <16 x i8>* %in, align 1
+  %sext = sext <16 x i8> %wide.load35 to <16 x i16>
+  store <16 x i16> zeroinitializer, <16 x i16>* undef, align 8
+  store <16 x i16> %sext, <16 x i16>* %out, align 8
+  ret void
+
+; AVX2: test6:
+; FIXME: v16i8 -> v16i16 is scalarized.
+; AVX2-NOT: pmovsx
+}
+
+define void @test7(<2 x i16>* %in, <2 x i64>* %out) nounwind {
+  %wide.load35 = load <2 x i16>* %in, align 1
+  %sext = sext <2 x i16> %wide.load35 to <2 x i64>
+  store <2 x i64> zeroinitializer, <2 x i64>* undef, align 8
+  store <2 x i64> %sext, <2 x i64>* %out, align 8
+  ret void
+
+
+; SSE41: test7:
+; SSE41: pmovsxwq
+
+; AVX1: test7:
+; AVX1: vpmovsxwq
+
+; AVX2: test7:
+; AVX2: vpmovsxwq
+}
+
+define void @test8(<4 x i16>* %in, <4 x i64>* %out) nounwind {
+  %wide.load35 = load <4 x i16>* %in, align 1
+  %sext = sext <4 x i16> %wide.load35 to <4 x i64>
+  store <4 x i64> zeroinitializer, <4 x i64>* undef, align 8
+  store <4 x i64> %sext, <4 x i64>* %out, align 8
+  ret void
+
+; AVX2: test8:
+; AVX2: vpmovsxwq
+}
+
+define void @test9(<4 x i16>* %in, <4 x i32>* %out) nounwind {
+  %wide.load35 = load <4 x i16>* %in, align 1
+  %sext = sext <4 x i16> %wide.load35 to <4 x i32>
+  store <4 x i32> zeroinitializer, <4 x i32>* undef, align 8
+  store <4 x i32> %sext, <4 x i32>* %out, align 8
+  ret void
+
+; SSE41: test9:
+; SSE41: pmovsxwd
+
+; AVX1: test9:
+; AVX1: vpmovsxwd
+
+; AVX2: test9:
+; AVX2: vpmovsxwd
+}
+
+define void @test10(<8 x i16>* %in, <8 x i32>* %out) nounwind {
+  %wide.load35 = load <8 x i16>* %in, align 1
+  %sext = sext <8 x i16> %wide.load35 to <8 x i32>
+  store <8 x i32> zeroinitializer, <8 x i32>* undef, align 8
+  store <8 x i32> %sext, <8 x i32>* %out, align 8
+  ret void
+
+; AVX2: test10:
+; AVX2: vpmovsxwd
+}
+
+define void @test11(<2 x i32>* %in, <2 x i64>* %out) nounwind {
+  %wide.load35 = load <2 x i32>* %in, align 1
+  %sext = sext <2 x i32> %wide.load35 to <2 x i64>
+  store <2 x i64> zeroinitializer, <2 x i64>* undef, align 8
+  store <2 x i64> %sext, <2 x i64>* %out, align 8
+  ret void
+
+; SSE41: test11:
+; SSE41: pmovsxdq
+
+; AVX1: test11:
+; AVX1: vpmovsxdq
+
+; AVX2: test11:
+; AVX2: vpmovsxdq
+}
+
+define void @test12(<4 x i32>* %in, <4 x i64>* %out) nounwind {
+  %wide.load35 = load <4 x i32>* %in, align 1
+  %sext = sext <4 x i32> %wide.load35 to <4 x i64>
+  store <4 x i64> zeroinitializer, <4 x i64>* undef, align 8
+  store <4 x i64> %sext, <4 x i64>* %out, align 8
+  ret void
+
+; AVX2: test12:
+; AVX2: vpmovsxdq
+}