forked from OSchip/llvm-project
[X86] Add back the -x86-experimental-vector-widening-legalization command line flag and all associated code, but leave it enabled by default
Google is reporting performance issues with the new default behavior and has asked for a way to switch back to the old behavior while we investigate and make fixes. I've restored all of the code that had since been removed and added additional checks of the command flag onto code paths that are not otherwise guarded by a check of getTypeAction. I've also modified the cost model tables to hopefully get us back to the previous costs. Hopefully we won't need to support this for very long, since we have no test coverage of the old behavior and so could very easily break it. llvm-svn: 369332
This commit is contained in:
parent
12cbbab9d9
commit
1ada137854
File diff suppressed because it is too large
Load Diff
|
@ -50,6 +50,8 @@ using namespace llvm;
|
|||
|
||||
#define DEBUG_TYPE "x86tti"
|
||||
|
||||
extern cl::opt<bool> ExperimentalVectorWideningLegalization;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// X86 cost model.
|
||||
|
@ -918,7 +920,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
|||
// FIXME: We can use permq for 64-bit or larger extracts from 256-bit
|
||||
// vectors.
|
||||
int OrigSubElts = SubTp->getVectorNumElements();
|
||||
if (NumSubElts > OrigSubElts &&
|
||||
if (ExperimentalVectorWideningLegalization &&
|
||||
NumSubElts > OrigSubElts &&
|
||||
(Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
|
||||
LT.second.getVectorElementType() ==
|
||||
SubLT.second.getVectorElementType() &&
|
||||
|
@ -1330,6 +1333,12 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
|||
// TODO: For AVX512DQ + AVX512VL, we also have cheap casts for 128-bit and
|
||||
// 256-bit wide vectors.
|
||||
|
||||
// Used with widening legalization
|
||||
static const TypeConversionCostTblEntry AVX512FConversionTblWide[] = {
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
|
||||
};
|
||||
|
||||
static const TypeConversionCostTblEntry AVX512FConversionTbl[] = {
|
||||
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 1 },
|
||||
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
|
||||
|
@ -1347,8 +1356,6 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
|||
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
|
||||
|
@ -1401,19 +1408,28 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
|||
{ ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 },
|
||||
};
|
||||
|
||||
static const TypeConversionCostTblEntry AVX2ConversionTblWide[] = {
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
|
||||
};
|
||||
|
||||
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 3 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 3 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 3 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
|
||||
|
@ -1432,18 +1448,24 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
|||
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 8 },
|
||||
};
|
||||
|
||||
static const TypeConversionCostTblEntry AVXConversionTblWide[] = {
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
|
||||
};
|
||||
|
||||
static const TypeConversionCostTblEntry AVXConversionTbl[] = {
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 6 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 4 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 7 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 4 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 6 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 7 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 6 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
|
||||
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
|
||||
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 4 },
|
||||
|
@ -1642,18 +1664,35 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
|||
SimpleDstTy, SimpleSrcTy))
|
||||
return Entry->Cost;
|
||||
|
||||
if (ST->hasAVX512() && ExperimentalVectorWideningLegalization)
|
||||
if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTblWide, ISD,
|
||||
SimpleDstTy, SimpleSrcTy))
|
||||
return Entry->Cost;
|
||||
|
||||
if (ST->hasAVX512())
|
||||
if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
|
||||
SimpleDstTy, SimpleSrcTy))
|
||||
return Entry->Cost;
|
||||
}
|
||||
|
||||
if (ST->hasAVX2() && ExperimentalVectorWideningLegalization) {
|
||||
if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTblWide, ISD,
|
||||
SimpleDstTy, SimpleSrcTy))
|
||||
return Entry->Cost;
|
||||
}
|
||||
|
||||
if (ST->hasAVX2()) {
|
||||
if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
|
||||
SimpleDstTy, SimpleSrcTy))
|
||||
return Entry->Cost;
|
||||
}
|
||||
|
||||
if (ST->hasAVX() && ExperimentalVectorWideningLegalization) {
|
||||
if (const auto *Entry = ConvertCostTableLookup(AVXConversionTblWide, ISD,
|
||||
SimpleDstTy, SimpleSrcTy))
|
||||
return Entry->Cost;
|
||||
}
|
||||
|
||||
if (ST->hasAVX()) {
|
||||
if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
|
||||
SimpleDstTy, SimpleSrcTy))
|
||||
|
@ -2520,7 +2559,7 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
|
|||
// in the table.
|
||||
// FIXME: Is there a better way to do this?
|
||||
EVT VT = TLI->getValueType(DL, ValTy);
|
||||
if (VT.isSimple()) {
|
||||
if (VT.isSimple() && ExperimentalVectorWideningLegalization) {
|
||||
MVT MTy = VT.getSimpleVT();
|
||||
if (IsPairwise) {
|
||||
if (ST->hasAVX())
|
||||
|
|
Loading…
Reference in New Issue