llvm-project/clang/utils/TableGen/NeonEmitter.cpp

//===- NeonEmitter.cpp - Generate arm_neon.h for use with clang -*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This tablegen backend is responsible for emitting arm_neon.h, which includes
// a declaration and definition of each function specified by the ARM NEON
// compiler interface.  See ARM document DUI0348B.
//
// Each NEON instruction is implemented in terms of 1 or more functions which
// are suffixed with the element type of the input vectors.  Functions may be
// implemented in terms of generic vector operations such as +, *, -, etc. or
// by calling a __builtin_-prefixed function which will be handled by clang's
// CodeGen library.
//
// Additional validation code can be generated by this file when runHeader() is
// called, rather than the normal run() entry point.  A complete set of tests
// for Neon intrinsics can be generated by calling the runTests() entry point.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <string>
using namespace llvm;

/// OpKind - Identifies the operation associated with an intrinsic.  Each
/// enumerator is the in-memory counterpart of an "OP_*" name; the
/// NeonEmitter constructor registers the name-to-enumerator mapping in its
/// OpMap.  The numeric values are implicit, so the order of the enumerators
/// determines their values.
enum OpKind {
  OpNone,
  OpUnavailable,
  OpAdd,
  OpAddl,
  OpAddlHi,
  OpAddw,
  OpAddwHi,
  OpSub,
  OpSubl,
  OpSublHi,
  OpSubw,
  OpSubwHi,
  OpMul,
  OpMla,
  OpMlal,
  OpMullHi,
  OpMullHiN,
  OpMlalHi,
  OpMlalHiN,
  OpMls,
  OpMlsl,
  OpMlslHi,
  OpMlslHiN,
  OpMulN,
  OpMlaN,
  OpMlsN,
  OpFMlaN,
  OpFMlsN,
  OpMlalN,
  OpMlslN,
  OpMulLane,
  OpMulXLane,
  OpMullLane,
  OpMullHiLane,
  OpMlaLane,
  OpMlsLane,
  OpMlalLane,
  OpMlalHiLane,
  OpMlslLane,
  OpMlslHiLane,
  OpQDMullLane,
  OpQDMullHiLane,
  OpQDMlalLane,
  OpQDMlalHiLane,
  OpQDMlslLane,
  OpQDMlslHiLane,
  OpQDMulhLane,
  OpQRDMulhLane,
  OpFMSLane,
  OpFMSLaneQ,
  OpTrn1,
  OpZip1,
  OpUzp1,
  OpTrn2,
  OpZip2,
  OpUzp2,
  OpEq,
  OpGe,
  OpLe,
  OpGt,
  OpLt,
  OpNeg,
  OpNot,
  OpAnd,
  OpOr,
  OpXor,
  OpAndNot,
  OpOrNot,
  OpCast,
  OpConcat,
  OpDup,
  OpDupLane,
  OpHi,
  OpLo,
  OpSelect,
  OpRev16,
  OpRev32,
  OpRev64,
  OpXtnHi,
  OpSqxtunHi,
  OpQxtnHi,
  OpFcvtnHi,
  OpFcvtlHi,
  OpFcvtxnHi,
  OpReinterpret,
  OpAddhnHi,
  OpRAddhnHi,
  OpSubhnHi,
  OpRSubhnHi,
  OpAbdl,
  OpAbdlHi,
  OpAba,
  OpAbal,
  OpAbalHi,
  OpQDMullHi,
  OpQDMullHiN,
  OpQDMlalHi,
  OpQDMlalHiN,
  OpQDMlslHi,
  OpQDMlslHiN,
  OpDiv,
  OpLongHi,
  OpNarrowHi,
  OpMovlHi,
  OpCopyLane,
  OpCopyQLane,
  OpCopyLaneQ,
  OpScalarMulLane,
  OpScalarMulLaneQ,
  OpScalarMulXLane,
  OpScalarMulXLaneQ,
  OpScalarVMulXLane,
  OpScalarVMulXLaneQ,
  OpScalarQDMullLane,
  OpScalarQDMullLaneQ,
  OpScalarQDMulHiLane,
  OpScalarQDMulHiLaneQ,
  OpScalarQRDMulHiLane,
  OpScalarQRDMulHiLaneQ
};

/// ClassKind - Categorizes an instruction by how its name suffix and
/// arguments are formed.  The NeonEmitter constructor associates the
/// tablegen instruction classes (SInst, IInst, WInst, ...) with these values
/// through its ClassMap.
enum ClassKind {
  ClassNone,
  ClassI,           // generic integer instruction, e.g., "i8" suffix
  ClassS,           // signed/unsigned/poly, e.g., "s8", "u8" or "p8" suffix
  ClassW,           // width-specific instruction, e.g., "8" suffix
  ClassB,           // bitcast arguments with enum argument to specify type
  ClassL,           // Logical instructions which are op instructions
                    // but we need to not emit any suffix for in our
                    // tests.
  ClassNoTest       // Instructions which we do not test since they are
                    // not TRUE instructions.
};

/// NeonTypeFlags - Packed description of the type of an overloaded Neon
/// builtin: the element type lives in the low bits, with separate bits for
/// "unsigned" and "quad".  The layout must be kept in sync with the flags in
/// include/clang/Basic/TargetBuiltins.h.
namespace {
class NeonTypeFlags {
  // Bit layout of Flags.
  enum {
    EltTypeMask = 0xf,
    UnsignedFlag = 0x10,
    QuadFlag = 0x20
  };
  uint32_t Flags;

public:
  // Element types; the enumerator value is stored directly in the low bits.
  enum EltType {
    Int8,
    Int16,
    Int32,
    Int64,
    Poly8,
    Poly16,
    Poly64,
    Float16,
    Float32,
    Float64
  };

  /// Wrap an already-encoded flag word.
  NeonTypeFlags(unsigned F) : Flags(F) {}

  /// Encode an element type together with its signedness and width bits.
  NeonTypeFlags(EltType ET, bool IsUnsigned, bool IsQuad) : Flags(ET) {
    Flags |= IsUnsigned ? UnsignedFlag : 0;
    Flags |= IsQuad ? QuadFlag : 0;
  }

  /// Return the raw encoded flag word.
  uint32_t getFlags() const { return Flags; }
};
} // end anonymous namespace

namespace {
/// NeonEmitter - TableGen backend state for the Neon emitters.  It holds the
/// record keeper plus two lookup tables built once in the constructor:
/// OpMap (operation name string -> OpKind) and ClassMap (instruction class
/// record -> ClassKind).
class NeonEmitter {
  RecordKeeper &Records;
  StringMap<OpKind> OpMap;
  DenseMap<Record*, ClassKind> ClassMap;

public:
  /// Build the name -> OpKind table and the class record -> ClassKind table
  /// from the records in \p R.
  NeonEmitter(RecordKeeper &R) : Records(R) {
    // Map every "OP_*" operation name onto its OpKind enumerator.
    OpMap["OP_NONE"]  = OpNone;
    OpMap["OP_UNAVAILABLE"] = OpUnavailable;
    OpMap["OP_ADD"]   = OpAdd;
    OpMap["OP_ADDL"]  = OpAddl;
    OpMap["OP_ADDLHi"] = OpAddlHi;
    OpMap["OP_ADDW"]  = OpAddw;
    OpMap["OP_ADDWHi"] = OpAddwHi;
    OpMap["OP_SUB"]   = OpSub;
    OpMap["OP_SUBL"]  = OpSubl;
    OpMap["OP_SUBLHi"] = OpSublHi;
    OpMap["OP_SUBW"]  = OpSubw;
    OpMap["OP_SUBWHi"] = OpSubwHi;
    OpMap["OP_MUL"]   = OpMul;
    OpMap["OP_MLA"]   = OpMla;
    OpMap["OP_MLAL"]  = OpMlal;
    OpMap["OP_MULLHi"]  = OpMullHi;
    OpMap["OP_MULLHi_N"]  = OpMullHiN;
    OpMap["OP_MLALHi"]  = OpMlalHi;
    OpMap["OP_MLALHi_N"]  = OpMlalHiN;
    OpMap["OP_MLS"]   = OpMls;
    OpMap["OP_MLSL"]  = OpMlsl;
    OpMap["OP_MLSLHi"] = OpMlslHi;
    OpMap["OP_MLSLHi_N"] = OpMlslHiN;
    OpMap["OP_MUL_N"] = OpMulN;
    OpMap["OP_MLA_N"] = OpMlaN;
    OpMap["OP_MLS_N"] = OpMlsN;
    OpMap["OP_FMLA_N"] = OpFMlaN;
    OpMap["OP_FMLS_N"] = OpFMlsN;
    OpMap["OP_MLAL_N"] = OpMlalN;
    OpMap["OP_MLSL_N"] = OpMlslN;
    OpMap["OP_MUL_LN"]= OpMulLane;
    OpMap["OP_MULX_LN"]= OpMulXLane;
    OpMap["OP_MULL_LN"] = OpMullLane;
    OpMap["OP_MULLHi_LN"] = OpMullHiLane;
    OpMap["OP_MLA_LN"]= OpMlaLane;
    OpMap["OP_MLS_LN"]= OpMlsLane;
    OpMap["OP_MLAL_LN"] = OpMlalLane;
    OpMap["OP_MLALHi_LN"] = OpMlalHiLane;
    OpMap["OP_MLSL_LN"] = OpMlslLane;
    OpMap["OP_MLSLHi_LN"] = OpMlslHiLane;
    OpMap["OP_QDMULL_LN"] = OpQDMullLane;
    OpMap["OP_QDMULLHi_LN"] = OpQDMullHiLane;
    OpMap["OP_QDMLAL_LN"] = OpQDMlalLane;
    OpMap["OP_QDMLALHi_LN"] = OpQDMlalHiLane;
    OpMap["OP_QDMLSL_LN"] = OpQDMlslLane;
    OpMap["OP_QDMLSLHi_LN"] = OpQDMlslHiLane;
    OpMap["OP_QDMULH_LN"] = OpQDMulhLane;
    OpMap["OP_QRDMULH_LN"] = OpQRDMulhLane;
    OpMap["OP_FMS_LN"] = OpFMSLane;
    OpMap["OP_FMS_LNQ"] = OpFMSLaneQ;
    OpMap["OP_TRN1"]  = OpTrn1;
    OpMap["OP_ZIP1"]  = OpZip1;
    OpMap["OP_UZP1"]  = OpUzp1;
    OpMap["OP_TRN2"]  = OpTrn2;
    OpMap["OP_ZIP2"]  = OpZip2;
    OpMap["OP_UZP2"]  = OpUzp2;
    OpMap["OP_EQ"]    = OpEq;
    OpMap["OP_GE"]    = OpGe;
    OpMap["OP_LE"]    = OpLe;
    OpMap["OP_GT"]    = OpGt;
    OpMap["OP_LT"]    = OpLt;
    OpMap["OP_NEG"]   = OpNeg;
    OpMap["OP_NOT"]   = OpNot;
    OpMap["OP_AND"]   = OpAnd;
    OpMap["OP_OR"]    = OpOr;
    OpMap["OP_XOR"]   = OpXor;
    OpMap["OP_ANDN"]  = OpAndNot;
    OpMap["OP_ORN"]   = OpOrNot;
    OpMap["OP_CAST"]  = OpCast;
    OpMap["OP_CONC"]  = OpConcat;
    OpMap["OP_HI"]    = OpHi;
    OpMap["OP_LO"]    = OpLo;
    OpMap["OP_DUP"]   = OpDup;
    OpMap["OP_DUP_LN"] = OpDupLane;
    OpMap["OP_SEL"]   = OpSelect;
    OpMap["OP_REV16"] = OpRev16;
    OpMap["OP_REV32"] = OpRev32;
    OpMap["OP_REV64"] = OpRev64;
    OpMap["OP_XTN"] = OpXtnHi;
    OpMap["OP_SQXTUN"] = OpSqxtunHi;
    OpMap["OP_QXTN"] = OpQxtnHi;
    OpMap["OP_VCVT_NA_HI"] = OpFcvtnHi;
    OpMap["OP_VCVT_EX_HI"] = OpFcvtlHi;
    OpMap["OP_VCVTX_HI"] = OpFcvtxnHi;
    OpMap["OP_REINT"] = OpReinterpret;
    OpMap["OP_ADDHNHi"] = OpAddhnHi;
    OpMap["OP_RADDHNHi"] = OpRAddhnHi;
    OpMap["OP_SUBHNHi"] = OpSubhnHi;
    OpMap["OP_RSUBHNHi"] = OpRSubhnHi;
    OpMap["OP_ABDL"]  = OpAbdl;
    OpMap["OP_ABDLHi"] = OpAbdlHi;
    OpMap["OP_ABA"]   = OpAba;
    OpMap["OP_ABAL"]  = OpAbal;
    OpMap["OP_ABALHi"] = OpAbalHi;
    OpMap["OP_QDMULLHi"] = OpQDMullHi;
    OpMap["OP_QDMULLHi_N"] = OpQDMullHiN;
    OpMap["OP_QDMLALHi"] = OpQDMlalHi;
    OpMap["OP_QDMLALHi_N"] = OpQDMlalHiN;
    OpMap["OP_QDMLSLHi"] = OpQDMlslHi;
    OpMap["OP_QDMLSLHi_N"] = OpQDMlslHiN;
    OpMap["OP_DIV"] = OpDiv;
    OpMap["OP_LONG_HI"] = OpLongHi;
    OpMap["OP_NARROW_HI"] = OpNarrowHi;
    OpMap["OP_MOVL_HI"] = OpMovlHi;
    OpMap["OP_COPY_LN"] = OpCopyLane;
    OpMap["OP_COPYQ_LN"] = OpCopyQLane;
    OpMap["OP_COPY_LNQ"] = OpCopyLaneQ;
    OpMap["OP_SCALAR_MUL_LN"]= OpScalarMulLane;
    OpMap["OP_SCALAR_MUL_LNQ"]= OpScalarMulLaneQ;
    OpMap["OP_SCALAR_MULX_LN"]= OpScalarMulXLane;
    OpMap["OP_SCALAR_MULX_LNQ"]= OpScalarMulXLaneQ;
    OpMap["OP_SCALAR_VMULX_LN"]= OpScalarVMulXLane;
    OpMap["OP_SCALAR_VMULX_LNQ"]= OpScalarVMulXLaneQ;
    OpMap["OP_SCALAR_QDMULL_LN"] = OpScalarQDMullLane;
    OpMap["OP_SCALAR_QDMULL_LNQ"] = OpScalarQDMullLaneQ;
    OpMap["OP_SCALAR_QDMULH_LN"] = OpScalarQDMulHiLane;
    OpMap["OP_SCALAR_QDMULH_LNQ"] = OpScalarQDMulHiLaneQ;
    OpMap["OP_SCALAR_QRDMULH_LN"] = OpScalarQRDMulHiLane;
    OpMap["OP_SCALAR_QRDMULH_LNQ"] = OpScalarQRDMulHiLaneQ;


    // Look up the tablegen instruction classes by name.
    // NOTE(review): the results are used as map keys without a null check;
    // presumably every class is guaranteed to exist in the input - confirm.
    Record *SI = R.getClass("SInst");
    Record *II = R.getClass("IInst");
    Record *WI = R.getClass("WInst");
    Record *SOpI = R.getClass("SOpInst");
    Record *IOpI = R.getClass("IOpInst");
    Record *WOpI = R.getClass("WOpInst");
    Record *LOpI = R.getClass("LOpInst");
    Record *NoTestOpI = R.getClass("NoTestOpInst");

    // The "Op" variants share the ClassKind of their plain counterparts;
    // LOpInst and NoTestOpInst get kinds of their own.
    ClassMap[SI] = ClassS;
    ClassMap[II] = ClassI;
    ClassMap[WI] = ClassW;
    ClassMap[SOpI] = ClassS;
    ClassMap[IOpI] = ClassI;
    ClassMap[WOpI] = ClassW;
    ClassMap[LOpI] = ClassL;
    ClassMap[NoTestOpI] = ClassNoTest;
  }

  // run - Emit arm_neon.h.inc
  void run(raw_ostream &o);

  // runHeader - Emit all the __builtin prototypes used in arm_neon.h
  void runHeader(raw_ostream &o);

  // runTests - Emit tests for all the Neon intrinsics.
  void runTests(raw_ostream &o);

private:
  // Helpers used by the entry points above; their definitions are not part
  // of the class body.
  void emitIntrinsic(raw_ostream &OS, Record *R,
                     StringMap<ClassKind> &EmittedMap);
  void genBuiltinsDef(raw_ostream &OS, StringMap<ClassKind> &A64IntrinsicMap,
                      bool isA64GenBuiltinDef);
  void genOverloadTypeCheckCode(raw_ostream &OS,
                                StringMap<ClassKind> &A64IntrinsicMap,
                                bool isA64TypeCheck);
  void genIntrinsicRangeCheckCode(raw_ostream &OS,
                                  StringMap<ClassKind> &A64IntrinsicMap,
                                  bool isA64RangeCheck);
  void genTargetTest(raw_ostream &OS, StringMap<OpKind> &EmittedMap,
                     bool isA64TestGen);
};
} // end anonymous namespace

/// ParseTypes - Split a type string such as "fQf" into one StringRef per
/// declared type and append them to \p TV.  Each piece consists of any
/// leading modifier letters ('P', 'Q', 'U', 'H', 'S') followed by exactly one
/// base type letter, so "fQf" yields "f" and "Qf" (2xfloat and 4xfloat).
/// The StringRefs point into \p s.  Any other letter is reported as a fatal
/// error at the location of \p r.
static void ParseTypes(Record *r, std::string &s,
                       SmallVectorImpl<StringRef> &TV) {
  const char *TokBegin = s.data();
  const char *const End = TokBegin + s.size();

  for (const char *Cur = TokBegin; Cur != End; ++Cur) {
    switch (*Cur) {
      case 'P':
      case 'Q':
      case 'U':
      case 'H':
      case 'S':
        // Modifier: it belongs to the token that is still being collected.
        continue;
      case 'c':
      case 's':
      case 'i':
      case 'l':
      case 'h':
      case 'f':
      case 'd':
        // Base type letter: it terminates the current token.
        break;
      default:
        PrintFatalError(r->getLoc(),
                      "Unexpected letter: " + std::string(Cur, 1));
    }

    TV.push_back(StringRef(TokBegin, Cur - TokBegin + 1));
    TokBegin = Cur + 1;
  }
}

/// Widen - Map a base type code onto the code of the next wider type:
/// 'c' -> 's', 's' -> 'i', 'i' -> 'l', 'h' -> 'f', 'f' -> 'd'.  Any other
/// code is a fatal error.
static char Widen(const char t) {
  static const char From[] = { 'c', 's', 'i', 'h', 'f' };
  static const char To[]   = { 's', 'i', 'l', 'f', 'd' };

  for (unsigned i = 0; i != sizeof(From); ++i)
    if (t == From[i])
      return To[i];

  PrintFatalError("unhandled type in widen!");
}

/// Narrow - Map a base type code onto the code of the next smaller type:
/// 's' -> 'c', 'i' -> 's', 'l' -> 'i', 'f' -> 'h', 'd' -> 'f'.  Any other
/// code is a fatal error.
static char Narrow(const char t) {
  static const char From[] = { 's', 'i', 'l', 'f', 'd' };
  static const char To[]   = { 'c', 's', 'i', 'h', 'f' };

  for (unsigned i = 0; i != sizeof(From); ++i)
    if (t == From[i])
      return To[i];

  PrintFatalError("unhandled type in narrow!");
}

/// GetNarrowTypestr - Return a copy of \p ty in which every integer type
/// letter is replaced by the next narrower one ('s' -> 'c', 'i' -> 's',
/// 'l' -> 'i').  All other characters are copied through unchanged.
static std::string GetNarrowTypestr(StringRef ty)
{
  std::string Narrowed;
  for (StringRef::iterator I = ty.begin(), E = ty.end(); I != E; ++I) {
    const char C = *I;
    if (C == 's')
      Narrowed += 'c';
    else if (C == 'i')
      Narrowed += 's';
    else if (C == 'l')
      Narrowed += 'i';
    else
      Narrowed += C;
  }
  return Narrowed;
}

/// ClassifyType - Return the base type letter of \p ty after skipping its
/// modifier prefix, which is read in the fixed order: optional 'S' (scalar,
/// ignored here), optional 'Q' or 'H' (sets \p quad), optional 'P' (sets
/// \p poly), optional 'U' (sets \p usgn).  The output flags are only ever
/// set to true; callers are expected to initialise them.
static char ClassifyType(StringRef ty, bool &quad, bool &poly, bool &usgn) {
  unsigned Pos = 0;

  // The scalar marker carries no information for this function.
  if (ty[Pos] == 'S')
    ++Pos;

  // Both 'Q' and 'H' mark the type as quad.
  const char Width = ty[Pos];
  if (Width == 'Q' || Width == 'H') {
    quad = true;
    ++Pos;
  }

  if (ty[Pos] == 'P') {
    poly = true;
    ++Pos;
  }

  if (ty[Pos] == 'U') {
    usgn = true;
    ++Pos;
  }

  // Whatever follows the modifiers is the base type letter.
  return ty[Pos];
}

/// ModType - Transform a type code and its modifiers based on a mod code. The
/// mod code definitions may be found at the top of arm_neon.td.
///
/// \param mod   prototype modifier character selecting the transformation.
/// \param type  base type letter to transform.
/// \param quad, poly, usgn, scal, cnst, pntr  modifier flags, updated in
///        place as required by \p mod.
/// \returns the (possibly changed) base type letter.  Unknown mod codes
///          leave the type and all flags untouched.
///
/// Several cases deliberately fall through into the following case; each
/// such place is marked below.
static char ModType(const char mod, char type, bool &quad, bool &poly,
                    bool &usgn, bool &scal, bool &cnst, bool &pntr) {
  switch (mod) {
    case 't':
      // Polynomial types become unsigned; everything else is unchanged.
      if (poly) {
        poly = false;
        usgn = true;
      }
      break;
    case 'b':
      scal = true;
      // fall through: 'b' is the scalar form of 'u'.
    case 'u':
      // Unsigned integer of the same width (float -> int, double -> long).
      usgn = true;
      poly = false;
      if (type == 'f')
        type = 'i';
      if (type == 'd')
        type = 'l';
      break;
    case '$':
      scal = true;
      // fall through: '$' is the scalar form of 'x'.
    case 'x':
      // Signed integer of the same width (float -> int, double -> long).
      usgn = false;
      poly = false;
      if (type == 'f')
        type = 'i';
      if (type == 'd')
        type = 'l';
      break;
    case 'o':
      scal = true;
      type = 'd';
      usgn = false;
      break;
    case 'y':
      scal = true;
      // fall through: 'y' is the scalar form of 'f'.
    case 'f':
      // Force float; a half-float source additionally becomes quad.
      if (type == 'h')
        quad = true;
      type = 'f';
      usgn = false;
      break;
    case 'F':
      type = 'd';
      usgn = false;
      break;
    case 'g':
      quad = false;
      break;
    case 'B':
    case 'C':
    case 'D':
    case 'j':
      quad = true;
      break;
    case 'w':
      type = Widen(type);
      quad = true;
      break;
    case 'n':
      type = Widen(type);
      break;
    case 'i':
      type = 'i';
      scal = true;
      break;
    case 'l':
      type = 'l';
      scal = true;
      usgn = true;
      break;
    case 'z':
      type = Narrow(type);
      scal = true;
      break;
    case 'r':
      type = Widen(type);
      scal = true;
      break;
    case 's':
    case 'a':
      scal = true;
      break;
    case 'k':
      quad = true;
      break;
    case 'c':
      cnst = true;
      // fall through: 'c' is the const form of 'p'.
    case 'p':
      pntr = true;
      scal = true;
      break;
    case 'h':
      // Narrow; a result of half-float is never quad.
      type = Narrow(type);
      if (type == 'h')
        quad = false;
      break;
    case 'q':
      type = Narrow(type);
      quad = true;
      break;
    case 'e':
      type = Narrow(type);
      usgn = true;
      break;
    case 'm':
      type = Narrow(type);
      quad = false;
      break;
    default:
      break;
  }
  return type;
}

/// IsMultiVecProto - Return true if prototype character \p p is one of the
/// multi-vector codes '2', '3', '4', 'B', 'C' or 'D'.
static bool IsMultiVecProto(const char p) {
  switch (p) {
  case '2':
  case '3':
  case '4':
  case 'B':
  case 'C':
  case 'D':
    return true;
  default:
    return false;
  }
}

/// TypeString - For a prototype modifier and a type string, build the name
/// of the corresponding typedef: an optional 'u', the element name, the lane
/// count (omitted for scalars), an optional "xN" for multi-vector modifiers,
/// then "_t", optionally followed by " const" and " *".  For example an
/// unmodified "QUc" produces "uint8x16_t".  The modifiers 'v' and 'i' yield
/// "void" and "int" without looking at the type string.
static std::string TypeString(const char mod, StringRef typestr) {
  switch (mod) {
  case 'v':
    return "void";
  case 'i':
    return "int";
  default:
    break;
  }

  bool quad = false;
  bool poly = false;
  bool usgn = false;
  bool scal = false;
  bool cnst = false;
  bool pntr = false;

  // Decode the type string, then let the modifier adjust type and flags.
  char type = ClassifyType(typestr, quad, poly, usgn);
  type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr);

  // Element name plus the lane-count suffixes for 64-bit and 128-bit vectors.
  const char *EltName = "";
  const char *DoubleLanes = "";
  const char *QuadLanes = "";
  switch (type) {
    case 'c':
      EltName = poly ? "poly8" : "int8";
      DoubleLanes = "x8";
      QuadLanes = "x16";
      break;
    case 's':
      EltName = poly ? "poly16" : "int16";
      DoubleLanes = "x4";
      QuadLanes = "x8";
      break;
    case 'i':
      EltName = "int32";
      DoubleLanes = "x2";
      QuadLanes = "x4";
      break;
    case 'l':
      EltName = (poly && !usgn) ? "poly64" : "int64";
      DoubleLanes = "x1";
      QuadLanes = "x2";
      break;
    case 'h':
      EltName = "float16";
      DoubleLanes = "x4";
      QuadLanes = "x8";
      break;
    case 'f':
      EltName = "float32";
      DoubleLanes = "x2";
      QuadLanes = "x4";
      break;
    case 'd':
      EltName = "float64";
      DoubleLanes = "x1";
      QuadLanes = "x2";
      break;
    default:
      PrintFatalError("unhandled type!");
  }

  SmallString<128> s;
  if (usgn)
    s.push_back('u');
  s += EltName;
  if (!scal)
    s += quad ? QuadLanes : DoubleLanes;

  // Multi-vector modifiers name a struct of 2, 3 or 4 vectors.
  switch (mod) {
  case '2':
  case 'B':
    s += "x2";
    break;
  case '3':
  case 'C':
    s += "x3";
    break;
  case '4':
  case 'D':
    s += "x4";
    break;
  default:
    break;
  }

  // Append _t, finishing the typedef name.
  s += "_t";

  if (cnst)
    s += " const";

  if (pntr)
    s += " *";

  return s.str();
}

/// BuiltinTypeString - For a prototype modifier and a type string, build the
/// clang BuiltinsARM.def prototype code for one argument, or for the return
/// value when \p ret is true.  See the top of clang's Builtins.def for a
/// description of the type strings.  \p ck selects between typed vectors and
/// the generic char-vector encoding (used for ClassB).
static std::string BuiltinTypeString(const char mod, StringRef typestr,
                                     ClassKind ck, bool ret) {
  switch (mod) {
  case 'v':
    return "v"; // void
  case 'i':
    return "i"; // int
  default:
    break;
  }

  bool quad = false;
  bool poly = false;
  bool usgn = false;
  bool scal = false;
  bool cnst = false;
  bool pntr = false;

  // Decode the type string, then let the modifier adjust type and flags.
  char type = ClassifyType(typestr, quad, poly, usgn);
  type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr);

  // All pointers are void* pointers.
  if (pntr) {
    usgn = false;
    poly = false;
    type = 'v';
  }

  // Half-float ('h') is encoded as unsigned short ('s').
  if (type == 'h') {
    type = 's';
    usgn = true;
  }

  // Polynomials are unsigned, and so are non-floating-point scalars of the
  // I and W classes.
  const bool UntypedScalar = (ck == ClassI || ck == ClassW) && scal &&
                             type != 'f' && type != 'd';
  if (poly || UntypedScalar)
    usgn = true;

  if (scal) {
    SmallString<128> s;

    if (usgn)
      s.push_back('U');
    else if (type == 'c')
      s.push_back('S'); // make chars explicitly signed

    if (type == 'l')
      s += "LLi"; // 64-bit long
    else
      s.push_back(type);

    if (cnst)
      s.push_back('C');
    if (pntr)
      s.push_back('*');
    return s.str();
  }

  if (ret) {
    // A return value must be a single type.  Structs of 2, 3 or 4 vectors
    // are returned sret-style through a pointer argument: void result with
    // a void* first argument.
    if (IsMultiVecProto(mod))
      return "vv*";
  } else {
    // Non-return array types are passed as individual vectors.
    switch (mod) {
    case '2':
    case 'B':
      return quad ? "V16ScV16Sc" : "V8ScV8Sc";
    case '3':
    case 'C':
      return quad ? "V16ScV16ScV16Sc" : "V8ScV8ScV8Sc";
    case '4':
    case 'D':
      return quad ? "V16ScV16ScV16ScV16Sc" : "V8ScV8ScV8ScV8Sc";
    default:
      break;
    }
  }

  // Single vector, encoded the same way for arguments and return values.
  // ClassB keeps the generic char vector unless the modifier forces a
  // floating-point vector.
  const bool Typed = ck != ClassB;
  if (mod == 'f' || (Typed && type == 'f'))
    return quad ? "V4f" : "V2f";
  if (mod == 'F' || (Typed && type == 'd'))
    return quad ? "V2d" : "V1d";
  if (Typed) {
    switch (type) {
    case 's':
      return quad ? "V8s" : "V4s";
    case 'i':
      return quad ? "V4i" : "V2i";
    case 'l':
      return quad ? "V2LLi" : "V1LLi";
    default:
      break;
    }
  }

  return quad ? "V16Sc" : "V8Sc";
}

/// InstructionTypeCode - Compute the ARM argument type code (e.g. "s8",
/// "i16", "32", "f64") and the quad status for a type string and ClassKind.
/// \p quad is set by decoding \p typeStr.  \p typeCode is only written for
/// ClassS, ClassI and ClassW; for any other class it is left untouched.
/// For the 'd' base type the code is appended to \p typeCode rather than
/// assigned, and ClassW is a fatal error.  An unknown base type is always a
/// fatal error.
static void InstructionTypeCode(const StringRef &typeStr,
                                const ClassKind ck,
                                bool &quad,
                                std::string &typeCode) {
  bool poly = false;
  bool usgn = false;
  const char type = ClassifyType(typeStr, quad, poly, usgn);

  // Phase 1: describe the base type.
  const char *Bits = "";
  bool IsFloat = false;
  bool MayBePoly = false; // whether a "p<N>" suffix exists for this width
  switch (type) {
  case 'c': Bits = "8";  MayBePoly = true; break;
  case 's': Bits = "16"; MayBePoly = true; break;
  case 'i': Bits = "32"; break;
  case 'l': Bits = "64"; MayBePoly = true; break;
  case 'h': Bits = "16"; IsFloat = true; break;
  case 'f': Bits = "32"; IsFloat = true; break;
  case 'd': Bits = "64"; IsFloat = true; break;
  default:
    PrintFatalError("unhandled type!");
  }

  // Phase 2: pick the prefix letter demanded by the instruction class.
  std::string Code;
  switch (ck) {
  case ClassS:
    if (IsFloat)
      Code = "f";
    else if (poly && MayBePoly)
      Code = "p";
    else
      Code = usgn ? "u" : "s";
    break;
  case ClassI:
    Code = IsFloat ? "f" : "i";
    break;
  case ClassW:
    // Width-only suffix; there is none for double.
    if (type == 'd')
      PrintFatalError("unhandled type!");
    break;
  default:
    // Other classes carry no type code.
    return;
  }
  Code += Bits;

  // The double case appends to the caller's string; all others replace it.
  if (type == 'd')
    typeCode += Code;
  else
    typeCode = Code;
}

/// Insert_BHSD_Suffix - For a scalar type string (one starting with 'S'),
/// return the size letter of its base type: 'b' for 'c', 'h' for 's', 's'
/// for 'i'/'f' and 'd' for 'l'/'d'.  Returns 0 for non-scalar type strings
/// and for base types without a size letter.
static char Insert_BHSD_Suffix(StringRef typestr){
  if (typestr[0] != 'S')
    return 0;

  // Skip the remaining modifiers to reach the base type letter.
  unsigned Pos = 1;
  while (typestr[Pos] == 'Q' || typestr[Pos] == 'H' ||
         typestr[Pos] == 'P' || typestr[Pos] == 'U')
    ++Pos;

  switch (typestr[Pos]) {
  case 'c':
    return 'b';
  case 's':
    return 'h';
  case 'i':
  case 'f':
    return 's';
  case 'l':
  case 'd':
    return 'd';
  default:
    return 0;
  }
}

/// endsWith_xN - Return true if \p name is longer than three characters and
/// ends in "_x2", "_x3" or "_x4".
static bool endsWith_xN(std::string const &name) {
  const size_t Len = name.length();
  if (Len <= 3)
    return false;

  const char Digit = name[Len - 1];
  return name[Len - 3] == '_' && name[Len - 2] == 'x' &&
         (Digit == '2' || Digit == '3' || Digit == '4');
}

/// MangleName - Append a type or width suffix to a base neon function name,
/// and insert a 'q' before the first '_' if the type string contains 'Q'.
/// E.g. turn "vst2_lane" into "vst2q_lane_f32", etc.
/// For scalar type strings (prefix 'S') the matching 'b', 'h', 's' or 'd'
/// letter is inserted before the first '_' as well.  The three vcvt names
/// that already spell out both types are returned unchanged.
static std::string MangleName(const std::string &name, StringRef typestr,
                              ClassKind ck) {
  // These names are already complete.
  if (name == "vcvt_f32_f16" || name == "vcvt_f32_f64" ||
      name == "vcvt_f64_f32")
    return name;

  bool quad = false;
  std::string typeCode;
  InstructionTypeCode(typestr, ck, quad, typeCode);

  std::string s = name;

  if (!typeCode.empty()) {
    const std::string Suffix = "_" + typeCode;
    // A trailing _xN (N = 2,3,4) has to stay last, so the type code goes in
    // front of it.
    if (endsWith_xN(s))
      s.insert(s.length() - 3, Suffix);
    else
      s += Suffix;
  }

  if (ck == ClassB)
    s += "_v";

  // Insert a 'q' before the first '_' character so that it ends up before
  // _lane or _n on vector-scalar operations.
  if (typestr.find("Q") != StringRef::npos)
    s.insert(s.find('_'), "q");

  // Scalar intrinsics get their size letter in the same position.
  const char ins = Insert_BHSD_Suffix(typestr);
  if (ins)
    s.insert(s.find('_'), 1, ins);

  return s;
}

/// PreprocessInstruction - Extract basic facts from an intrinsic name.
///
/// \p Prefix receives \p InstName when that is non-empty, otherwise the part
/// of \p Name before its first '_'.  The three postfix flags record whether
/// the rest of the name contains "_n", "_lane" or "_dup".  \p IsSpecialVCvt
/// is set when the name contains "vcvt" and has a non-empty postfix.
/// \p TBNumber is only written when \p InstName is "vtbl" or "vtbx".
static void PreprocessInstruction(const StringRef &Name,
                                  const std::string &InstName,
                                  std::string &Prefix,
                                  bool &HasNPostfix,
                                  bool &HasLanePostfix,
                                  bool &HasDupPostfix,
                                  bool &IsSpecialVCvt,
                                  size_t &TBNumber) {
  // Instruction name fields from arm_neon.td have the form
  //   <instructionname>_...
  // so everything up to the first '_' is the instruction name.
  const size_t PrefixEnd = Name.find_first_of('_');

  // An explicitly supplied instruction name takes precedence.
  if (InstName.empty())
    Prefix = Name.slice(0, PrefixEnd).str();
  else
    Prefix = InstName;

  const StringRef Postfix = Name.slice(PrefixEnd, Name.size());

  HasNPostfix = Postfix.count("_n") != 0;
  HasLanePostfix = Postfix.count("_lane") != 0;
  HasDupPostfix = Postfix.count("_dup") != 0;
  IsSpecialVCvt = !Postfix.empty() && Name.count("vcvt") != 0;

  if (InstName == "vtbl" || InstName == "vtbx") {
    // For vtblN/vtbxN the last character of the name is the digit N;
    // subtracting 48 (ASCII '0') converts it to its numeric value.
    TBNumber = Name[Name.size() - 1] - 48;
  }
}

/// GenerateRegisterCheckPatternForLoadStores - Append to \p RegisterSuffix a
/// FileCheck pattern matching the register list of a load or store that
/// handles \p Count vectors.
///
/// Quad operations with Count == 3 or 4 get only the braced register list.
/// Every other operation also gets the "[r<N>]" address operand, with an
/// ":<OutTypeCode>" alignment hint for vld1/vst1 dup/lane forms whose type
/// code is not "8".
static void
GenerateRegisterCheckPatternForLoadStores(const StringRef &NameRef,
                                          const std::string& OutTypeCode,
                                          const bool &IsQuad,
                                          const bool &HasDupPostfix,
                                          const bool &HasLanePostfix,
                                          const size_t Count,
                                          std::string &RegisterSuffix) {
  // If N == 3 || N == 4 and we are dealing with a quad instruction, Clang
  // will output a series of v{ld,st}1s, so we have to handle it specially.
  const bool IsSplitQuad = (Count == 3 || Count == 4) && IsQuad;

  // Normal quad operations list a second d-register per element, except for
  // the lane forms.
  const bool PairRegisters = !IsSplitQuad && IsQuad && !HasLanePostfix;

  RegisterSuffix += "{";
  for (size_t i = 0; i < Count; i++) {
    RegisterSuffix += "d{{[0-9]+}}";
    if (HasDupPostfix)
      RegisterSuffix += "[]";
    if (HasLanePostfix)
      RegisterSuffix += "[{{[0-9]+}}]";

    if (PairRegisters) {
      RegisterSuffix += ", d{{[0-9]+}}";
      if (HasDupPostfix)
        RegisterSuffix += "[]";
    }

    if (i + 1 != Count)
      RegisterSuffix += ", ";
  }

  if (IsSplitQuad) {
    RegisterSuffix += "}";
    return;
  }

  RegisterSuffix += "}, [r{{[0-9]+}}";

  // The alignment hint is only included for vld1/vst1 dup/lane instructions
  // whose type code is not "8".
  const bool IsLDSTOne = NameRef.count("vld1") || NameRef.count("vst1");
  if (IsLDSTOne && (HasLanePostfix || HasDupPostfix) && OutTypeCode != "8")
    RegisterSuffix += ":" + OutTypeCode;

  RegisterSuffix += "]";
}

/// HasNPostfixAndScalarArgs - Return true if \p HasNPostfix is set and
/// \p NameRef contains one of the multiply / multiply-accumulate names
/// listed below.
static bool HasNPostfixAndScalarArgs(const StringRef &NameRef,
                                     const bool &HasNPostfix) {
  if (!HasNPostfix)
    return false;

  static const char *const Names[] = {
    "vmla", "vmlal", "vmlsl", "vmull", "vqdmlal",
    "vqdmlsl", "vqdmulh", "vqdmull", "vqrdmulh"
  };

  for (unsigned i = 0; i != sizeof(Names) / sizeof(Names[0]); ++i)
    if (NameRef.count(Names[i]))
      return true;

  return false;
}

/// IsFiveOperandLaneAccumulator - Return true if \p HasLanePostfix is set
/// and \p NameRef contains one of the accumulate / multiply-high names
/// listed below.
///
/// NOTE(review): the "vmul" test additionally requires the name to be
/// exactly three characters long, which a name containing the four-character
/// string "vmul" can never be, so that clause never matches.  It is kept
/// as-is here; confirm whether size() == 4 was intended.
static bool IsFiveOperandLaneAccumulator(const StringRef &NameRef,
                                         const bool &HasLanePostfix) {
  if (!HasLanePostfix)
    return false;

  static const char *const Names[] = {
    "vmla", "vmls", "vmlal", "vmlsl",
    "vqdmlal", "vqdmlsl", "vqdmulh", "vqrdmulh"
  };

  for (unsigned i = 0; i != sizeof(Names) / sizeof(Names[0]); ++i)
    if (NameRef.count(Names[i]))
      return true;

  return NameRef.count("vmul") && NameRef.size() == 3;
}

/// IsSpecialLaneMultiply - Return true for lane multiplies that need special
/// treatment: with \p HasLanePostfix set, either a quad operation whose name
/// contains "vmul" or "mulh", or a non-quad operation whose name contains
/// "mull".
static bool IsSpecialLaneMultiply(const StringRef &NameRef,
                                  const bool &HasLanePostfix,
                                  const bool &IsQuad) {
  if (!HasLanePostfix)
    return false;

  if (IsQuad)
    return NameRef.count("vmul") || NameRef.count("mulh");

  return NameRef.count("mull") != 0;
}

/// NormalizeProtoForRegisterPatternCreation - Reduce the prototype string
/// \p Proto to a short string of register-kind characters appended to
/// \p NormedProto.  The characters produced below are 'd', 'q', 'i' and 'a';
/// prototype characters that are not listed in the switch contribute
/// nothing.  Afterwards a number of instruction-specific fix-ups are applied,
/// selected by substrings of \p Name; at most one fix-up runs, and the order
/// of the checks matters.
///
/// NOTE(review): \p HasDupPostfix is not read anywhere in this function.
static void NormalizeProtoForRegisterPatternCreation(const std::string &Name,
                                                     const std::string &Proto,
                                                     const bool &HasNPostfix,
                                                     const bool &IsQuad,
                                                     const bool &HasLanePostfix,
                                                     const bool &HasDupPostfix,
                                                     std::string &NormedProto) {
  // Handle generic case.
  const StringRef NameRef(Name);
  for (size_t i = 0, end = Proto.size(); i < end; i++) {
    switch (Proto[i]) {
    // These follow the width of the instruction itself.
    case 'u':
    case 'f':
    case 'F':
    case 'd':
    case 's':
    case 'x':
    case 't':
    case 'n':
      NormedProto += IsQuad? 'q' : 'd';
      break;
    // These are always 'q'.
    case 'w':
    case 'k':
      NormedProto += 'q';
      break;
    // These are always 'd'.
    case 'g':
    case 'j':
    case 'h':
    case 'e':
      NormedProto += 'd';
      break;
    // 'a' for lane instructions, otherwise 'i'.
    case 'i':
      NormedProto += HasLanePostfix? 'a' : 'i';
      break;
    // 'a' for lane instructions, the instruction's own width for the _n
    // forms accepted by HasNPostfixAndScalarArgs, otherwise 'i'.
    case 'a':
      if (HasLanePostfix) {
        NormedProto += 'a';
      } else if (HasNPostfixAndScalarArgs(NameRef, HasNPostfix)) {
        NormedProto += IsQuad? 'q' : 'd';
      } else {
        NormedProto += 'i';
      }
      break;
    }
  }

  // Handle Special Cases.
  const bool IsNotVExt = !NameRef.count("vext");
  const bool IsVPADAL = NameRef.count("vpadal");
  const bool Is5OpLaneAccum = IsFiveOperandLaneAccumulator(NameRef,
                                                           HasLanePostfix);
  const bool IsSpecialLaneMul = IsSpecialLaneMultiply(NameRef, HasLanePostfix,
                                                      IsQuad);

  if (IsSpecialLaneMul) {
    // For a special lane multiply, overwrite character 2 with character 3
    // and drop everything from index 3 on, leaving three characters.
    // NOTE(review): this indexes positions 2 and 3 without checking the
    // length - presumably such prototypes always normalize to at least four
    // characters; confirm.
    NormedProto[2] = NormedProto[3];
    NormedProto.erase(3);
  } else if (NormedProto.size() == 4 &&
             NormedProto[0] == NormedProto[1] &&
             IsNotVExt) {
    // If NormedProto.size() == 4 and the first two proto characters are the
    // same, ignore the first.
    NormedProto = NormedProto.substr(1, 3);
  } else if (Is5OpLaneAccum) {
    // If we have a 5 op lane accumulator operation, we take characters 1,2,4
    std::string tmp = NormedProto.substr(1,2);
    tmp += NormedProto[4];
    NormedProto = tmp;
  } else if (IsVPADAL) {
    // If we have VPADAL, keep only the first two characters.
    // NOTE(review): this comment previously read "ignore the first
    // character", which does not match substr(0, 2) - confirm the intent.
    NormedProto = NormedProto.substr(0, 2);
  } else if (NameRef.count("vdup") && NormedProto.size() > 2) {
    // If our instruction is a dup instruction, keep only the first and
    // last characters.
    std::string tmp = "";
    tmp += NormedProto[0];
    tmp += NormedProto[NormedProto.size()-1];
    NormedProto = tmp;
  }
}

/// GenerateRegisterCheckPatterns - Given a bunch of data we have
/// extracted, generate a FileCheck pattern to check that an
/// instruction's arguments are correct.
static void GenerateRegisterCheckPattern(const std::string &Name,
                                         const std::string &Proto,
                                         const std::string &OutTypeCode,
                                         const bool &HasNPostfix,
                                         const bool &IsQuad,
                                         const bool &HasLanePostfix,
                                         const bool &HasDupPostfix,
                                         const size_t &TBNumber,
                                         std::string &RegisterSuffix) {

  RegisterSuffix = "";

  const StringRef NameRef(Name);

  if ((NameRef.count("vdup") || NameRef.count("vmov")) && HasNPostfix) {
    return;
  }

  const bool IsLoadStore = NameRef.count("vld") || NameRef.count("vst");
  const bool IsTBXOrTBL = NameRef.count("vtbl") || NameRef.count("vtbx");

  if (IsLoadStore) {
    // Grab N value from  v{ld,st}N using its ascii representation.
    const size_t Count = NameRef[3] - 48;

    GenerateRegisterCheckPatternForLoadStores(NameRef, OutTypeCode, IsQuad,
                                              HasDupPostfix, HasLanePostfix,
                                              Count, RegisterSuffix);
  } else if (IsTBXOrTBL) {
    RegisterSuffix += "d{{[0-9]+}}, {";
    for (size_t i = 0; i < TBNumber-1; i++) {
      RegisterSuffix += "d{{[0-9]+}}, ";
    }
    RegisterSuffix += "d{{[0-9]+}}}, d{{[0-9]+}}";
  } else {
    // Handle a normal instruction.
    if (NameRef.count("vget") || NameRef.count("vset"))
      return;

    // We first normalize our proto, since we only need to emit 4
    // different types of checks, yet have more than 4 proto types
    // that map onto those 4 patterns.
    std::string NormalizedProto("");
    NormalizeProtoForRegisterPatternCreation(Name, Proto, HasNPostfix, IsQuad,
                                             HasLanePostfix, HasDupPostfix,
                                             NormalizedProto);

    for (size_t i = 0, end = NormalizedProto.size(); i < end; i++) {
      const char &c = NormalizedProto[i];
      switch (c) {
      case 'q':
        RegisterSuffix += "q{{[0-9]+}}, ";
        break;

      case 'd':
        RegisterSuffix += "d{{[0-9]+}}, ";
        break;

      case 'i':
        RegisterSuffix += "#{{[0-9]+}}, ";
        break;

      case 'a':
        RegisterSuffix += "d{{[0-9]+}}[{{[0-9]}}], ";
        break;
      }
    }

    // Remove extra ", ".
    RegisterSuffix = RegisterSuffix.substr(0, RegisterSuffix.size()-2);
  }
}

/// GenerateChecksForIntrinsic - Append to Result the FileCheck pattern(s)
/// expected for the intrinsic described by Name / Proto / OutTypeStr / Ck.
///
/// Result is a vector rather than a single string because some intrinsics
/// (e.g. f32 vmla/vmls) are checked against more than one instruction.
/// Nothing is appended for ClassNoTest intrinsics, nor for 64-bit
/// non-quad vdup/vext.
static void GenerateChecksForIntrinsic(const std::string &Name,
                                       const std::string &Proto,
                                       StringRef &OutTypeStr,
                                       StringRef &InTypeStr,
                                       ClassKind Ck,
                                       const std::string &InstName,
                                       bool IsHiddenLOp,
                                       std::vector<std::string>& Result) {
  // ClassNoTest intrinsics never get a check line.
  if (Ck == ClassNoTest)
    return;

  // vcvt_f32_f16 is checked with a fixed mnemonic and no operands.
  if (Name == "vcvt_f32_f16") {
    Result.push_back("vcvt.f32.f16");
    return;
  }

  // Gather everything the special cases below need.
  const StringRef NameRef(Name);
  std::string Prefix;          // Instruction prefix.
  std::string OutTypeCode;     // Type code derived from OutTypeStr.
  std::string RegisterSuffix;  // Operand pattern.
  bool IsQuad = false;         // Quad-register instruction.
  bool HasNPostfix = false;    // Name has the form <instructionname>_n.
  bool HasLanePostfix = false; // Name has the form <instructionname>_lane.
  bool HasDupPostfix = false;  // Name has the form <instructionname>_dup.
  bool IsSpecialVCvt = false;  // vcvt instruction needing special handling.
  size_t TBNumber = -1;        // N for vtblN / vtbxN instructions.

  PreprocessInstruction(NameRef, InstName, Prefix,
                        HasNPostfix, HasLanePostfix, HasDupPostfix,
                        IsSpecialVCvt, TBNumber);

  InstructionTypeCode(OutTypeStr, Ck, IsQuad, OutTypeCode);
  GenerateRegisterCheckPattern(Name, Proto, OutTypeCode, HasNPostfix, IsQuad,
                               HasLanePostfix, HasDupPostfix, TBNumber,
                               RegisterSuffix);

  // Every early return below is a special case.

  // Logical instructions (explicit or hidden) carry no type postfix. vmov
  // is treated the same way because the emitted form varies between the
  // intrinsics of a single instruction type (e.g. vmov vs. vmov.8).
  //
  // FIXME: Maybe something can be done about vmov. The two cases that we
  // care about are vmov as an LType and vmov as a WType.
  if (Ck == ClassL || IsHiddenLOp || Prefix == "vmov") {
    Result.push_back(Prefix + " " + RegisterSuffix);
    return;
  }

  if (OutTypeCode == "64") {
    // A 64-bit vdup/vext on a uint64x1_t is optimized away, so there is
    // nothing to check; for uint64x2_t (quad) a vdup or vmov is expected.
    if (Prefix == "vdup" || Prefix == "vext") {
      if (IsQuad)
        Result.push_back("{{vmov|vdup}}");
      return;
    }

    // v{st,ld}{2,3,4}_{u,s}64 emit v{st,ld}1.64 instructions with
    // multiple register operands.
    const bool IsMultiLoad =
        Prefix == "vld2" || Prefix == "vld3" || Prefix == "vld4";
    const bool IsMultiStore =
        Prefix == "vst2" || Prefix == "vst3" || Prefix == "vst4";
    if (IsMultiLoad || IsMultiStore) {
      Result.push_back(Name.substr(0, 3) + "1.64");
      return;
    }

    // v{st,ld}1_{lane,dup}_{u64,s64} are lowered to vldr/vstr/vmov/str/ldr
    // rather than to the named instruction.
    if (HasLanePostfix || HasDupPostfix) {
      if (Prefix == "vst1") {
        Result.push_back("{{str|vstr|vmov}}");
        return;
      }
      if (Prefix == "vld1") {
        Result.push_back("{{ldr|vldr|vmov}}");
        return;
      }
    }
  }

  // vzip.32/vuzp.32 are the same instruction as vtrn.32 and are sometimes
  // disassembled as vtrn.32, so accept either spelling.
  if (OutTypeCode == "32" && (Prefix == "vzip" || Prefix == "vuzp")) {
    Result.push_back("{{vtrn|" + Prefix + "}}.32 " + RegisterSuffix);
    return;
  }

  // Currently on most ARM processors vmla/vmls are not used for f32;
  // vmul followed by vadd/vsub is emitted instead, so check for that pair.
  if (OutTypeCode == "f32" && (Prefix == "vmls" || Prefix == "vmla")) {
    const char *FollowUp = (Prefix == "vmls") ? "vsub." : "vadd.";
    Result.push_back("vmul." + OutTypeCode + " " + RegisterSuffix);
    Result.push_back(FollowUp + OutTypeCode);
    return;
  }

  // For vcvt the input type is the text after the last '_' in the
  // intrinsic name; it is placed before the output type.
  if (Prefix == "vcvt") {
    const std::string InTypeCode = Name.substr(Name.find_last_of('_') + 1);
    Prefix += "." + InTypeCode;
  }

  // Append the output type code to obtain the final mangled instruction.
  Prefix += "." + OutTypeCode;

  Result.push_back(Prefix + " " + RegisterSuffix);
}

/// UseMacro - Decide from the prototype string whether the intrinsic has to
/// be emitted as a preprocessor macro rather than an inline function.
///
/// A macro is required when the prototype contains:
///   'i'        - an immediate argument, so that SemaChecking can range
///                check the value passed by the user;
///   'p' or 'c' - a pointer argument, so that aligned attributes on the
///                pointer type are not hidden.
static bool UseMacro(const std::string &proto) {
  return proto.find_first_of("ipc") != std::string::npos;
}

/// MacroArgUsedDirectly - For an intrinsic defined as a macro, report
/// whether argument i is referenced as-is instead of first being copied
/// into a local temporary. That is the case for constant ints ('i'),
/// pointers ('p') and const pointers ('c').
static bool MacroArgUsedDirectly(const std::string &proto, unsigned i) {
  switch (proto[i]) {
  case 'i':
  case 'p':
  case 'c':
    return true;
  default:
    return false;
  }
}

// Generate the string "(argtype a, argtype b, ...)"
static std::string GenArgs(const std::string &proto, StringRef typestr,
                           const std::string &name) {
  bool define = UseMacro(proto);
  char arg = 'a';

  std::string s;
  s += "(";

  for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
    if (define) {
      // Some macro arguments are used directly instead of being assigned
      // to local temporaries; prepend an underscore prefix to make their
      // names consistent with the local temporaries.
      if (MacroArgUsedDirectly(proto, i))
        s += "__";
    } else {
      s += TypeString(proto[i], typestr) + " __";
    }
    s.push_back(arg);
    //To avoid argument being multiple defined, add extra number for renaming.
    if (name == "vcopy_lane" || name == "vcopy_laneq")
      s.push_back('1');
    if ((i + 1) < e)
      s += ", ";
  }

  s += ")";
  return s;
}

// Macro arguments are not type-checked the way inline function arguments
// are, so emit "<type> __<arg> = (<arg>); " for each one to get the right
// type checking. Arguments that are used directly (immediates and pointers)
// are skipped: a temporary for an immediate would defeat the whole point of
// using a macro. If at least one local was emitted, a line continuation
// ("\\\n  ") is appended.
static std::string GenMacroLocals(const std::string &proto, StringRef typestr,
                                  const std::string &name ) {
  // vcopy_lane / vcopy_laneq argument names carry an extra '1'.
  const bool appendOne = (name == "vcopy_lane" || name == "vcopy_laneq");

  std::string locals;
  bool emittedAny = false;
  char letter = 'a';

  for (unsigned idx = 1, count = proto.size(); idx != count; ++idx, ++letter) {
    if (MacroArgUsedDirectly(proto, idx))
      continue;
    emittedAny = true;

    std::string argName(1, letter);
    if (appendOne)
      argName.push_back('1');

    locals += TypeString(proto[idx], typestr) + " __";
    locals += argName;
    locals += " = (";
    locals += argName;
    locals += "); ";
  }

  if (emittedAny)
    locals += "\\\n  ";
  return locals;
}

// Use the vmovl builtin to sign-extend or zero-extend a vector. When h is
// set, the "vmovl_high" name is mangled instead of "vmovl".
static std::string Extend(StringRef typestr, const std::string &a, bool h=0) {
  std::string builtin = "vmovl";
  if (h)
    builtin += "_high";
  return MangleName(builtin, typestr, ClassS) + "(" + a + ")";
}

// Get the high 64-bit part of a vector: emits a call to the mangled
// vget_high intrinsic with a as its only argument.
static std::string GetHigh(const std::string &a, StringRef typestr) {
  return MangleName("vget_high", typestr, ClassS) + "(" + a + ")";
}

// Emit a call to the two-operand operation op in which both operands are
// replaced by their high 64-bit halves. The result ends with ";".
static std::string Gen2OpWith2High(StringRef typestr,
                                   const std::string &op,
                                   const std::string &a,
                                   const std::string &b) {
  const std::string highA = GetHigh(a, typestr);
  const std::string highB = GetHigh(b, typestr);
  std::string call = MangleName(op, typestr, ClassS);
  call += "(";
  call += highA;
  call += ", ";
  call += highB;
  call += ");";
  return call;
}

// Emit a call to the three-operand operation op. The first operand is
// passed through unchanged; the latter two are replaced by their high
// 64-bit halves. The result ends with ";".
static std::string Gen3OpWith2High(StringRef typestr,
                                   const std::string &op,
                                   const std::string &a,
                                   const std::string &b,
                                   const std::string &c) {
  const std::string highB = GetHigh(b, typestr);
  const std::string highC = GetHigh(c, typestr);
  std::string call = MangleName(op, typestr, ClassS);
  call += "(";
  call += a;
  call += ", ";
  call += highB;
  call += ", ";
  call += highC;
  call += ");";
  return call;
}

// Emit a vcombine call that puts a in the low 64 bits and b in the high
// 64 bits. No trailing ";" is added.
static std::string GenCombine(std::string typestr,
                              const std::string &a,
                              const std::string &b) {
  std::string call = MangleName("vcombine", typestr, ClassS);
  call += "(";
  call += a;
  call += ", ";
  call += b;
  call += ")";
  return call;
}

// Emit a vector literal "(<vector type>){ a, a, ..., a }" containing nElts
// copies of the expression a. The vector type is TypeString('d', typestr).
static std::string Duplicate(unsigned nElts, StringRef typestr,
                             const std::string &a) {
  std::string literal = "(" + TypeString('d', typestr) + "){ ";

  for (unsigned idx = 0; idx != nElts; ++idx) {
    if (idx != 0)
      literal += ", ";
    literal += a;
  }

  literal += " }";
  return literal;
}

// Emit "__builtin_shufflevector(vec, vec, lane, ..., lane)" with the lane
// expression repeated nElts times.
static std::string SplatLane(unsigned nElts, const std::string &vec,
                             const std::string &lane) {
  const std::string laneArg = ", " + lane;

  std::string call = "__builtin_shufflevector(";
  call += vec;
  call += ", ";
  call += vec;
  for (unsigned remaining = nElts; remaining != 0; --remaining)
    call += laneArg;
  call += ")";
  return call;
}

// Return name with the first "_high" removed, e.g. "x_high_y" -> "x_y".
// The name must contain "_high_"; otherwise a fatal error is reported.
static std::string RemoveHigh(const std::string &name) {
  const std::string::size_type pos = name.find("_high_");
  if (pos == std::string::npos)
    PrintFatalError("name should contain \"_high_\" for high intrinsics");

  // Erase only the five characters of "_high"; the underscore that follows
  // is kept as the separator.
  std::string stripped(name);
  stripped.erase(pos, 5);
  return stripped;
}

// Return the number of vector elements for typestr and report through quad
// whether ClassifyType flagged the type as quad. The per-type base count is
// doubled for quad types. An unrecognized type character is a fatal error.
static unsigned GetNumElements(StringRef typestr, bool &quad) {
  quad = false;
  bool ignored = false;
  const char kind = ClassifyType(typestr, quad, ignored, ignored);

  unsigned count = 0;
  switch (kind) {
  case 'c':
    count = 8;
    break;
  case 's':
  case 'h':
    count = 4;
    break;
  case 'i':
  case 'f':
    count = 2;
    break;
  case 'l':
  case 'd':
    count = 1;
    break;
  default:
    PrintFatalError("unhandled type!");
  }

  return quad ? count * 2 : count;
}

// Generate the definition for this intrinsic, e.g. "a + b" for OpAdd.
static std::string GenOpString(const std::string &name, OpKind op,
                               const std::string &proto, StringRef typestr) {
  bool quad;
  unsigned nElts = GetNumElements(typestr, quad);
  bool define = UseMacro(proto);

  std::string ts = TypeString(proto[0], typestr);
  std::string s;
  if (!define) {
    s = "return ";
  }

  switch(op) {
  case OpAdd:
    s += "__a + __b;";
    break;
  case OpAddl:
    s += Extend(typestr, "__a") + " + " + Extend(typestr, "__b") + ";";
    break;
  case OpAddlHi:
    s += Extend(typestr, "__a", 1) + " + " + Extend(typestr, "__b", 1) + ";";
    break;
  case OpAddw:
    s += "__a + " + Extend(typestr, "__b") + ";";
    break;
  case OpAddwHi:
    s += "__a + " + Extend(typestr, "__b", 1) + ";";
    break;
  case OpSub:
    s += "__a - __b;";
    break;
  case OpSubl:
    s += Extend(typestr, "__a") + " - " + Extend(typestr, "__b") + ";";
    break;
  case OpSublHi:
    s += Extend(typestr, "__a", 1) + " - " + Extend(typestr, "__b", 1) + ";";
    break;
  case OpSubw:
    s += "__a - " + Extend(typestr, "__b") + ";";
    break;
  case OpSubwHi:
    s += "__a - " + Extend(typestr, "__b", 1) + ";";
    break;
  case OpMulN:
    s += "__a * " + Duplicate(nElts, typestr, "__b") + ";";
    break;
  case OpMulLane:
    s += "__a * " + SplatLane(nElts, "__b", "__c") + ";";
    break;
  case OpMulXLane:
    s += MangleName("vmulx", typestr, ClassS) + "(__a, " +
      SplatLane(nElts, "__b", "__c") + ");";
    break;
  case OpMul:
    s += "__a * __b;";
    break;
  case OpFMlaN:
    s += MangleName("vfma", typestr, ClassS);
    s += "(__a, __b, " + Duplicate(nElts,typestr, "__c") + ");";
    break;
  case OpFMlsN:
    s += MangleName("vfms", typestr, ClassS);
    s += "(__a, __b, " + Duplicate(nElts,typestr, "__c") + ");";
    break;
  case OpMullLane:
    s += MangleName("vmull", typestr, ClassS) + "(__a, " +
      SplatLane(nElts, "__b", "__c") + ");";
    break;
  case OpMullHiLane:
    s += MangleName("vmull", typestr, ClassS) + "(" +
      GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");";
    break;
  case OpMlaN:
    s += "__a + (__b * " + Duplicate(nElts, typestr, "__c") + ");";
    break;
  case OpMlaLane:
    s += "__a + (__b * " + SplatLane(nElts, "__c", "__d") + ");";
    break;
  case OpMla:
    s += "__a + (__b * __c);";
    break;
  case OpMlalN:
    s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, " +
      Duplicate(nElts, typestr, "__c") + ");";
    break;
  case OpMlalLane:
    s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, " +
      SplatLane(nElts, "__c", "__d") + ");";
    break;
  case OpMlalHiLane:
    s += "__a + " + MangleName("vmull", typestr, ClassS) + "(" +
      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
    break;
  case OpMlal:
    s += "__a + " + MangleName("vmull", typestr, ClassS) + "(__b, __c);";
    break;
  case OpMullHi:
    s += Gen2OpWith2High(typestr, "vmull", "__a", "__b");
    break;
  case OpMullHiN:
    s += MangleName("vmull_n", typestr, ClassS);
    s += "(" + GetHigh("__a", typestr) + ", __b);";
    return s;
  case OpMlalHi:
    s += Gen3OpWith2High(typestr, "vmlal", "__a", "__b", "__c");
    break;
  case OpMlalHiN:
    s += MangleName("vmlal_n", typestr, ClassS);
    s += "(__a, " + GetHigh("__b", typestr) + ", __c);";
    return s;
  case OpMlsN:
    s += "__a - (__b * " + Duplicate(nElts, typestr, "__c") + ");";
    break;
  case OpMlsLane:
    s += "__a - (__b * " + SplatLane(nElts, "__c", "__d") + ");";
    break;
  case OpFMSLane:
    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
    s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n  ";
    s += MangleName("vfma_lane", typestr, ClassS) + "(__a1, __b1, -__c1, __d);";
    break;
  case OpFMSLaneQ:
    s += TypeString(proto[1], typestr) + " __a1 = __a; \\\n  ";
    s += TypeString(proto[2], typestr) + " __b1 = __b; \\\n  ";
    s += TypeString(proto[3], typestr) + " __c1 = __c; \\\n  ";
    s += MangleName("vfma_laneq", typestr, ClassS) + "(__a1, __b1, -__c1, __d);";
    break;
  case OpMls:
    s += "__a - (__b * __c);";
    break;
  case OpMlslN:
    s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, " +
      Duplicate(nElts, typestr, "__c") + ");";
    break;
  case OpMlslLane:
    s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, " +
      SplatLane(nElts, "__c", "__d") + ");";
    break;
  case OpMlslHiLane:
    s += "__a - " + MangleName("vmull", typestr, ClassS) + "(" +
      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
    break;
  case OpMlsl:
    s += "__a - " + MangleName("vmull", typestr, ClassS) + "(__b, __c);";
    break;
  case OpMlslHi:
    s += Gen3OpWith2High(typestr, "vmlsl", "__a", "__b", "__c");
    break;
  case OpMlslHiN:
    s += MangleName("vmlsl_n", typestr, ClassS);
    s += "(__a, " + GetHigh("__b", typestr) + ", __c);";
    break;
  case OpQDMullLane:
    s += MangleName("vqdmull", typestr, ClassS) + "(__a, " +
      SplatLane(nElts, "__b", "__c") + ");";
    break;
  case OpQDMullHiLane:
    s += MangleName("vqdmull", typestr, ClassS) + "(" +
      GetHigh("__a", typestr) + ", " + SplatLane(nElts, "__b", "__c") + ");";
    break;
  case OpQDMlalLane:
    s += MangleName("vqdmlal", typestr, ClassS) + "(__a, __b, " +
      SplatLane(nElts, "__c", "__d") + ");";
    break;
  case OpQDMlalHiLane:
    s += MangleName("vqdmlal", typestr, ClassS) + "(__a, " +
      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
    break;
  case OpQDMlslLane:
    s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, __b, " +
      SplatLane(nElts, "__c", "__d") + ");";
    break;
  case OpQDMlslHiLane:
    s += MangleName("vqdmlsl", typestr, ClassS) + "(__a, " +
      GetHigh("__b", typestr) + ", " + SplatLane(nElts, "__c", "__d") + ");";
    break;
  case OpQDMulhLane:
    s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " +
      SplatLane(nElts, "__b", "__c") + ");";
    break;
  case OpQRDMulhLane:
    s += MangleName("vqrdmulh", typestr, ClassS) + "(__a, " +
      SplatLane(nElts, "__b", "__c") + ");";
    break;
  case OpEq:
    s += "(" + ts + ")(__a == __b);";
    break;
  case OpGe:
    s += "(" + ts + ")(__a >= __b);";
    break;
  case OpLe:
    s += "(" + ts + ")(__a <= __b);";
    break;
  case OpGt:
    s += "(" + ts + ")(__a > __b);";
    break;
  case OpLt:
    s += "(" + ts + ")(__a < __b);";
    break;
  case OpNeg:
    s += " -__a;";
    break;
  case OpNot:
    s += " ~__a;";
    break;
  case OpAnd:
    s += "__a & __b;";
    break;
  case OpOr:
    s += "__a | __b;";
    break;
  case OpXor:
    s += "__a ^ __b;";
    break;
  case OpAndNot:
    s += "__a & ~__b;";
    break;
  case OpOrNot:
    s += "__a | ~__b;";
    break;
  case OpCast:
    s += "(" + ts + ")__a;";
    break;
  case OpConcat:
    s += "(" + ts + ")__builtin_shufflevector((int64x1_t)__a";
    s += ", (int64x1_t)__b, 0, 1);";
    break;
  case OpHi:
    // nElts is for the result vector, so the source is twice that number.
    s += "__builtin_shufflevector(__a, __a";
    for (unsigned i = nElts; i < nElts * 2; ++i)
      s += ", " + utostr(i);
    s+= ");";
    break;
  case OpLo:
    s += "__builtin_shufflevector(__a, __a";
    for (unsigned i = 0; i < nElts; ++i)
      s += ", " + utostr(i);
    s+= ");";
    break;
  case OpDup:
    s += Duplicate(nElts, typestr, "__a") + ";";
    break;
  case OpDupLane:
    s += SplatLane(nElts, "__a", "__b") + ";";
    break;
  case OpSelect:
    // ((0 & 1) | (~0 & 2))
    s += "(" + ts + ")";
    ts = TypeString(proto[1], typestr);
    s += "((__a & (" + ts + ")__b) | ";
    s += "(~__a & (" + ts + ")__c));";
    break;
  case OpRev16:
    s += "__builtin_shufflevector(__a, __a";
    for (unsigned i = 2; i <= nElts; i += 2)
      for (unsigned j = 0; j != 2; ++j)
        s += ", " + utostr(i - j - 1);
    s += ");";
    break;
  case OpRev32: {
    unsigned WordElts = nElts >> (1 + (int)quad);
    s += "__builtin_shufflevector(__a, __a";
    for (unsigned i = WordElts; i <= nElts; i += WordElts)
      for (unsigned j = 0; j != WordElts; ++j)
        s += ", " + utostr(i - j - 1);
    s += ");";
    break;
  }
  case OpRev64: {
    unsigned DblWordElts = nElts >> (int)quad;
    s += "__builtin_shufflevector(__a, __a";
    for (unsigned i = DblWordElts; i <= nElts; i += DblWordElts)
      for (unsigned j = 0; j != DblWordElts; ++j)
        s += ", " + utostr(i - j - 1);
    s += ");";
    break;
  }
  case OpXtnHi: {
    s = TypeString(proto[1], typestr) + " __a1 = " +
        MangleName("vmovn", typestr, ClassS) + "(__b);\n  " +
        "return __builtin_shufflevector(__a, __a1";
    for (unsigned i = 0; i < nElts * 4; ++i)
      s += ", " + utostr(i);
    s += ");";
    break;
  }
  case OpSqxtunHi: {
    s = TypeString(proto[1], typestr) + " __a1 = " +
        MangleName("vqmovun", typestr, ClassS) + "(__b);\n  " +
        "return __builtin_shufflevector(__a, __a1";
    for (unsigned i = 0; i < nElts * 4; ++i)
      s += ", " + utostr(i);
    s += ");";
    break;
  }
  case OpQxtnHi: {
    s = TypeString(proto[1], typestr) + " __a1 = " +
        MangleName("vqmovn", typestr, ClassS) + "(__b);\n  " +
        "return __builtin_shufflevector(__a, __a1";
    for (unsigned i = 0; i < nElts * 4; ++i)
      s += ", " + utostr(i);
    s += ");";
    break;
  }
  case OpFcvtnHi: {
    std::string FName = (nElts == 1) ? "vcvt_f32" : "vcvt_f16";
    s = TypeString(proto[1], typestr) + " __a1 = " +
        MangleName(FName, typestr, ClassS) + "(__b);\n  " +
        "return __builtin_shufflevector(__a, __a1";
    for (unsigned i = 0; i < nElts * 4; ++i)
      s += ", " + utostr(i);
    s += ");";
    break;
  }
  case OpFcvtlHi: {
    std::string FName = (nElts == 2) ? "vcvt_f64" : "vcvt_f32";
    s = TypeString('d', typestr) + " __a1 = " + GetHigh("__a", typestr) +
        ";\n  return " + MangleName(FName, typestr, ClassS) + "(__a1);";
    break;
  }
  case OpFcvtxnHi: {
    s = TypeString(proto[1], typestr) + " __a1 = " +
        MangleName("vcvtx_f32", typestr, ClassS) + "(__b);\n  " +
        "return __builtin_shufflevector(__a, __a1";
    for (unsigned i = 0; i < nElts * 4; ++i)
      s += ", " + utostr(i);
    s += ");";
    break;
  }
  case OpUzp1:
    s += "__builtin_shufflevector(__a, __b";
    for (unsigned i = 0; i < nElts; i++)
      s += ", " + utostr(2*i);
    s += ");";
    break;
  case OpUzp2:
    s += "__builtin_shufflevector(__a, __b";
    for (unsigned i = 0; i < nElts; i++)
      s += ", " + utostr(2*i+1);
    s += ");";
    break;
  case OpZip1:
    s += "__builtin_shufflevector(__a, __b";
    for (unsigned i = 0; i < (nElts/2); i++)
       s += ", " + utostr(i) + ", " + utostr(i+nElts);
    s += ");";
    break;
  case OpZip2:
    s += "__builtin_shufflevector(__a, __b";
    for (unsigned i = nElts/2; i < nElts; i++)
       s += ", " + utostr(i) + ", " + utostr(i+nElts);
    s += ");";
    break;
  case OpTrn1:
    s += "__builtin_shufflevector(__a, __b";
    for (unsigned i = 0; i < (nElts/2); i++)
       s += ", " + utostr(2*i) + ", " + utostr(2*i+nElts);
    s += ");";
    break;
  case OpTrn2:
    s += "__builtin_shufflevector(__a, __b";
    for (unsigned i = 0; i < (nElts/2); i++)
       s += ", " + utostr(2*i+1) + ", " + utostr(2*i+1+nElts);
    s += ");";
    break;
  case OpAbdl: {
    std::string abd = MangleName("vabd", typestr, ClassS) + "(__a, __b)";
    if (typestr[0] != 'U') {
      // vabd results are always unsigned and must be zero-extended.
      std::string utype = "U" + typestr.str();
      s += "(" + TypeString(proto[0], typestr) + ")";
      abd = "(" + TypeString('d', utype) + ")" + abd;
      s += Extend(utype, abd) + ";";
    } else {
      s += Extend(typestr, abd) + ";";
    }
    break;
  }
  case OpAbdlHi:
    s += Gen2OpWith2High(typestr, "vabdl", "__a", "__b");
    break;
  case OpAddhnHi: {
    std::string addhn = MangleName("vaddhn", typestr, ClassS) + "(__b, __c)";
    s += GenCombine(GetNarrowTypestr(typestr), "__a", addhn);
    s += ";";
    break;
  }
  case OpRAddhnHi: {
    std::string raddhn = MangleName("vraddhn", typestr, ClassS) + "(__b, __c)";
    s += GenCombine(GetNarrowTypestr(typestr), "__a", raddhn);
    s += ";";
    break;
  }
  case OpSubhnHi: {
    std::string subhn = MangleName("vsubhn", typestr, ClassS) + "(__b, __c)";
    s += GenCombine(GetNarrowTypestr(typestr), "__a", subhn);
    s += ";";
    break;
  }
  case OpRSubhnHi: {
    std::string rsubhn = MangleName("vrsubhn", typestr, ClassS) + "(__b, __c)";
    s += GenCombine(GetNarrowTypestr(typestr), "__a", rsubhn);
    s += ";";
    break;
  }
  case OpAba:
    s += "__a + " + MangleName("vabd", typestr, ClassS) + "(__b, __c);";
    break;
  case OpAbal:
    s += "__a + " + MangleName("vabdl", typestr, ClassS) + "(__b, __c);";
    break;
  case OpAbalHi:
    s += Gen3OpWith2High(typestr, "vabal", "__a", "__b", "__c");
    break;
  case OpQDMullHi:
    s += Gen2OpWith2High(typestr, "vqdmull", "__a", "__b");
    break;
  case OpQDMullHiN:
    s += MangleName("vqdmull_n", typestr, ClassS);
    s += "(" + GetHigh("__a", typestr) + ", __b);";
    return s;
  case OpQDMlalHi:
    s += Gen3OpWith2High(typestr, "vqdmlal", "__a", "__b", "__c");
    break;
  case OpQDMlalHiN:
    s += MangleName("vqdmlal_n", typestr, ClassS);
    s += "(__a, " + GetHigh("__b", typestr) + ", __c);";
    return s;
  case OpQDMlslHi:
    s += Gen3OpWith2High(typestr, "vqdmlsl", "__a", "__b", "__c");
    break;
  case OpQDMlslHiN:
    s += MangleName("vqdmlsl_n", typestr, ClassS);
    s += "(__a, " + GetHigh("__b", typestr) + ", __c);";
    return s;
  case OpDiv:
    s += "__a / __b;";
    break;
  case OpMovlHi: {
    s = TypeString(proto[1], typestr.drop_front()) + " __a1 = " +
        MangleName("vget_high", typestr, ClassS) + "(__a);\n  " + s;
    s += "(" + ts + ")" + MangleName("vshll_n", typestr, ClassS);
    s += "(__a1, 0);";
    break;
  }
  case OpLongHi: {
    // Another local variable __a1 is needed for calling a Macro,
    // or using __a will have naming conflict when Macro expanding.
    s += TypeString(proto[1], typestr.drop_front()) + " __a1 = " +
         MangleName("vget_high", typestr, ClassS) + "(__a); \\\n";
    s += "  (" + ts + ")" + MangleName(RemoveHigh(name), typestr, ClassS) +
         "(__a1, __b);";
    break;
  }
  case OpNarrowHi: {
    s += "(" + ts + ")" + MangleName("vcombine", typestr, ClassS) + "(__a, " +
         MangleName(RemoveHigh(name), typestr, ClassS) + "(__b, __c));";
    break;
  }
  case OpCopyLane: {
    s += TypeString('s', typestr) + " __c2 = " +
         MangleName("vget_lane", typestr, ClassS) + "(__c1, __d1); \\\n  " +
         MangleName("vset_lane", typestr, ClassS) + "(__c2, __a1, __b1);";
    break;
  }
  case OpCopyQLane: {
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += TypeString('s', typestr) + " __c2 = vget_lane_" + typeCode +
         "(__c1, __d1); \\\n  vsetq_lane_" + typeCode + "(__c2, __a1, __b1);";
    break;
  }
  case OpCopyLaneQ: {
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += TypeString('s', typestr) + " __c2 = vgetq_lane_" + typeCode +
         "(__c1, __d1); \\\n  vset_lane_" + typeCode + "(__c2, __a1, __b1);";
    break;
  }
  case OpScalarMulLane: {
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += TypeString('s', typestr) + " __d1 = vget_lane_" + typeCode +
      "(__b, __c);\\\n  __a * __d1;";
    break;
  }
  case OpScalarMulLaneQ: {
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
        s += TypeString('s', typestr) + " __d1 = vgetq_lane_" + typeCode +
          "(__b, __c);\\\n  __a * __d1;";
    break;
  }
  case OpScalarMulXLane: {
    bool dummy = false;
    char type = ClassifyType(typestr, dummy, dummy, dummy);
    if (type == 'f') type = 's';
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += TypeString('s', typestr) + " __d1 = vget_lane_" + typeCode +
      "(__b, __c);\\\n  vmulx" + type + "_" +
      typeCode +  "(__a, __d1);";
    break;
  }
  case OpScalarMulXLaneQ: {
    bool dummy = false;
    char type = ClassifyType(typestr, dummy, dummy, dummy);
    if (type == 'f') type = 's';
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += TypeString('s', typestr) + " __d1 = vgetq_lane_" +
      typeCode + "(__b, __c);\\\n  vmulx" + type +
      "_" + typeCode +  "(__a, __d1);";
    break;
  }

  case OpScalarVMulXLane: {
    bool dummy = false;
    char type = ClassifyType(typestr, dummy, dummy, dummy);
    if (type == 'f') type = 's';
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += TypeString('s', typestr) + " __d1 = vget_lane_" +
      typeCode + "(__a, 0);\\\n" +
      "  " + TypeString('s', typestr) + " __e1 = vget_lane_" +
      typeCode + "(__b, __c);\\\n" +
      "  " + TypeString('s', typestr) + " __f1 = vmulx" + type + "_" +
      typeCode + "(__d1, __e1);\\\n" +
      "  " + TypeString('d', typestr) + " __g1;\\\n" +
      "  vset_lane_" + typeCode + "(__f1, __g1, __c);";
    break;
  }

  case OpScalarVMulXLaneQ: {
    bool dummy = false;
    char type = ClassifyType(typestr, dummy, dummy, dummy);
    if (type == 'f') type = 's';
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += TypeString('s', typestr) + " __d1 = vget_lane_" +
      typeCode + "(__a, 0);\\\n" +
      "  " + TypeString('s', typestr) + " __e1 = vgetq_lane_" +
      typeCode + "(__b, __c);\\\n" +
      "  " + TypeString('s', typestr) + " __f1 = vmulx" + type + "_" +
      typeCode + "(__d1, __e1);\\\n" +
      "  " + TypeString('d', typestr) + " __g1;\\\n" +
      "  vset_lane_" + typeCode + "(__f1, __g1, 0);";
    break;
  }
  case OpScalarQDMullLane: {
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += MangleName("vqdmull", typestr, ClassS) + "(__a, " +
    "vget_lane_" + typeCode + "(b, __c));";
    break;
  }
  case OpScalarQDMullLaneQ: {
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += MangleName("vqdmull", typestr, ClassS) + "(__a, " +
    "vgetq_lane_" + typeCode + "(b, __c));";
    break;
  }
  case OpScalarQDMulHiLane: {
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " +
    "vget_lane_" + typeCode + "(__b, __c));";
    break;
  }
  case OpScalarQDMulHiLaneQ: {
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += MangleName("vqdmulh", typestr, ClassS) + "(__a, " +
    "vgetq_lane_" + typeCode + "(__b, __c));";
    break;
  }
  case OpScalarQRDMulHiLane: {
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += MangleName("vqrdmulh", typestr, ClassS) + "(__a, " +
    "vget_lane_" + typeCode + "(__b, __c));";
    break;
  }
  case OpScalarQRDMulHiLaneQ: {
    std::string typeCode = "";
    InstructionTypeCode(typestr, ClassS, quad, typeCode);
    s += MangleName("vqrdmulh", typestr, ClassS) + "(__a, " +
    "vgetq_lane_" + typeCode + "(__b, __c));";
    break;
  }
  default:
    PrintFatalError("unknown OpKind!");
  }
  return s;
}

/// Compute the NeonTypeFlags value describing the type selected by the
/// prototype \p proto applied to the base type string \p typestr.
///
/// The modifier examined is the return-type character of the prototype,
/// unless that character is 'v', 'f' or 'F', in which case the first
/// argument's modifier is used instead.
static unsigned GetNeonEnum(const std::string &proto, StringRef typestr) {
  const bool useFirstArg =
      proto[0] == 'v' || proto[0] == 'f' || proto[0] == 'F';
  unsigned mod = useFirstArg ? proto[1] : proto[0];

  bool quad = false, poly = false, usgn = false;
  bool scal = false, cnst = false, pntr = false;

  // Classify the base type, then let the modifier adjust type and width.
  char type = ClassifyType(typestr, quad, poly, usgn);
  type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr);

  // Map the resulting type character onto an element-type enumerator.
  NeonTypeFlags::EltType ET;
  if (type == 'c')
    ET = poly ? NeonTypeFlags::Poly8 : NeonTypeFlags::Int8;
  else if (type == 's')
    ET = poly ? NeonTypeFlags::Poly16 : NeonTypeFlags::Int16;
  else if (type == 'i')
    ET = NeonTypeFlags::Int32;
  else if (type == 'l')
    ET = poly ? NeonTypeFlags::Poly64 : NeonTypeFlags::Int64;
  else if (type == 'h')
    ET = NeonTypeFlags::Float16;
  else if (type == 'f')
    ET = NeonTypeFlags::Float32;
  else if (type == 'd')
    ET = NeonTypeFlags::Float64;
  else
    PrintFatalError("unhandled type!");

  // The quad flag is suppressed when the first argument modifier is 'g'.
  const bool quadFlag = quad && proto[1] != 'g';
  return NeonTypeFlags(ET, usgn, quadFlag).getFlags();
}

// Return true if the prototype contains any of the modifier characters
// 's', 'z', 'r', 'b', '$', 'y' or 'o'.
//
// We don't check 'a' in this function, because for builtin function the
// argument matching to 'a' uses a vector type splatted from a scalar type.
//
// The prototype is taken by const reference so that callers do not pay for a
// string copy on every query, and it is scanned once rather than once per
// modifier character.
static bool ProtoHasScalar(const std::string &proto) {
  return proto.find_first_of("szrb$yo") != std::string::npos;
}

// Generate the definition for this intrinsic, e.g. __builtin_neon_cls(a)
static std::string GenBuiltin(const std::string &name, const std::string &proto,
                              StringRef typestr, ClassKind ck) {
  std::string s;

  // If this builtin returns a struct 2, 3, or 4 vectors, pass it as an implicit
  // sret-like argument.
  bool sret = IsMultiVecProto(proto[0]);

  bool define = UseMacro(proto);

  // Check if the prototype has a scalar operand with the type of the vector
  // elements.  If not, bitcasting the args will take care of arg checking.
  // The actual signedness etc. will be taken care of with special enums.
  if (!ProtoHasScalar(proto))
    ck = ClassB;

  if (proto[0] != 'v') {
    std::string ts = TypeString(proto[0], typestr);

    if (define) {
      if (sret)
        s += ts + " r; ";
      else
        s += "(" + ts + ")";
    } else if (sret) {
      s += ts + " r; ";
    } else {
      s += "return (" + ts + ")";
    }
  }

  bool splat = proto.find('a') != std::string::npos;

  s += "__builtin_neon_";
  if (splat) {
    // Call the non-splat builtin: chop off the "_n" suffix from the name.
    std::string vname(name, 0, name.size()-2);
    s += MangleName(vname, typestr, ck);
  } else {
    s += MangleName(name, typestr, ck);
  }
  s += "(";

  // Pass the address of the return variable as the first argument to sret-like
  // builtins.
  if (sret)
    s += "&r, ";

  char arg = 'a';
  for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
    std::string args = std::string(&arg, 1);

    // Use the local temporaries instead of the macro arguments.
    args = "__" + args;

    bool argQuad = false;
    bool argPoly = false;
    bool argUsgn = false;
    bool argScalar = false;
    bool dummy = false;
    char argType = ClassifyType(typestr, argQuad, argPoly, argUsgn);
    argType = ModType(proto[i], argType, argQuad, argPoly, argUsgn, argScalar,
                      dummy, dummy);

    // Handle multiple-vector values specially, emitting each subvector as an
    // argument to the __builtin.
    unsigned NumOfVec = 0;
    if (proto[i] >= '2' && proto[i] <= '4') {
      NumOfVec = proto[i] - '0';
    } else if (proto[i] >= 'B' && proto[i] <= 'D') {
      NumOfVec = proto[i] - 'A' + 1;
    }
    
    if (NumOfVec > 0) {
      // Check if an explicit cast is needed.
      if (argType != 'c' || argPoly || argUsgn)
        args = (argQuad ? "(int8x16_t)" : "(int8x8_t)") + args;

      for (unsigned vi = 0, ve = NumOfVec; vi != ve; ++vi) {
        s += args + ".val[" + utostr(vi) + "]";
        if ((vi + 1) < ve)
          s += ", ";
      }
      if ((i + 1) < e)
        s += ", ";

      continue;
    }

    if (splat && (i + 1) == e)
      args = Duplicate(GetNumElements(typestr, argQuad), typestr, args);

    // Check if an explicit cast is needed.
    if ((splat || !argScalar) &&
        ((ck == ClassB && argType != 'c') || argPoly || argUsgn)) {
      std::string argTypeStr = "c";
      if (ck != ClassB)
        argTypeStr = argType;
      if (argQuad)
        argTypeStr = "Q" + argTypeStr;
      args = "(" + TypeString('d', argTypeStr) + ")" + args;
    }

    s += args;
    if ((i + 1) < e)
      s += ", ";
  }

  // Extra constant integer to hold type class enum for this function, e.g. s8
  if (ck == ClassB)
    s += ", " + utostr(GetNeonEnum(proto, typestr));

  s += ");";

  if (proto[0] != 'v' && sret) {
    if (define)
      s += " r;";
    else
      s += " return r;";
  }
  return s;
}

/// Build the BUILTIN(__builtin_neon_<name>, "<signature>", "n") declaration
/// line for the builtin described by \p name, \p proto and \p typestr.
static std::string GenBuiltinDef(const std::string &name,
                                 const std::string &proto,
                                 StringRef typestr, ClassKind ck) {
  // Without a scalar operand, bitcasting the args takes care of arg checking
  // and the actual signedness etc. is handled through special enums.
  if (!ProtoHasScalar(proto))
    ck = ClassB;

  std::string decl = "BUILTIN(__builtin_neon_";
  decl += MangleName(name, typestr, ck);
  decl += ", \"";

  // One signature fragment per prototype character; index 0 is the return
  // type.
  const unsigned numProto = proto.size();
  for (unsigned idx = 0; idx != numProto; ++idx) {
    const bool isReturn = idx == 0;
    decl += BuiltinTypeString(proto[idx], typestr, ck, isReturn);
  }

  // ClassB builtins carry an extra constant integer for the type class enum,
  // e.g. s8.
  if (ck == ClassB)
    decl += "i";

  decl += "\", \"n\")";
  return decl;
}

static std::string GenIntrinsic(const std::string &name,
                                const std::string &proto,
                                StringRef outTypeStr, StringRef inTypeStr,
                                OpKind kind, ClassKind classKind) {
  assert(!proto.empty() && "");
  bool define = UseMacro(proto) && kind != OpUnavailable;
  std::string s;

  // static always inline + return type
  if (define)
    s += "#define ";
  else
    s += "__ai " + TypeString(proto[0], outTypeStr) + " ";

  // Function name with type suffix
  std::string mangledName = MangleName(name, outTypeStr, ClassS);
  if (outTypeStr != inTypeStr) {
    // If the input type is different (e.g., for vreinterpret), append a suffix
    // for the input type.  String off a "Q" (quad) prefix so that MangleName
    // does not insert another "q" in the name.
    unsigned typeStrOff = (inTypeStr[0] == 'Q' ? 1 : 0);
    StringRef inTypeNoQuad = inTypeStr.substr(typeStrOff);
    mangledName = MangleName(mangledName, inTypeNoQuad, ClassS);
  }
  s += mangledName;

  // Function arguments
  s += GenArgs(proto, inTypeStr, name);

  // Definition.
  if (define) {
    s += " __extension__ ({ \\\n  ";
    s += GenMacroLocals(proto, inTypeStr, name);
  } else if (kind == OpUnavailable) {
    s += " __attribute__((unavailable));\n";
    return s;
  } else
    s += " {\n  ";

  if (kind != OpNone)
    s += GenOpString(name, kind, proto, outTypeStr);
  else
    s += GenBuiltin(name, proto, outTypeStr, classKind);
  if (define)
    s += " })";
  else
    s += " }";
  s += "\n";
  return s;
}

/// run - Read the records in arm_neon.td and output arm_neon.h.  arm_neon.h
/// is comprised of type definitions and function declarations.
///
/// Output order: license banner, include guard and NEON check, scalar
/// typedefs, vector typedefs, struct-of-vectors typedefs, the __ai macro,
/// ARM intrinsics, AArch64 intrinsics, and finally crypto intrinsics.
void NeonEmitter::run(raw_ostream &OS) {
  // License banner.
  OS <<
    "/*===---- arm_neon.h - ARM Neon intrinsics ------------------------------"
    "---===\n"
    " *\n"
    " * Permission is hereby granted, free of charge, to any person obtaining "
    "a copy\n"
    " * of this software and associated documentation files (the \"Software\"),"
    " to deal\n"
    " * in the Software without restriction, including without limitation the "
    "rights\n"
    " * to use, copy, modify, merge, publish, distribute, sublicense, "
    "and/or sell\n"
    " * copies of the Software, and to permit persons to whom the Software is\n"
    " * furnished to do so, subject to the following conditions:\n"
    " *\n"
    " * The above copyright notice and this permission notice shall be "
    "included in\n"
    " * all copies or substantial portions of the Software.\n"
    " *\n"
    " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, "
    "EXPRESS OR\n"
    " * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF "
    "MERCHANTABILITY,\n"
    " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT "
    "SHALL THE\n"
    " * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR "
    "OTHER\n"
    " * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, "
    "ARISING FROM,\n"
    " * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER "
    "DEALINGS IN\n"
    " * THE SOFTWARE.\n"
    " *\n"
    " *===--------------------------------------------------------------------"
    "---===\n"
    " */\n\n";

  // Include guard, NEON availability check and <stdint.h>.
  OS << "#ifndef __ARM_NEON_H\n"
        "#define __ARM_NEON_H\n\n";
  OS << "#if !defined(__ARM_NEON)\n"
        "#error \"NEON support not enabled\"\n"
        "#endif\n\n";
  OS << "#include <stdint.h>\n\n";

  // NEON-specific scalar typedefs; float64_t exists only for AArch64.
  OS << "typedef float float32_t;\n"
        "typedef __fp16 float16_t;\n";
  OS << "#ifdef __aarch64__\n"
        "typedef double float64_t;\n"
        "#endif\n\n";

  // For now, signedness of polynomial types depends on target.
  OS << "#ifdef __aarch64__\n"
        "typedef uint8_t poly8_t;\n"
        "typedef uint16_t poly16_t;\n"
        "typedef uint64_t poly64_t;\n"
        "#else\n"
        "typedef int8_t poly8_t;\n"
        "typedef int16_t poly16_t;\n"
        "#endif\n";

  // The list of types for which vector typedefs are produced.
  std::string TypedefTypes(
      "cQcsQsiQilQlUcQUcUsQUsUiQUiUlQUlhQhfQfdQdPcQPcPsQPsPlQPl");
  SmallVector<StringRef, 24> TDTypeVec;
  ParseTypes(0, TypedefTypes, TDTypeVec);
  const unsigned NumTDTypes = TDTypeVec.size();

  // Emit vector typedefs.  Runs of 'd' and polynomial 'l' types are wrapped
  // in "#ifdef __aarch64__" ... "#endif"; the guard is opened or closed
  // whenever the requirement changes from one type to the next.
  bool insideA64Guard = false;
  for (unsigned idx = 0; idx != NumTDTypes; ++idx) {
    bool usgn = false, quad = false, poly = false;
    const char type = ClassifyType(TDTypeVec[idx], quad, poly, usgn);

    const bool wantsA64Guard = type == 'd' || (type == 'l' && poly);
    if (wantsA64Guard != insideA64Guard) {
      OS << (wantsA64Guard ? "#ifdef __aarch64__\n" : "#endif\n");
      insideA64Guard = wantsA64Guard;
    }

    OS << (poly ? "typedef __attribute__((neon_polyvector_type("
                : "typedef __attribute__((neon_vector_type(");

    const unsigned nElts = GetNumElements(TDTypeVec[idx], quad);
    OS << utostr(nElts) << "))) ";
    // Pad single-digit element counts so the type names line up.
    if (nElts < 10)
      OS << " ";

    const std::string eltTy = TypeString('s', TDTypeVec[idx]);
    const std::string vecTy = TypeString('d', TDTypeVec[idx]);
    OS << eltTy << " " << vecTy << ";\n";
  }
  if (insideA64Guard)
    OS << "#endif\n";
  OS << "\n";

  // Emit struct typedefs holding 2, 3 and 4 vectors, using the same guard
  // handling.  The guard state deliberately carries over from one vector
  // count to the next.
  insideA64Guard = false;
  for (unsigned vi = 2; vi != 5; ++vi) {
    for (unsigned idx = 0; idx != NumTDTypes; ++idx) {
      bool usgn = false, quad = false, poly = false;
      const char type = ClassifyType(TDTypeVec[idx], quad, poly, usgn);

      const bool wantsA64Guard = type == 'd' || (type == 'l' && poly);
      if (wantsA64Guard != insideA64Guard) {
        OS << (wantsA64Guard ? "#ifdef __aarch64__\n" : "#endif\n");
        insideA64Guard = wantsA64Guard;
      }

      const std::string vecTy = TypeString('d', TDTypeVec[idx]);
      const std::string structTy = TypeString('0' + vi, TDTypeVec[idx]);
      OS << "typedef struct " << structTy << " {\n"
         << "  " << vecTy << " val[" << utostr(vi) << "];\n"
         << "} " << structTy << ";\n\n";
    }
  }
  if (insideA64Guard)
    OS << "#endif\n";
  OS << "\n";

  OS<<"#define __ai static inline __attribute__((__always_inline__, __nodebug__))\n\n";

  std::vector<Record*> RV = Records.getAllDerivedDefinitions("Inst");
  const unsigned NumRecords = RV.size();

  StringMap<ClassKind> EmittedMap;

  // Emit vmovl, vmull and vabd intrinsics first so they can be used by other
  // intrinsics.  (Some of the saturating multiply instructions are also
  // used to implement the corresponding "_lane" variants, but tablegen
  // sorts the records into alphabetical order so that the "_lane" variants
  // come after the intrinsics they use.)
  emitIntrinsic(OS, Records.getDef("VMOVL"), EmittedMap);
  emitIntrinsic(OS, Records.getDef("VMULL"), EmittedMap);
  emitIntrinsic(OS, Records.getDef("VABD"), EmittedMap);
  emitIntrinsic(OS, Records.getDef("VABDL"), EmittedMap);

  // ARM intrinsics must be emitted before AArch64 intrinsics to ensure
  // common intrinsics appear only once in the output stream.
  // The check for uniqueness is done in emitIntrinsic.
  // Emit ARM intrinsics.
  for (unsigned idx = 0; idx != NumRecords; ++idx) {
    Record *R = RV[idx];

    // AArch64 intrinsics are emitted further down.
    if (R->getValueAsBit("isA64"))
      continue;

    StringRef RecName = R->getName();
    const bool emittedUpFront =
        RecName == "VMOVL" || RecName == "VMULL" || RecName == "VABD";
    if (!emittedUpFront)
      emitIntrinsic(OS, R, EmittedMap);
  }

  // Emit AArch64-specific intrinsics.
  OS << "#ifdef __aarch64__\n";

  emitIntrinsic(OS, Records.getDef("VMOVL_HIGH"), EmittedMap);
  emitIntrinsic(OS, Records.getDef("VMULL_HIGH"), EmittedMap);
  emitIntrinsic(OS, Records.getDef("VABDL_HIGH"), EmittedMap);

  for (unsigned idx = 0; idx != NumRecords; ++idx) {
    Record *R = RV[idx];

    // ARM intrinsics were handled above; crypto intrinsics are deferred so
    // that they can be emitted together at the end.
    if (!R->getValueAsBit("isA64") || R->getValueAsBit("isCrypto"))
      continue;

    emitIntrinsic(OS, R, EmittedMap);
  }

  OS << "#ifdef __ARM_FEATURE_CRYPTO\n";

  // Only crypto records are emitted inside the crypto guard.
  for (unsigned idx = 0; idx != NumRecords; ++idx) {
    Record *R = RV[idx];
    if (R->getValueAsBit("isCrypto"))
      emitIntrinsic(OS, R, EmittedMap);
  }

  // Close __ARM_FEATURE_CRYPTO, then __aarch64__.
  OS << "#endif\n\n";
  OS << "#endif\n\n";

  OS << "#undef __ai\n\n";
  OS << "#endif /* __ARM_NEON_H */\n";
}

/// emitIntrinsic - Write out the arm_neon.h header file definitions for the
/// intrinsics specified by record R checking for intrinsic uniqueness.
///
/// Each generated definition is looked up in \p EmittedMap; text that was
/// already emitted is skipped, new text is recorded and written to \p OS.
void NeonEmitter::emitIntrinsic(raw_ostream &OS, Record *R,
                                StringMap<ClassKind> &EmittedMap) {
  std::string IntrName = R->getValueAsString("Name");
  std::string Prototype = R->getValueAsString("Prototype");
  std::string TypeList = R->getValueAsString("Types");

  SmallVector<StringRef, 16> TypeVec;
  ParseTypes(R, TypeList, TypeVec);
  const unsigned NumTypes = TypeVec.size();

  OpKind Kind = OpMap[R->getValueAsDef("Operand")->getName()];

  // The class kind comes from the record's second superclass, if present.
  ClassKind CK = ClassNone;
  if (R->getSuperClasses().size() >= 2)
    CK = ClassMap[R->getSuperClasses()[1]];
  if (CK == ClassNone && Kind == OpNone)
    PrintFatalError(R->getLoc(), "Builtin has no class kind");

  for (unsigned OutIdx = 0; OutIdx != NumTypes; ++OutIdx) {
    if (Kind != OpReinterpret) {
      // Ordinary intrinsic: the input and output types are the same.
      std::string Text = GenIntrinsic(IntrName, Prototype, TypeVec[OutIdx],
                                      TypeVec[OutIdx], Kind, CK);
      if (!EmittedMap.count(Text)) {
        EmittedMap[Text] = CK;
        OS << Text;
      }
      continue;
    }

    // Reinterpret: emit a cast from every other listed type whose quad-ness
    // matches that of the output type.
    bool OutQuad = false;
    bool Ignored = false;
    (void)ClassifyType(TypeVec[OutIdx], OutQuad, Ignored, Ignored);
    for (unsigned InIdx = 0; InIdx != NumTypes; ++InIdx) {
      bool InQuad = false;
      (void)ClassifyType(TypeVec[InIdx], InQuad, Ignored, Ignored);
      if (InIdx == OutIdx || InQuad != OutQuad)
        continue;

      std::string Text = GenIntrinsic(IntrName, Prototype, TypeVec[OutIdx],
                                      TypeVec[InIdx], OpCast, ClassS);
      if (EmittedMap.count(Text))
        continue;
      EmittedMap[Text] = ClassS;
      OS << Text;
    }
  }
  OS << "\n";
}

/// Return the upper bound used for a lane immediate of the type obtained by
/// applying modifier \p mod to \p typestr.
///
/// The result is (base << quad) - 1, where base is 8 for 'c', 4 for 'h'/'s',
/// 2 for 'f'/'i' and 1 for 'd'/'l'.
static unsigned RangeFromType(const char mod, StringRef typestr) {
  // Base type to get the type string for.
  bool quad = false, unused = false;
  char type = ClassifyType(typestr, quad, unused, unused);
  type = ModType(mod, type, quad, unused, unused, unused, unused, unused);

  unsigned base;
  switch (type) {
    case 'c':
      base = 8;
      break;
    case 'h':
    case 's':
      base = 4;
      break;
    case 'f':
    case 'i':
      base = 2;
      break;
    case 'd':
    case 'l':
      base = 1;
      break;
    default:
      PrintFatalError("unhandled type!");
  }
  return (base << (quad ? 1 : 0)) - 1;
}

/// Return the upper bound used for a scalar shift immediate of the type
/// obtained by applying modifier \p mod to \p typestr: 7 for 'c', 15 for
/// 'h'/'s', 31 for 'f'/'i' and 63 for 'd'/'l'.
static unsigned RangeScalarShiftImm(const char mod, StringRef typestr) {
  // Base type to get the type string for.
  bool unused = false;
  char type = ClassifyType(typestr, unused, unused, unused);
  type = ModType(mod, type, unused, unused, unused, unused, unused, unused);

  if (type == 'c')
    return 7;
  if (type == 'h' || type == 's')
    return 15;
  if (type == 'f' || type == 'i')
    return 31;
  if (type == 'd' || type == 'l')
    return 63;

  PrintFatalError("unhandled type!");
}

/// Generate the ARM and AArch64 intrinsic range checking code for
/// shift/lane immediates, checking for unique declarations.
///
/// For every builtin-backed record (Operand == OpNone) whose prototype has an
/// immediate ('i') and no splat ('a'), one "case ...: i = N; <range>; break;"
/// line is written per unique mangled builtin name.
///
/// \param OS               stream receiving the generated code.
/// \param A64IntrinsicMap  map keyed by "name@prototype" giving the class kind
///                         of AArch64 records; used to drop ARM records that
///                         AArch64 redefines with the same class kind.
/// \param isA64RangeCheck  true to emit the AArch64 section, false for ARM.
void
NeonEmitter::genIntrinsicRangeCheckCode(raw_ostream &OS,
                                        StringMap<ClassKind> &A64IntrinsicMap,
                                        bool isA64RangeCheck) {
  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
  // Mangled builtin names already written; only the keys are consulted.
  StringMap<OpKind> EmittedMap;

  // Generate the intrinsic range checking code for shift/lane immediates.
  if (isA64RangeCheck)
    OS << "#ifdef GET_NEON_AARCH64_IMMEDIATE_CHECK\n";
  else
    OS << "#ifdef GET_NEON_IMMEDIATE_CHECK\n";

  for (unsigned i = 0, e = RV.size(); i != e; ++i) {
    Record *R = RV[i];

    // Only records that map directly onto a builtin are range checked.
    OpKind k = OpMap[R->getValueAsDef("Operand")->getName()];
    if (k != OpNone)
      continue;

    std::string name = R->getValueAsString("Name");
    std::string Proto = R->getValueAsString("Prototype");
    std::string Types = R->getValueAsString("Types");
    // Key format used by A64IntrinsicMap.
    std::string Rename = name + "@" + Proto;

    // Functions with 'a' (the splat code) in the type prototype should not get
    // their own builtin as they use the non-splat variant.
    if (Proto.find('a') != std::string::npos)
      continue;

    // Functions which do not have an immediate do not need to have range
    // checking code emitted.
    size_t immPos = Proto.find('i');
    if (immPos == std::string::npos)
      continue;

    SmallVector<StringRef, 16> TypeVec;
    ParseTypes(R, Types, TypeVec);

    if (R->getSuperClasses().size() < 2)
      PrintFatalError(R->getLoc(), "Builtin has no class kind");

    // Prototypes without a scalar operand are always treated as ClassB.
    ClassKind ck = ClassMap[R->getSuperClasses()[1]];
    if (!ProtoHasScalar(Proto))
      ck = ClassB;

    // Do not include AArch64 range checks if not generating code for AArch64.
    bool isA64 = R->getValueAsBit("isA64");
    if (!isA64RangeCheck && isA64)
      continue;

    // Include ARM range checks in AArch64 but only if ARM intrinsics are not
    // redefined by AArch64 to handle new types.
    if (isA64RangeCheck && !isA64 && A64IntrinsicMap.count(Rename)) {
      ClassKind &A64CK = A64IntrinsicMap[Rename];
      if (A64CK == ck && ck != ClassNone)
        continue;
    }

    for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
      std::string namestr, shiftstr, rangestr;

      if (R->getValueAsBit("isVCVT_N")) {
        // VCVT between floating- and fixed-point values takes an immediate
        // in the range [1, 32] for f32, or [1, 64] for f64.
        // Note: this assignment persists for the remaining types of this
        // record, not just the current iteration.
        ck = ClassB;
        if (name.find("32") != std::string::npos)
          rangestr = "l = 1; u = 31"; // upper bound = l + u
        else if (name.find("64") != std::string::npos)
          rangestr = "l = 1; u = 63";
        else
          PrintFatalError(R->getLoc(),
              "Fixed point convert name should contains \"32\" or \"64\"");

      } else if (R->getValueAsBit("isScalarShift")) {
        // Right shifts have an 'r' in the name, left shifts do not.  Convert
        // instructions have the same bounds as right shifts.
        if (name.find('r') != std::string::npos ||
            name.find("cvt") != std::string::npos)
          rangestr = "l = 1; ";

        // The bound is derived from the modifier of the argument that
        // immediately precedes the immediate.
        unsigned upBound = RangeScalarShiftImm(Proto[immPos - 1], TypeVec[ti]);
        // Narrow shift has half the upper bound
        if (R->getValueAsBit("isScalarNarrowShift"))
          upBound /= 2;

        rangestr += "u = " + utostr(upBound);
      } else if (R->getValueAsBit("isShift")) {
        // Builtins which are overloaded by type will need to have their upper
        // bound computed at Sema time based on the type constant.
        shiftstr = ", true";

        // Right shifts have an 'r' in the name, left shifts do not.
        if (name.find('r') != std::string::npos)
          rangestr = "l = 1; ";

        rangestr += "u = RFT(TV" + shiftstr + ")";
      } else {
        // The immediate generally refers to a lane in the preceding argument.
        assert(immPos > 0 && "unexpected immediate operand");
        rangestr =
            "u = " + utostr(RangeFromType(Proto[immPos - 1], TypeVec[ti]));
      }
      // Make sure cases appear only once by uniquing them in a string map.
      namestr = MangleName(name, TypeVec[ti], ck);
      if (EmittedMap.count(namestr))
        continue;
      EmittedMap[namestr] = OpNone;

      // Calculate the index of the immediate that should be range checked.
      unsigned immidx = 0;

      // Builtins that return a struct of multiple vectors have an extra
      // leading arg for the struct return.
      if (IsMultiVecProto(Proto[0]))
        ++immidx;

      // Add one to the index for each argument until we reach the immediate
      // to be checked.  Structs of vectors are passed as multiple arguments.
      for (unsigned ii = 1, ie = Proto.size(); ii != ie; ++ii) {
        switch (Proto[ii]) {
        default:
          immidx += 1;
          break;
        case '2':
        case 'B':
          immidx += 2;
          break;
        case '3':
        case 'C':
          immidx += 3;
          break;
        case '4':
        case 'D':
          immidx += 4;
          break;
        case 'i':
          // Found the immediate: shrink the loop bound so that the loop
          // terminates after this iteration without counting the immediate.
          ie = ii + 1;
          break;
        }
      }
      if (isA64RangeCheck)
        OS << "case AArch64::BI__builtin_neon_";
      else
        OS << "case ARM::BI__builtin_neon_";
      OS << MangleName(name, TypeVec[ti], ck) << ": i = " << immidx << "; "
         << rangestr << "; break;\n";
    }
  }
  OS << "#endif\n\n";
}

/// Generate the ARM and AArch64 overloaded type checking code for
/// SemaChecking.cpp, checking for unique builtin declarations.
///
/// For each overloadable builtin, up to two "case" lines are written: one
/// carrying the mask of permitted non-quad type enums and one carrying the
/// mask of permitted quad type enums.
void
NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS,
                                      StringMap<ClassKind> &A64IntrinsicMap,
                                      bool isA64TypeCheck) {
  std::vector<Record *> Insts = Records.getAllDerivedDefinitions("Inst");

  // Prefix shared by every case label of this section.
  const char *const CasePrefix = isA64TypeCheck
                                     ? "case AArch64::BI__builtin_neon_"
                                     : "case ARM::BI__builtin_neon_";

  // Generate the overloaded type checking code for SemaChecking.cpp
  OS << (isA64TypeCheck ? "#ifdef GET_NEON_AARCH64_OVERLOAD_CHECK\n"
                        : "#ifdef GET_NEON_OVERLOAD_CHECK\n");

  for (unsigned idx = 0, end = Insts.size(); idx != end; ++idx) {
    Record *R = Insts[idx];

    // Only records that map directly onto a builtin are of interest.
    if (OpMap[R->getValueAsDef("Operand")->getName()] != OpNone)
      continue;

    std::string Proto = R->getValueAsString("Prototype");
    std::string Types = R->getValueAsString("Types");
    std::string name = R->getValueAsString("Name");

    // Functions with 'a' (the splat code) in the type prototype should not get
    // their own builtin as they use the non-splat variant.
    if (Proto.find('a') != std::string::npos)
      continue;

    // Functions which have a scalar argument cannot be overloaded, no need to
    // check them if we are emitting the type checking code.
    if (ProtoHasScalar(Proto))
      continue;

    SmallVector<StringRef, 16> TypeVec;
    ParseTypes(R, Types, TypeVec);

    if (R->getSuperClasses().size() < 2)
      PrintFatalError(R->getLoc(), "Builtin has no class kind");

    // Do not include AArch64 type checks if not generating code for AArch64.
    const bool recIsA64 = R->getValueAsBit("isA64");
    if (recIsA64 && !isA64TypeCheck)
      continue;

    // Include ARM  type check in AArch64 but only if ARM intrinsics
    // are not redefined in AArch64 to handle new types, e.g. "vabd" is a SIntr
    // redefined in AArch64 to handle an additional 2 x f64 type.
    ClassKind ck = ClassMap[R->getSuperClasses()[1]];
    if (isA64TypeCheck && !recIsA64) {
      const std::string Rename = name + "@" + Proto;
      if (A64IntrinsicMap.count(Rename)) {
        const ClassKind A64CK = A64IntrinsicMap[Rename];
        if (A64CK == ck && ck != ClassNone)
          continue;
      }
    }

    // Accumulate the permitted type enums, separately for quad and non-quad
    // types, remembering the last type index seen in each group.
    int lastNonQuad = -1, lastQuad = -1;
    uint64_t mask = 0, qmask = 0;
    for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
      bool quad = false, poly = false, usgn = false;
      (void) ClassifyType(TypeVec[ti], quad, poly, usgn);

      const uint64_t bit = 1ULL << GetNeonEnum(Proto, TypeVec[ti]);
      if (quad) {
        lastQuad = ti;
        qmask |= bit;
      } else {
        lastNonQuad = ti;
        mask |= bit;
      }
    }

    // Locate the first pointer ('p') or const pointer ('c') argument, if any.
    int PtrArgNum = -1;
    bool HasConstPtr = false;
    for (unsigned arg = 1, arge = Proto.size(); arg != arge; ++arg) {
      const char ArgType = Proto[arg];
      if (ArgType != 'c' && ArgType != 'p')
        continue;
      HasConstPtr = ArgType == 'c';
      PtrArgNum = arg - 1;
      break;
    }
    // For sret builtins, adjust the pointer argument index.
    if (PtrArgNum >= 0 && IsMultiVecProto(Proto[0]))
      PtrArgNum += 1;

    // Omit type checking for the pointer arguments of vld1_lane, vld1_dup,
    // and vst1_lane intrinsics.  Using a pointer to the vector element
    // type with one of those operations causes codegen to select an aligned
    // load/store instruction.  If you want an unaligned operation,
    // the pointer argument needs to have less alignment than element type,
    // so just accept any pointer type.
    if (name == "vld1_lane" || name == "vld1_dup" || name == "vst1_lane") {
      PtrArgNum = -1;
      HasConstPtr = false;
    }

    // Pass 0 writes the non-quad case, pass 1 the quad case; a group with an
    // empty mask is skipped.
    for (unsigned pass = 0; pass != 2; ++pass) {
      const uint64_t bits = pass == 0 ? mask : qmask;
      if (!bits)
        continue;
      const int typeIdx = pass == 0 ? lastNonQuad : lastQuad;

      OS << CasePrefix << MangleName(name, TypeVec[typeIdx], ClassB)
         << ": mask = " << "0x" << utohexstr(bits) << "ULL";
      if (PtrArgNum >= 0)
        OS << "; PtrArgNum = " << PtrArgNum;
      if (HasConstPtr)
        OS << "; HasConstPtr = true";
      OS << "; break;\n";
    }
  }
  OS << "#endif\n\n";
}

/// genBuiltinsDef: Generate the BuiltinsARM.def and  BuiltinsAArch64.def
/// declaration of builtins, checking for unique builtin declarations.
///
/// \p isA64GenBuiltinDef selects the AArch64 section; \p A64IntrinsicMap is
/// consulted (keyed by "name@prototype") to drop ARM records that AArch64
/// redefines with the same class kind.
void NeonEmitter::genBuiltinsDef(raw_ostream &OS,
                                 StringMap<ClassKind> &A64IntrinsicMap,
                                 bool isA64GenBuiltinDef) {
  std::vector<Record *> Insts = Records.getAllDerivedDefinitions("Inst");
  // BUILTIN() lines already written; only the keys are consulted.
  StringMap<OpKind> EmittedMap;

  // Generate BuiltinsARM.def and BuiltinsAArch64.def
  OS << (isA64GenBuiltinDef ? "#ifdef GET_NEON_AARCH64_BUILTINS\n"
                            : "#ifdef GET_NEON_BUILTINS\n");

  for (unsigned idx = 0, end = Insts.size(); idx != end; ++idx) {
    Record *R = Insts[idx];

    // Only records that map directly onto a builtin get a declaration.
    if (OpMap[R->getValueAsDef("Operand")->getName()] != OpNone)
      continue;

    std::string Proto = R->getValueAsString("Prototype");
    std::string name = R->getValueAsString("Name");

    // Functions with 'a' (the splat code) in the type prototype should not get
    // their own builtin as they use the non-splat variant.
    if (Proto.find('a') != std::string::npos)
      continue;

    std::string Types = R->getValueAsString("Types");
    SmallVector<StringRef, 16> TypeVec;
    ParseTypes(R, Types, TypeVec);

    if (R->getSuperClasses().size() < 2)
      PrintFatalError(R->getLoc(), "Builtin has no class kind");

    ClassKind ck = ClassMap[R->getSuperClasses()[1]];

    // Do not include AArch64 BUILTIN() macros if not generating
    // code for AArch64
    const bool recIsA64 = R->getValueAsBit("isA64");
    if (recIsA64 && !isA64GenBuiltinDef)
      continue;

    // Include ARM  BUILTIN() macros  in AArch64 but only if ARM intrinsics
    // are not redefined in AArch64 to handle new types, e.g. "vabd" is a SIntr
    // redefined in AArch64 to handle an additional 2 x f64 type.
    if (isA64GenBuiltinDef && !recIsA64) {
      const std::string Rename = name + "@" + Proto;
      if (A64IntrinsicMap.count(Rename)) {
        const ClassKind A64CK = A64IntrinsicMap[Rename];
        if (A64CK == ck && ck != ClassNone)
          continue;
      }
    }

    // Generate the declaration for each type, ensuring that each unique
    // BUILTIN() macro appears only once in the output stream.
    for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
      std::string decl = GenBuiltinDef(name, Proto, TypeVec[ti], ck);
      if (!EmittedMap.count(decl)) {
        EmittedMap[decl] = OpNone;
        OS << decl << "\n";
      }
    }
  }
  OS << "#endif\n\n";
}

/// runHeader - Emit a file with sections defining:
/// 1. the NEON section of BuiltinsARM.def and BuiltinsAArch64.def.
/// 2. the SemaChecking code for the type overload checking.
/// 3. the SemaChecking code for validation of intrinsic immediate arguments.
///
/// Within each section the ARM variant is emitted before the AArch64 one.
void NeonEmitter::runHeader(raw_ostream &OS) {
  std::vector<Record *> Insts = Records.getAllDerivedDefinitions("Inst");

  // Build a map of AArch64 intrinsics to be used in uniqueness checks.  Keys
  // have the form "name@prototype"; the first record seen for a key wins.
  StringMap<ClassKind> A64IntrinsicMap;
  for (unsigned idx = 0, end = Insts.size(); idx != end; ++idx) {
    Record *R = Insts[idx];
    if (!R->getValueAsBit("isA64"))
      continue;

    // The class kind comes from the record's second superclass, if present.
    ClassKind CK = ClassNone;
    if (R->getSuperClasses().size() >= 2)
      CK = ClassMap[R->getSuperClasses()[1]];

    std::string Name = R->getValueAsString("Name");
    std::string Proto = R->getValueAsString("Prototype");
    const std::string Key = Name + "@" + Proto;
    if (!A64IntrinsicMap.count(Key))
      A64IntrinsicMap[Key] = CK;
  }

  // BuiltinsARM.def, then BuiltinsAArch64.def.
  for (unsigned pass = 0; pass != 2; ++pass)
    genBuiltinsDef(OS, A64IntrinsicMap, pass != 0);

  // Overloaded type checking code for SemaChecking.cpp: ARM, then AArch64.
  for (unsigned pass = 0; pass != 2; ++pass)
    genOverloadTypeCheckCode(OS, A64IntrinsicMap, pass != 0);

  // Range checking code for shift/lane immediates: ARM, then AArch64.
  for (unsigned pass = 0; pass != 2; ++pass)
    genIntrinsicRangeCheckCode(OS, A64IntrinsicMap, pass != 0);
}

/// GenTest - Write out a test for the intrinsic specified by the name and
/// type strings, including the embedded patterns for FileCheck to match.
static std::string GenTest(const std::string &name,
                           const std::string &proto,
                           StringRef outTypeStr, StringRef inTypeStr,
                           bool isShift, bool isHiddenLOp,
                           ClassKind ck, const std::string &InstName,
                           bool isA64,
                           std::string & testFuncProto) {
  assert(!proto.empty() && "");
  std::string s;

  // Function name with type suffix
  std::string mangledName = MangleName(name, outTypeStr, ClassS);
  if (outTypeStr != inTypeStr) {
    // If the input type is different (e.g., for vreinterpret), append a suffix
    // for the input type.  String off a "Q" (quad) prefix so that MangleName
    // does not insert another "q" in the name.
    unsigned typeStrOff = (inTypeStr[0] == 'Q' ? 1 : 0);
    StringRef inTypeNoQuad = inTypeStr.substr(typeStrOff);
    mangledName = MangleName(mangledName, inTypeNoQuad, ClassS);
  }

  // todo: GenerateChecksForIntrinsic does not generate CHECK
  // for aarch64 instructions yet
  std::vector<std::string> FileCheckPatterns;
  if (!isA64) {
	GenerateChecksForIntrinsic(name, proto, outTypeStr, inTypeStr, ck, InstName,
							   isHiddenLOp, FileCheckPatterns);
	s+= "// CHECK_ARM: test_" + mangledName + "\n";
  }
  s += "// CHECK_AARCH64: test_" + mangledName + "\n";

  // Emit the FileCheck patterns.
  // If for any reason we do not want to emit a check, mangledInst
  // will be the empty string.
  if (FileCheckPatterns.size()) {
    for (std::vector<std::string>::const_iterator i = FileCheckPatterns.begin(),
                                                  e = FileCheckPatterns.end();
         i != e;
         ++i) {
      s += "// CHECK_ARM: " + *i + "\n";
    }
  }

  // Emit the start of the test function.

  testFuncProto = TypeString(proto[0], outTypeStr) + " test_" + mangledName + "(";
  char arg = 'a';
  std::string comma;
  for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
    // Do not create arguments for values that must be immediate constants.
    if (proto[i] == 'i')
      continue;
    testFuncProto += comma + TypeString(proto[i], inTypeStr) + " ";
    testFuncProto.push_back(arg);
    comma = ", ";
  }
  testFuncProto += ")";

  s+= testFuncProto;
  s+= " {\n  ";

  if (proto[0] != 'v')
    s += "return ";
  s += mangledName + "(";
  arg = 'a';
  for (unsigned i = 1, e = proto.size(); i != e; ++i, ++arg) {
    if (proto[i] == 'i') {
      // For immediate operands, test the maximum value.
      if (isShift)
        s += "1"; // FIXME
      else
        // The immediate generally refers to a lane in the preceding argument.
        s += utostr(RangeFromType(proto[i-1], inTypeStr));
    } else {
      s.push_back(arg);
    }
    if ((i + 1) < e)
      s += ", ";
  }
  s += ");\n}\n\n";
  return s;
}

/// Write out all intrinsic tests for the specified target, checking
/// for intrinsic test uniqueness.
///
/// \param OS           Stream the tests are written to.
/// \param EmittedMap   Tests already written, keyed by test function
///                     prototype; a test whose prototype is already present
///                     is skipped, and every emitted test is added.
/// \param isA64GenTest If true, the output is wrapped in
///                     "#ifdef __aarch64__" / "#endif" and records flagged
///                     isA64 are included; otherwise those records are
///                     skipped.
void NeonEmitter::genTargetTest(raw_ostream &OS, StringMap<OpKind> &EmittedMap,
                                bool isA64GenTest) {
  if (isA64GenTest)
    OS << "#ifdef __aarch64__\n";

  std::vector<Record *> RV = Records.getAllDerivedDefinitions("Inst");
  for (unsigned i = 0, e = RV.size(); i != e; ++i) {
    Record *R = RV[i];
    std::string name = R->getValueAsString("Name");
    std::string Proto = R->getValueAsString("Prototype");
    std::string Types = R->getValueAsString("Types");
    bool isShift = R->getValueAsBit("isShift");
    std::string InstName = R->getValueAsString("InstName");
    bool isHiddenLOp = R->getValueAsBit("isHiddenLInst");
    bool isA64 = R->getValueAsBit("isA64");

    // do not include AArch64 intrinsic test if not generating
    // code for AArch64
    if (!isA64GenTest && isA64)
      continue;

    SmallVector<StringRef, 16> TypeVec;
    ParseTypes(R, Types, TypeVec);

    // Only index the second superclass when it exists, as runHeader does;
    // records with fewer superclasses fall back to ClassNone.
    ClassKind ck = ClassNone;
    if (R->getSuperClasses().size() >= 2)
      ck = ClassMap[R->getSuperClasses()[1]];

    OpKind kind = OpMap[R->getValueAsDef("Operand")->getName()];
    if (kind == OpUnavailable)
      continue;

    for (unsigned ti = 0, te = TypeVec.size(); ti != te; ++ti) {
      if (kind == OpReinterpret) {
        // Pair the output type with every other listed type that has the
        // same quad flag as reported by ClassifyType.
        bool outQuad = false;
        bool dummy = false;
        (void)ClassifyType(TypeVec[ti], outQuad, dummy, dummy);
        for (unsigned srcti = 0, srcte = TypeVec.size();
             srcti != srcte; ++srcti) {
          bool inQuad = false;
          (void)ClassifyType(TypeVec[srcti], inQuad, dummy, dummy);
          if (srcti == ti || inQuad != outQuad)
            continue;
          std::string testFuncProto;
          std::string s = GenTest(name, Proto, TypeVec[ti], TypeVec[srcti],
                                  isShift, isHiddenLOp, ck, InstName, isA64,
                                  testFuncProto);
          // Emit each test function prototype only once.
          if (EmittedMap.count(testFuncProto))
            continue;
          EmittedMap[testFuncProto] = kind;
          OS << s << "\n";
        }
      } else {
        std::string testFuncProto;
        std::string s = GenTest(name, Proto, TypeVec[ti], TypeVec[ti], isShift,
                                isHiddenLOp, ck, InstName, isA64, testFuncProto);
        // Emit each test function prototype only once.
        if (EmittedMap.count(testFuncProto))
          continue;
        EmittedMap[testFuncProto] = kind;
        OS << s << "\n";
      }
    }
  }

  if (isA64GenTest)
    OS << "#endif\n";
}
/// runTests - Write out a complete set of tests for all of the Neon
/// intrinsics.
///
/// The output starts with the lit RUN lines (one pipeline checked with the
/// CHECK_ARM prefix, one with the CHECK_AARCH64 prefix), the long_tests
/// requirement and the arm_neon.h include, followed by the ARM tests and
/// then the AArch64 tests.
void NeonEmitter::runTests(raw_ostream &OS) {
  // Every continuation line of a lit command has to carry the "RUN:" keyword
  // itself; a line without it is not treated as part of the command.
  OS << "// RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi "
        "apcs-gnu\\\n"
        "// RUN:  -target-cpu swift -ffreestanding -Os -S -o - %s\\\n"
        "// RUN:  | FileCheck %s -check-prefix=CHECK_ARM\n"
        "\n"
        "// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \\\n"
        "// RUN:  -target-feature +neon -ffreestanding -S -o - %s \\\n"
        "// RUN:  | FileCheck %s -check-prefix=CHECK_AARCH64\n"
        "\n"
        "// REQUIRES: long_tests\n"
        "\n"
        "#include <arm_neon.h>\n"
        "\n";

  // ARM tests must be emitted before AArch64 tests to ensure
  // tests for intrinsics that are common to ARM and AArch64
  // appear only once in the output stream.
  // The check for uniqueness is done in genTargetTest.
  StringMap<OpKind> EmittedMap;

  genTargetTest(OS, EmittedMap, false);

  genTargetTest(OS, EmittedMap, true);
}

namespace clang {
/// EmitNeon - Run the NEON backend through its normal run() entry point,
/// writing the result to \p OS.
void EmitNeon(RecordKeeper &Records, raw_ostream &OS) {
  NeonEmitter Emitter(Records);
  Emitter.run(OS);
}

/// EmitNeonSema - Run the NEON backend through runHeader(), writing the
/// result to \p OS.
void EmitNeonSema(RecordKeeper &Records, raw_ostream &OS) {
  NeonEmitter Emitter(Records);
  Emitter.runHeader(OS);
}

/// EmitNeonTest - Run the NEON backend through runTests(), writing the
/// result to \p OS.
void EmitNeonTest(RecordKeeper &Records, raw_ostream &OS) {
  NeonEmitter Emitter(Records);
  Emitter.runTests(OS);
}
} // End namespace clang