[ARM] Enabling range checks on Neon intrinsics' lane arguments

Summary:
Range checks were not properly performed in the lane arguments of Neon
intrinsics implemented based on splat operations. Calls to those
intrinsics were translated to `__builtin_shufflevector` calls directly
by the preprocessor through the arm_neon.h macros, missing the chance
to perform the proper range checks.

This patch enables the range checks by introducing an auxiliary splat
instruction in arm_neon.td, delaying the translation into shufflevector
calls until CGBuiltin.cpp in clang, after the checks have been performed.

Reviewers: jmolloy, t.p.northover, rsmith, olista01, ostannard

Reviewed By: ostannard

Subscribers: ostannard, dnsampaio, danielkiss, kristof.beyls, cfe-commits

Tags: #clang

Differential Revision: https://reviews.llvm.org/D74619
This commit is contained in:
Lucas Prates 2020-03-05 16:45:03 +00:00
parent d42711625a
commit f56550cf7f
13 changed files with 2441 additions and 1244 deletions

View File

@ -51,39 +51,39 @@ def OP_FMLA_N : Op<(call "vfma", $p0, $p1, (dup $p2))>;
def OP_FMLS_N : Op<(call "vfma", $p0, (op "-", $p1), (dup $p2))>;
def OP_MLAL_N : Op<(op "+", $p0, (call "vmull", $p1, (dup $p2)))>;
def OP_MLSL_N : Op<(op "-", $p0, (call "vmull", $p1, (dup $p2)))>;
def OP_MUL_LN : Op<(op "*", $p0, (splat $p1, $p2))>;
def OP_MULX_LN : Op<(call "vmulx", $p0, (splat $p1, $p2))>;
def OP_MUL_LN : Op<(op "*", $p0, (call_mangled "splat_lane", $p1, $p2))>;
def OP_MULX_LN : Op<(call "vmulx", $p0, (call_mangled "splat_lane", $p1, $p2))>;
def OP_MULL_N : Op<(call "vmull", $p0, (dup $p1))>;
def OP_MULL_LN : Op<(call "vmull", $p0, (splat $p1, $p2))>;
def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (splat $p1, $p2))>;
def OP_MLA_LN : Op<(op "+", $p0, (op "*", $p1, (splat $p2, $p3)))>;
def OP_MLS_LN : Op<(op "-", $p0, (op "*", $p1, (splat $p2, $p3)))>;
def OP_MLAL_LN : Op<(op "+", $p0, (call "vmull", $p1, (splat $p2, $p3)))>;
def OP_MULL_LN : Op<(call "vmull", $p0, (call_mangled "splat_lane", $p1, $p2))>;
def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (call_mangled "splat_lane", $p1, $p2))>;
def OP_MLA_LN : Op<(op "+", $p0, (op "*", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
def OP_MLS_LN : Op<(op "-", $p0, (op "*", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
def OP_MLAL_LN : Op<(op "+", $p0, (call "vmull", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
def OP_MLALHi_LN: Op<(op "+", $p0, (call "vmull", (call "vget_high", $p1),
(splat $p2, $p3)))>;
def OP_MLSL_LN : Op<(op "-", $p0, (call "vmull", $p1, (splat $p2, $p3)))>;
(call_mangled "splat_lane", $p2, $p3)))>;
def OP_MLSL_LN : Op<(op "-", $p0, (call "vmull", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
def OP_MLSLHi_LN : Op<(op "-", $p0, (call "vmull", (call "vget_high", $p1),
(splat $p2, $p3)))>;
(call_mangled "splat_lane", $p2, $p3)))>;
def OP_QDMULL_N : Op<(call "vqdmull", $p0, (dup $p1))>;
def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (splat $p1, $p2))>;
def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (call_mangled "splat_lane", $p1, $p2))>;
def OP_QDMULLHi_LN : Op<(call "vqdmull", (call "vget_high", $p0),
(splat $p1, $p2))>;
(call_mangled "splat_lane", $p1, $p2))>;
def OP_QDMLAL_N : Op<(call "vqdmlal", $p0, $p1, (dup $p2))>;
def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (splat $p2, $p3))>;
def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>;
def OP_QDMLALHi_LN : Op<(call "vqdmlal", $p0, (call "vget_high", $p1),
(splat $p2, $p3))>;
(call_mangled "splat_lane", $p2, $p3))>;
def OP_QDMLSL_N : Op<(call "vqdmlsl", $p0, $p1, (dup $p2))>;
def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (splat $p2, $p3))>;
def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>;
def OP_QDMLSLHi_LN : Op<(call "vqdmlsl", $p0, (call "vget_high", $p1),
(splat $p2, $p3))>;
(call_mangled "splat_lane", $p2, $p3))>;
def OP_QDMULH_N : Op<(call "vqdmulh", $p0, (dup $p1))>;
def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (splat $p1, $p2))>;
def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (splat $p1, $p2))>;
def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>;
def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>;
def OP_QRDMULH_N : Op<(call "vqrdmulh", $p0, (dup $p1))>;
def OP_QRDMLAH : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, $p2))>;
def OP_QRDMLSH : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, $p2))>;
def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>;
def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>;
def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>;
def OP_FMS_LN : Op<(call "vfma_lane", $p0, (op "-", $p1), $p2, $p3)>;
def OP_FMS_LNQ : Op<(call "vfma_laneq", $p0, (op "-", $p1), $p2, $p3)>;
def OP_TRN1 : Op<(shuffle $p0, $p1, (interleave (decimate mask0, 2),
@ -115,7 +115,7 @@ def OP_HI : Op<(shuffle $p0, $p0, (highhalf mask0))>;
def OP_LO : Op<(shuffle $p0, $p0, (lowhalf mask0))>;
def OP_CONC : Op<(shuffle $p0, $p1, (add mask0, mask1))>;
def OP_DUP : Op<(dup $p0)>;
def OP_DUP_LN : Op<(splat $p0, $p1)>;
def OP_DUP_LN : Op<(call_mangled "splat_lane", $p0, $p1)>;
def OP_SEL : Op<(cast "R", (op "|",
(op "&", $p0, (cast $p0, $p1)),
(op "&", (op "~", $p0), (cast $p0, $p2))))>;
@ -207,10 +207,10 @@ def OP_SCALAR_HALF_SET_LNQ : Op<(bitcast "float16x8_t",
def OP_DOT_LN
: Op<(call "vdot", $p0, $p1,
(bitcast $p1, (splat(bitcast "uint32x2_t", $p2), $p3)))>;
(bitcast $p1, (call_mangled "splat_lane", (bitcast "32", $p2), $p3)))>;
def OP_DOT_LNQ
: Op<(call "vdot", $p0, $p1,
(bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>;
(bitcast $p1, (call_mangled "splat_lane", (bitcast "32", $p2), $p3)))>;
def OP_FMLAL_LN : Op<(call "vfmlal_low", $p0, $p1,
(dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
@ -222,7 +222,19 @@ def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1,
(dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
//===----------------------------------------------------------------------===//
// Instructions
// Auxiliary Instructions
//===----------------------------------------------------------------------===//
// Splat operation - performs a range-checked splat over a vector
def SPLAT : WInst<"splat_lane", ".(!q)I",
"UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl">;
def SPLATQ : WInst<"splat_laneq", ".(!Q)I",
"UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl"> {
let isLaneQ = 1;
}
//===----------------------------------------------------------------------===//
// Intrinsics
//===----------------------------------------------------------------------===//
////////////////////////////////////////////////////////////////////////////////

View File

@ -88,6 +88,7 @@ def call_mangled;
// - "D" - Double the number of lanes in the type.
// - "8" - Convert type to an equivalent vector of 8-bit signed
// integers.
// - "32" - Convert type to an equivalent vector of 32-bit integers.
// example: (cast "R", "U", $p0) -> "(uint32x4_t)__p0" (assuming the return
// value is of type "int32x4_t".
// (cast $p0, "D", "8", $p1) -> "(int8x16_t)__p1" (assuming __p0
@ -109,12 +110,6 @@ def dup;
// example: (dup_typed $p1, $p2) -> "(float16x4_t) {__p2, __p2, __p2, __p2}"
// (assuming __p1 is float16x4_t, and __p2 is a compatible scalar).
def dup_typed;
// splat - Take a vector and a lane index, and return a vector of the same type
// containing repeated instances of the source vector at the lane index.
// example: (splat $p0, $p1) ->
// "__builtin_shufflevector(__p0, __p0, __p1, __p1, __p1, __p1)"
// (assuming __p0 has four elements).
def splat;
// save_temp - Create a temporary (local) variable. The variable takes a name
// based on the zero'th parameter and can be referenced using
// using that name in subsequent DAGs in the same

View File

@ -4495,10 +4495,15 @@ static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF,
}
}
Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C,
const ElementCount &Count) {
Value *SV = llvm::ConstantVector::getSplat(Count, C);
return Builder.CreateShuffleVector(V, V, SV, "lane");
}
Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) {
ElementCount EC = V->getType()->getVectorElementCount();
Value *SV = llvm::ConstantVector::getSplat(EC, C);
return Builder.CreateShuffleVector(V, V, SV, "lane");
return EmitNeonSplat(V, C, EC);
}
Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
@ -4605,6 +4610,10 @@ struct ARMVectorIntrinsicInfo {
TypeModifier }
static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
NEONMAP0(splat_lane_v),
NEONMAP0(splat_laneq_v),
NEONMAP0(splatq_lane_v),
NEONMAP0(splatq_laneq_v),
NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts),
NEONMAP1(vabs_v, arm_neon_vabs, 0),
@ -4886,6 +4895,10 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
};
static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
NEONMAP0(splat_lane_v),
NEONMAP0(splat_laneq_v),
NEONMAP0(splatq_lane_v),
NEONMAP0(splatq_laneq_v),
NEONMAP1(vabs_v, aarch64_neon_abs, 0),
NEONMAP1(vabsq_v, aarch64_neon_abs, 0),
NEONMAP0(vaddhn_v),
@ -5460,6 +5473,19 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
switch (BuiltinID) {
default: break;
case NEON::BI__builtin_neon_splat_lane_v:
case NEON::BI__builtin_neon_splat_laneq_v:
case NEON::BI__builtin_neon_splatq_lane_v:
case NEON::BI__builtin_neon_splatq_laneq_v: {
auto NumElements = VTy->getElementCount();
if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v)
NumElements = NumElements * 2;
if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v)
NumElements = NumElements / 2;
Ops[0] = Builder.CreateBitCast(Ops[0], VTy);
return EmitNeonSplat(Ops[0], cast<ConstantInt>(Ops[1]), NumElements);
}
case NEON::BI__builtin_neon_vpadd_v:
case NEON::BI__builtin_neon_vpaddq_v:
// We don't allow fp/int overloading of intrinsics.

View File

@ -3894,6 +3894,8 @@ public:
SmallVectorImpl<llvm::Value*> &O,
const char *name,
unsigned shift = 0, bool rightshift = false);
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
const llvm::ElementCount &Count);
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
llvm::Value *EmitNeonShiftVector(llvm::Value *V, llvm::Type *Ty,
bool negateForRightShift);

File diff suppressed because it is too large Load Diff

View File

@ -69,144 +69,177 @@ float64x2_t test_vmlsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
}
// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
return vmla_lane_f32(a, b, v, 0);
}
// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
return vmlaq_lane_f32(a, b, v, 0);
}
// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
return vmla_laneq_f32(a, b, v, 0);
}
// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlaq_laneq_f32(a, b, v, 0);
}
// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
return vmls_lane_f32(a, b, v, 0);
}
// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
return vmlsq_lane_f32(a, b, v, 0);
}
// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
return vmls_laneq_f32(a, b, v, 0);
}
// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlsq_laneq_f32(a, b, v, 0);
}
// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
return vmla_lane_f32(a, b, v, 1);
}
// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
return vmlaq_lane_f32(a, b, v, 1);
}
// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x float> [[ADD]]
float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
return vmla_laneq_f32(a, b, v, 3);
}
// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x float> [[ADD]]
float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlaq_laneq_f32(a, b, v, 3);
}
// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
return vmls_lane_f32(a, b, v, 1);
}
// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
//
float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
return vmlsq_lane_f32(a, b, v, 1);
}
// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> <i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <2 x float> [[SUB]]
float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
return vmls_laneq_f32(a, b, v, 3);
}
// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]]
// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]]
// CHECK: ret <4 x float> [[SUB]]
float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
return vmlsq_laneq_f32(a, b, v, 3);
}

View File

@ -150,22 +150,28 @@ poly64x2_t test_vmovq_n_p64(poly64_t a) {
}
// CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[SHUFFLE]]
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer
// CHECK: ret <1 x i64> [[LANE]]
poly64x1_t test_vdup_lane_p64(poly64x1_t vec) {
return vdup_lane_p64(vec, 0);
}
// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[SHUFFLE]]
// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer
// CHECK: ret <2 x i64> [[LANE]]
poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) {
return vdupq_lane_p64(vec, 0);
}
// CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #1 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> <i32 1, i32 1>
// CHECK: ret <2 x i64> [[SHUFFLE]]
// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> [[VEC:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> <i32 1, i32 1>
// CHECK: ret <2 x i64> [[LANE]]
poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
return vdupq_laneq_p64(vec, 1);
}

View File

@ -1086,32 +1086,40 @@ float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) {
}
// CHECK-LABEL: test_vmul_lane_f16
// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]]
// CHECK: ret <4 x half> [[MUL]]
float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) {
return vmul_lane_f16(a, b, 3);
}
// CHECK-LABEL: test_vmulq_lane_f16
// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]]
// CHECK: ret <8 x half> [[MUL]]
float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) {
return vmulq_lane_f16(a, b, 7);
return vmulq_lane_f16(a, b, 3);
}
// CHECK-LABEL: test_vmul_laneq_f16
// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]]
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]]
// CHECK: ret <4 x half> [[MUL]]
float16x4_t test_vmul_laneq_f16(float16x4_t a, float16x8_t b) {
return vmul_laneq_f16(a, b, 7);
}
// CHECK-LABEL: test_vmulq_laneq_f16
// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]]
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]]
// CHECK: ret <8 x half> [[MUL]]
float16x8_t test_vmulq_laneq_f16(float16x8_t a, float16x8_t b) {
return vmulq_laneq_f16(a, b, 7);
@ -1165,33 +1173,49 @@ float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) {
}
// CHECK-LABEL: test_vmulx_lane_f16
// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]])
// CHECK: ret <4 x half> [[MUL]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4
// CHECK: ret <4 x half> [[VMULX2_I]]
float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) {
return vmulx_lane_f16(a, b, 3);
}
// CHECK-LABEL: test_vmulxq_lane_f16
// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP0]])
// CHECK: ret <8 x half> [[MUL]]
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4
// CHECK: ret <8 x half> [[VMULX2_I]]
float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) {
return vmulxq_lane_f16(a, b, 7);
return vmulxq_lane_f16(a, b, 3);
}
// CHECK-LABEL: test_vmulx_laneq_f16
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8>
// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4
// CHECK: ret <4 x half> [[VMULX2_I]]
// Lane 7 is valid here: the laneq variant reads from an 8 x f16 vector.
float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) {
  return vmulx_laneq_f16(a, b, 7);
}
// CHECK-LABEL: test_vmulxq_laneq_f16
// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8>
// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8>
// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4
// CHECK: ret <8 x half> [[VMULX2_I]]
// Lane 7 is valid here: the laneq variant reads from an 8 x f16 vector.
float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) {
  return vmulxq_laneq_f16(a, b, 7);
}
@ -1473,17 +1497,21 @@ float16x8_t test_vdupq_n_f16(float16_t a) {
}
// CHECK-LABEL: test_vdup_lane_f16
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <4 x half> [[LANE]]
// Lane 3 is the highest valid index for a 4 x f16 (64-bit) vector.
float16x4_t test_vdup_lane_f16(float16x4_t a) {
  return vdup_lane_f16(a, 3);
}
// CHECK-LABEL: test_vdupq_lane_f16
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <8 x half> [[LANE]]
// The input is a 4 x f16 vector, so the maximum valid lane is 3; the
// pre-patch value 7 is now rejected by the lane range check.
float16x8_t test_vdupq_lane_f16(float16x4_t a) {
  return vdupq_lane_f16(a, 3);
}
// CHECK-LABEL: @test_vext_f16(

View File

@ -0,0 +1,410 @@
// RUN: %clang_cc1 -triple arm64-none-eabi -target-feature +neon -target-feature +dotprod -target-feature +v8.1a -verify %s
// RUN: %clang_cc1 -triple armv8.1a-none-eabi -target-feature +neon -target-feature +dotprod -target-feature +v8.1a -verify %s
#include <arm_neon.h>
// Range checks on constant lane arguments of splat-based Neon intrinsics:
// a lane index outside the input vector's lane count must produce a clang
// diagnostic instead of being silently lowered to a shufflevector.

// _lane variants below read from a 2-lane (64-bit i32) vector: valid [0, 1].
void test_vdot_lane(int32x2_t r, int8x8_t a, int8x8_t b) {
  vdot_lane_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vdot_lane_s32(r, a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vdot_lane_s32(r, a, b, 0);
  vdot_lane_s32(r, a, b, 1);
}
void test_vdotq_lane(int32x4_t r, int8x16_t a, int8x8_t b) {
  vdotq_lane_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vdotq_lane_s32(r, a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vdotq_lane_s32(r, a, b, 0);
  vdotq_lane_s32(r, a, b, 1);
}
// AArch64-only _laneq variants read from a 4-lane (128-bit) vector: [0, 3].
#if defined(__aarch64__)
void test_vdot_laneq(int32x2_t r, int8x8_t a, int8x16_t b) {
  vdot_laneq_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vdot_laneq_s32(r, a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vdot_laneq_s32(r, a, b, 0);
  vdot_laneq_s32(r, a, b, 3);
}
void test_vdotq_laneq(int32x4_t r, int8x16_t a, int8x16_t b) {
  vdotq_laneq_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vdotq_laneq_s32(r, a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vdotq_laneq_s32(r, a, b, 0);
  vdotq_laneq_s32(r, a, b, 3);
}
#endif
// vdup duplicates a single lane; same [0, 1] bound for the 2-lane source.
void test_vdup_lane(int32x2_t v) {
  vdup_lane_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vdup_lane_s32(v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vdup_lane_s32(v, 0);
  vdup_lane_s32(v, 1);
}
void test_vdupq_lane(int32x2_t v) {
  vdupq_lane_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vdupq_lane_s32(v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vdupq_lane_s32(v, 0);
  vdupq_lane_s32(v, 1);
}
#if defined(__aarch64__)
void test_vdup_laneq(int32x4_t v) {
  vdup_laneq_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vdup_laneq_s32(v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vdup_laneq_s32(v, 0);
  vdup_laneq_s32(v, 3);
}
void test_vdupq_laneq(int32x4_t v) {
  vdupq_laneq_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vdupq_laneq_s32(v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vdupq_laneq_s32(v, 0);
  vdupq_laneq_s32(v, 3);
}
#endif
// Multiply-accumulate by lane: the lane bound follows the v operand's width.
void test_vmla_lane(int32x2_t a, int32x2_t b, int32x2_t v) {
  vmla_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vmla_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vmla_lane_s32(a, b, v, 0);
  vmla_lane_s32(a, b, v, 1);
}
void test_vmlaq_lane(int32x4_t a, int32x4_t b, int32x2_t v) {
  vmlaq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vmlaq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vmlaq_lane_s32(a, b, v, 0);
  vmlaq_lane_s32(a, b, v, 1);
}
#if defined(__aarch64__)
void test_vmla_laneq(int32x2_t a, int32x2_t b, int32x4_t v) {
  vmla_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vmla_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vmla_laneq_s32(a, b, v, 0);
  vmla_laneq_s32(a, b, v, 3);
}
void test_vmlaq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) {
  vmlaq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vmlaq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vmlaq_laneq_s32(a, b, v, 0);
  vmlaq_laneq_s32(a, b, v, 3);
}
void test_vmlal_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) {
  vmlal_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vmlal_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vmlal_high_lane_s32(a, b, v, 0);
  vmlal_high_lane_s32(a, b, v, 1);
}
void test_vmlal_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) {
  vmlal_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vmlal_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vmlal_high_laneq_s32(a, b, v, 0);
  vmlal_high_laneq_s32(a, b, v, 3);
}
#endif
void test_vmlal_lane(int64x2_t a, int32x2_t b, int32x2_t v) {
  vmlal_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vmlal_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vmlal_lane_s32(a, b, v, 0);
  vmlal_lane_s32(a, b, v, 1);
}
#if defined(__aarch64__)
void test_vmlal_laneq(int64x2_t a, int32x2_t b, int32x4_t v) {
  vmlal_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vmlal_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vmlal_laneq_s32(a, b, v, 0);
  vmlal_laneq_s32(a, b, v, 3);
}
#endif
// Multiply-subtract and widening-multiply lane variants; the valid lane
// range is [0, 1] for a 2-lane v operand and [0, 3] for a 4-lane one.
void test_vmls_lane(int32x2_t a, int32x2_t b, int32x2_t v) {
  vmls_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vmls_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vmls_lane_s32(a, b, v, 0);
  vmls_lane_s32(a, b, v, 1);
}
void test_vmlsq_lane(int32x4_t a, int32x4_t b, int32x2_t v) {
  vmlsq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vmlsq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vmlsq_lane_s32(a, b, v, 0);
  vmlsq_lane_s32(a, b, v, 1);
}
#if defined(__aarch64__)
void test_vmls_laneq(int32x2_t a, int32x2_t b, int32x4_t v) {
  vmls_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vmls_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vmls_laneq_s32(a, b, v, 0);
  vmls_laneq_s32(a, b, v, 3);
}
void test_vmlsq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) {
  vmlsq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vmlsq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vmlsq_laneq_s32(a, b, v, 0);
  vmlsq_laneq_s32(a, b, v, 3);
}
void test_vmlsl_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) {
  vmlsl_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vmlsl_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vmlsl_high_lane_s32(a, b, v, 0);
  vmlsl_high_lane_s32(a, b, v, 1);
}
void test_vmlsl_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) {
  vmlsl_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vmlsl_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vmlsl_high_laneq_s32(a, b, v, 0);
  vmlsl_high_laneq_s32(a, b, v, 3);
}
#endif
void test_vmlsl_lane(int64x2_t a, int32x2_t b, int32x2_t v) {
  vmlsl_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vmlsl_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vmlsl_lane_s32(a, b, v, 0);
  vmlsl_lane_s32(a, b, v, 1);
}
#if defined(__aarch64__)
void test_vmlsl_laneq(int64x2_t a, int32x2_t b, int32x4_t v) {
  vmlsl_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vmlsl_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vmlsl_laneq_s32(a, b, v, 0);
  vmlsl_laneq_s32(a, b, v, 3);
}
#endif
void test_vmull_lane(int32x2_t a, int32x2_t b) {
  vmull_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vmull_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vmull_lane_s32(a, b, 0);
  vmull_lane_s32(a, b, 1);
}
#if defined(__aarch64__)
void test_vmull_laneq(int32x2_t a, int32x4_t b) {
  vmull_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vmull_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vmull_laneq_s32(a, b, 0);
  vmull_laneq_s32(a, b, 3);
}
void test_vmull_high_lane(int32x4_t a, int32x2_t b) {
  vmull_high_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vmull_high_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vmull_high_lane_s32(a, b, 0);
  vmull_high_lane_s32(a, b, 1);
}
void test_vmull_high_laneq(int32x4_t a, int32x4_t b) {
  vmull_high_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vmull_high_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vmull_high_laneq_s32(a, b, 0);
  vmull_high_laneq_s32(a, b, 3);
}
void test_vqdmlal_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) {
  vqdmlal_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqdmlal_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqdmlal_high_lane_s32(a, b, v, 0);
  vqdmlal_high_lane_s32(a, b, v, 1);
}
void test_vqdmlal_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) {
  vqdmlal_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqdmlal_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqdmlal_high_laneq_s32(a, b, v, 0);
  vqdmlal_high_laneq_s32(a, b, v, 3);
}
#endif
void test_vqdmlal_lane(int64x2_t a, int32x2_t b, int32x2_t v) {
  vqdmlal_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqdmlal_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqdmlal_lane_s32(a, b, v, 0);
  vqdmlal_lane_s32(a, b, v, 1);
}
// Saturating-doubling multiply variants; lane bound tracks the width of the
// vector the lane is read from ([0, 1] for 2 lanes, [0, 3] for 4 lanes).
#if defined(__aarch64__)
void test_vqdmlal_laneq(int64x2_t a, int32x2_t b, int32x4_t v) {
  vqdmlal_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqdmlal_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqdmlal_laneq_s32(a, b, v, 0);
  vqdmlal_laneq_s32(a, b, v, 3);
}
void test_vqdmlsl_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) {
  vqdmlsl_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqdmlsl_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqdmlsl_high_lane_s32(a, b, v, 0);
  vqdmlsl_high_lane_s32(a, b, v, 1);
}
void test_vqdmlsl_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) {
  vqdmlsl_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqdmlsl_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqdmlsl_high_laneq_s32(a, b, v, 0);
  vqdmlsl_high_laneq_s32(a, b, v, 3);
}
#endif
void test_vqdmlsl_lane(int64x2_t a, int32x2_t b, int32x2_t v) {
  vqdmlsl_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqdmlsl_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqdmlsl_lane_s32(a, b, v, 0);
  vqdmlsl_lane_s32(a, b, v, 1);
}
#if defined(__aarch64__)
void test_vqdmlsl_laneq(int64x2_t a, int32x2_t b, int32x4_t v) {
  vqdmlsl_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqdmlsl_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqdmlsl_laneq_s32(a, b, v, 0);
  vqdmlsl_laneq_s32(a, b, v, 3);
}
#endif
void test_vqdmulh_lane(int32x2_t a, int32x2_t b) {
  vqdmulh_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqdmulh_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqdmulh_lane_s32(a, b, 0);
  vqdmulh_lane_s32(a, b, 1);
}
#if defined(__aarch64__)
void test_vqdmulh_laneq(int32x2_t a, int32x4_t b) {
  vqdmulh_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqdmulh_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqdmulh_laneq_s32(a, b, 0);
  vqdmulh_laneq_s32(a, b, 3);
}
void test_vqdmulhq_laneq(int32x4_t a, int32x4_t b) {
  vqdmulhq_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqdmulhq_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqdmulhq_laneq_s32(a, b, 0);
  vqdmulhq_laneq_s32(a, b, 3);
}
void test_vqdmull_high_lane(int32x4_t a, int32x2_t b) {
  vqdmull_high_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqdmull_high_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqdmull_high_lane_s32(a, b, 0);
  vqdmull_high_lane_s32(a, b, 1);
}
void test_vqdmull_high_laneq(int32x4_t a, int32x4_t b) {
  vqdmull_high_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqdmull_high_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqdmull_high_laneq_s32(a, b, 0);
  vqdmull_high_laneq_s32(a, b, 3);
}
#endif
void test_vqdmull_lane(int32x2_t a, int32x2_t v) {
  vqdmull_lane_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqdmull_lane_s32(a, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqdmull_lane_s32(a, v, 0);
  vqdmull_lane_s32(a, v, 1);
}
#if defined(__aarch64__)
void test_vqdmull_laneq(int32x2_t a, int32x4_t v) {
  vqdmull_laneq_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqdmull_laneq_s32(a, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqdmull_laneq_s32(a, v, 0);
  vqdmull_laneq_s32(a, v, 3);
}
#endif
// Rounding saturating-doubling multiply (v8.1a) variants; same lane-range
// pattern: [0, 1] for a 2-lane source, [0, 3] for a 4-lane source.
void test_vqrdmlah_lane(int32x2_t a, int32x2_t b, int32x2_t v) {
  vqrdmlah_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqrdmlah_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqrdmlah_lane_s32(a, b, v, 0);
  vqrdmlah_lane_s32(a, b, v, 1);
}
void test_vqrdmlahq_lane(int32x4_t a, int32x4_t b, int32x2_t v) {
  vqrdmlahq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqrdmlahq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqrdmlahq_lane_s32(a, b, v, 0);
  vqrdmlahq_lane_s32(a, b, v, 1);
}
#if defined(__aarch64__)
void test_vqrdmlah_laneq(int32x2_t a, int32x2_t b, int32x4_t v) {
  vqrdmlah_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqrdmlah_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqrdmlah_laneq_s32(a, b, v, 0);
  vqrdmlah_laneq_s32(a, b, v, 3);
}
void test_vqrdmlahq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) {
  vqrdmlahq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqrdmlahq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqrdmlahq_laneq_s32(a, b, v, 0);
  vqrdmlahq_laneq_s32(a, b, v, 3);
}
#endif
void test_vqrdmlsh_lane(int32x2_t a, int32x2_t b, int32x2_t v) {
  vqrdmlsh_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqrdmlsh_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqrdmlsh_lane_s32(a, b, v, 0);
  vqrdmlsh_lane_s32(a, b, v, 1);
}
void test_vqrdmlshq_lane(int32x4_t a, int32x4_t b, int32x2_t v) {
  vqrdmlshq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqrdmlshq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqrdmlshq_lane_s32(a, b, v, 0);
  vqrdmlshq_lane_s32(a, b, v, 1);
}
#if defined(__aarch64__)
void test_vqrdmlsh_laneq(int32x2_t a, int32x2_t b, int32x4_t v) {
  vqrdmlsh_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqrdmlsh_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqrdmlsh_laneq_s32(a, b, v, 0);
  vqrdmlsh_laneq_s32(a, b, v, 3);
}
void test_vqrdmlshq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) {
  vqrdmlshq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqrdmlshq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqrdmlshq_laneq_s32(a, b, v, 0);
  vqrdmlshq_laneq_s32(a, b, v, 3);
}
#endif
void test_vqrdmulh_lane(int32x2_t a, int32x2_t v) {
  vqrdmulh_lane_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}}
  vqrdmulh_lane_s32(a, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}}
  vqrdmulh_lane_s32(a, v, 0);
  vqrdmulh_lane_s32(a, v, 1);
}
#if defined(__aarch64__)
void test_vqrdmulh_laneq(int32x2_t a, int32x4_t v) {
  vqrdmulh_laneq_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqrdmulh_laneq_s32(a, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqrdmulh_laneq_s32(a, v, 0);
  vqrdmulh_laneq_s32(a, v, 3);
}
void test_vqrdmulhq_laneq(int32x4_t a, int32x4_t v) {
  vqrdmulhq_laneq_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}}
  vqrdmulhq_laneq_s32(a, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
  vqrdmulhq_laneq_s32(a, v, 0);
  vqrdmulhq_laneq_s32(a, v, 3);
}
#endif

View File

@ -773,19 +773,23 @@ float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) {
}
// CHECK-LABEL: test_vmul_lane_f16
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]]
// CHECK: ret <4 x half> [[MUL]]
// Lane 3 is the highest valid index for a 4 x f16 (64-bit) vector.
float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) {
  return vmul_lane_f16(a, b, 3);
}
// CHECK-LABEL: test_vmulq_lane_f16
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]]
// CHECK: ret <8 x half> [[MUL]]
// The second operand is a 4 x f16 vector, so the maximum valid lane is 3;
// the pre-patch value 7 is now rejected by the lane range check.
float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) {
  return vmulq_lane_f16(a, b, 3);
}
// CHECK-LABEL: test_vmul_n_f16
@ -939,17 +943,21 @@ float16x8_t test_vdupq_n_f16(float16_t a) {
}
// CHECK-LABEL: test_vdup_lane_f16
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <4 x half> [[LANE]]
// Lane 3 is the highest valid index for a 4 x f16 (64-bit) vector.
float16x4_t test_vdup_lane_f16(float16x4_t a) {
  return vdup_lane_f16(a, 3);
}
// CHECK-LABEL: test_vdupq_lane_f16
// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
// CHECK: ret <8 x half> [[LANE]]
// The input is a 4 x f16 vector, so the maximum valid lane is 3; the
// pre-patch value 7 is now rejected by the lane range check.
float16x8_t test_vdupq_lane_f16(float16x4_t a) {
  return vdupq_lane_f16(a, 3);
}
// CHECK-LABEL: @test_vext_f16(

View File

@ -28,7 +28,9 @@ float32x4_t test_vdupq_n_f32(float32_t w) {
// this was in <rdar://problem/11778405>, but had already been implemented,
// test anyway
// CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(<1 x double> %V) #0 {
// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %V, <1 x double> %V, <2 x i32> zeroinitializer
// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %V to <8 x i8>
// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer
// CHECK: ret <2 x double> [[SHUFFLE]]
float64x2_t test_vdupq_lane_f64(float64x1_t V) {
return vdupq_lane_f64(V, 0);

File diff suppressed because it is too large Load Diff

View File

@ -239,6 +239,11 @@ public:
NumVectors = 1;
}
// Forces the element width of this type to 32 bits; used by the "32" cast
// operation in emitDagCast so splat lane indices can be emitted as i32.
void make32BitElement() {
  // NOTE(review): the guard reads the total vector Bitwidth while the
  // assignment writes ElementBitwidth — presumably any vector wider than
  // 32 bits can hold at least one 32-bit element; confirm this is intended.
  assert_with_loc(Bitwidth > 32, "Not enough bits to make it 32!");
  ElementBitwidth = 32;
}
void doubleLanes() {
assert_with_loc(Bitwidth != 128, "Can't get bigger than 128!");
Bitwidth = 128;
@ -1496,6 +1501,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagCast(DagInit *DI,
castToType.doubleLanes();
} else if (SI->getAsUnquotedString() == "8") {
castToType.makeInteger(8, true);
} else if (SI->getAsUnquotedString() == "32") {
castToType.make32BitElement();
} else {
castToType = Type::fromTypedefName(SI->getAsUnquotedString());
assert_with_loc(!castToType.isVoid(), "Unknown typedef");