[WebAssembly] Implement prototype v128.load{32,64}_zero instructions

Specified in https://github.com/WebAssembly/simd/pull/237, these instructions load the first vector lane from memory and zero the other lanes. Since these instructions are not officially part of the SIMD proposal, they are only available on an opt-in basis via LLVM intrinsics and clang builtin functions. If these instructions are merged into the proposal, this implementation will change so that the instructions are generated from normal IR. At that point the intrinsics and builtin functions will be removed. This PR also changes the opcodes for the experimental f32x4.qfm{a,s} instructions because their opcodes conflicted with those of the v128.load{32,64}_zero instructions. The new opcodes were chosen to match those used in V8. As a consequence, i32x4.dot_i16x8_s moves from opcode 180 (0xb4) to 186 (0xba), because f32x4.qfma now occupies 180. Differential Revision: https://reviews.llvm.org/D84820
2020-08-03 13:54:00 -07:00 · 2020-08-03 13:54:00 -07:00 · cb32792210
parent 66e7dce714
commit cb32792210
10 changed files with 334 additions and 12 deletions
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@ -169,5 +169,8 @@ TARGET_BUILTIN(__builtin_wasm_narrow_u_i8x16_i16x8, "V16cV8sV8s", "nc", "simd128
 TARGET_BUILTIN(__builtin_wasm_narrow_s_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_narrow_u_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128")

+TARGET_BUILTIN(__builtin_wasm_load32_zero, "V4ii*", "nU", "simd128")
+TARGET_BUILTIN(__builtin_wasm_load64_zero, "V2LLiLLi*", "nU", "simd128")
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@ -16497,6 +16497,16 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID,
        CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()});
    return Builder.CreateCall(Callee, {Low, High});
  }
+  case WebAssembly::BI__builtin_wasm_load32_zero: {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load32_zero);
+    return Builder.CreateCall(Callee, {Ptr});
+  }
+  case WebAssembly::BI__builtin_wasm_load64_zero: {
+    Value *Ptr = EmitScalarExpr(E->getArg(0));
+    Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_load64_zero);
+    return Builder.CreateCall(Callee, {Ptr});
+  }
  case WebAssembly::BI__builtin_wasm_shuffle_v8x16: {
    Value *Ops[18];
    size_t OpIdx = 0;
--- a/clang/test/CodeGen/builtins-wasm.c
+++ b/clang/test/CodeGen/builtins-wasm.c
@ -737,6 +737,18 @@ i16x8 narrow_u_i16x8_i32x4(i32x4 low, i32x4 high) {
  // WEBASSEMBLY: ret
 }

+i32x4 load32_zero(int *p) {
+  return __builtin_wasm_load32_zero(p);
+  // WEBASSEMBLY: call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
+  // WEBASSEMBLY: ret
+}
+
+i64x2 load64_zero(long long *p) {
+  return __builtin_wasm_load64_zero(p);
+  // WEBASSEMBLY: call <2 x i64> @llvm.wasm.load64.zero(i64* %p)
+  // WEBASSEMBLY: ret
+}
+
 i8x16 swizzle_v8x16(i8x16 x, i8x16 y) {
  return __builtin_wasm_swizzle_v8x16(x, y);
  // WEBASSEMBLY: call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y)
--- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
+++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td
@ -190,6 +190,20 @@ def int_wasm_nearest :
            [LLVMMatchType<0>],
            [IntrNoMem, IntrSpeculatable]>;

+// Prototype v128.load{32,64}_zero intrinsics. Each takes a pointer to a
+// scalar (i32 or i64) and returns a full vector (v4i32 or v2i64). Both are
+// declared with IntrReadMem and IntrArgMemOnly, i.e. they only read memory,
+// and only memory reached through their pointer argument.
+// TODO: Replace these intrinsics with normal ISel patterns once the
+// load_zero instructions are merged to the proposal.
+def int_wasm_load32_zero :
+  Intrinsic<[llvm_v4i32_ty],
+            [LLVMPointerType<llvm_i32_ty>],
+            [IntrReadMem, IntrArgMemOnly],
+             "", [SDNPMemOperand]>;
+
+def int_wasm_load64_zero :
+  Intrinsic<[llvm_v2i64_ty],
+            [LLVMPointerType<llvm_i64_ty>],
+            [IntrReadMem, IntrArgMemOnly],
+             "", [SDNPMemOperand]>;
+
 //===----------------------------------------------------------------------===//
 // Thread-local storage intrinsics
 //===----------------------------------------------------------------------===//
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@ -232,6 +232,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
  WASM_LOAD_STORE(ATOMIC_NOTIFY)
  WASM_LOAD_STORE(ATOMIC_WAIT_I32)
  WASM_LOAD_STORE(LOAD_SPLAT_v32x4)
+  WASM_LOAD_STORE(LOAD_ZERO_v4i32)
    return 2;
  WASM_LOAD_STORE(LOAD_I64)
  WASM_LOAD_STORE(LOAD_F64)
@ -254,6 +255,7 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
  WASM_LOAD_STORE(LOAD_EXTEND_U_v4i32)
  WASM_LOAD_STORE(LOAD_EXTEND_S_v2i64)
  WASM_LOAD_STORE(LOAD_EXTEND_U_v2i64)
+  WASM_LOAD_STORE(LOAD_ZERO_v2i64)
    return 3;
  WASM_LOAD_STORE(LOAD_V128)
  WASM_LOAD_STORE(STORE_V128)
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@ -675,6 +675,15 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
    Info.align = Align(8);
    Info.flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad;
    return true;
+  case Intrinsic::wasm_load32_zero:
+  case Intrinsic::wasm_load64_zero:
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = Intrinsic == Intrinsic::wasm_load32_zero ? MVT::i32 : MVT::i64;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = Info.memVT == MVT::i32 ? Align(4) : Align(8);
+    Info.flags = MachineMemOperand::MOLoad;
+    return true;
  default:
    return false;
  }
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@ -70,7 +70,7 @@ defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b, []>;
 multiclass LoadPatNoOffset<ValueType ty, PatFrag kind, string inst> {
  def : Pat<(ty (kind I32:$addr)), (!cast<NI>(inst # "_A32") 0, 0, I32:$addr)>,
        Requires<[HasAddr32]>;
-  def : Pat<(ty (kind I64:$addr)), (!cast<NI>(inst # "_A64") 0, 0, I64:$addr)>,
+  def : Pat<(ty (kind (i64 I64:$addr))), (!cast<NI>(inst # "_A64") 0, 0, I64:$addr)>,
        Requires<[HasAddr64]>;
 }

--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@ -163,6 +163,43 @@ defm : LoadPatGlobalAddrOffOnly<types[0], !cast<PatFrag>(exts[0]#types[1]),
                                "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
 }

+// Load lane into zero vector
+//
+// Defines the LOAD_ZERO_<vec_t>_A32 and LOAD_ZERO_<vec_t>_A64 instructions.
+// The two variants differ only in their address operand (I32 vs. I64) and
+// offset operand (offset32_op vs. offset64_op); both share the mnemonic
+// `name` and the opcode `simdop`.
+multiclass SIMDLoadZero<ValueType vec_t, string name, bits<32> simdop> {
+  let mayLoad = 1, UseNamedOperandTable = 1 in {
+  // The pattern list is left empty ([]) here; the selection patterns for
+  // these instructions are attached separately through the LoadPat*
+  // multiclasses.
+  defm LOAD_ZERO_#vec_t#_A32 :
+    SIMD_I<(outs V128:$dst),
+           (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+           (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+           name#"\t$dst, ${off}(${addr})$p2align",
+           name#"\t$off$p2align", simdop>;
+  defm LOAD_ZERO_#vec_t#_A64 :
+    SIMD_I<(outs V128:$dst),
+           (ins P2Align:$p2align, offset64_op:$off, I64:$addr),
+           (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+           name#"\t$dst, ${off}(${addr})$p2align",
+           name#"\t$off$p2align", simdop>;
+  } // mayLoad = 1, UseNamedOperandTable = 1
+}
+
+// TODO: Also support v4f32 and v2f64 once the instructions are merged
+// to the proposal.
+defm "" : SIMDLoadZero<v4i32, "v128.load32_zero", 252>;
+defm "" : SIMDLoadZero<v2i64, "v128.load64_zero", 253>;
+
+// Select the load_zero intrinsics to the LOAD_ZERO_* instructions. Each
+// LoadPat* helper is instantiated once per vector type (v4i32 / v2i64).
+// LoadPatNoOffset matches a bare address with a zero offset; the remaining
+// helpers are defined outside this hunk and presumably cover the
+// constant-offset and global-address forms -- confirm against their
+// definitions in WebAssemblyInstrMemory.td.
+defm : LoadPatNoOffset<v4i32, int_wasm_load32_zero, "LOAD_ZERO_v4i32">;
+defm : LoadPatNoOffset<v2i64, int_wasm_load64_zero, "LOAD_ZERO_v2i64">;
+
+defm : LoadPatImmOff<v4i32, int_wasm_load32_zero, regPlusImm, "LOAD_ZERO_v4i32">;
+defm : LoadPatImmOff<v2i64, int_wasm_load64_zero, regPlusImm, "LOAD_ZERO_v2i64">;
+
+defm : LoadPatImmOff<v4i32, int_wasm_load32_zero, or_is_add, "LOAD_ZERO_v4i32">;
+defm : LoadPatImmOff<v2i64, int_wasm_load64_zero, or_is_add, "LOAD_ZERO_v2i64">;
+
+defm : LoadPatOffsetOnly<v4i32, int_wasm_load32_zero, "LOAD_ZERO_v4i32">;
+defm : LoadPatOffsetOnly<v2i64, int_wasm_load64_zero, "LOAD_ZERO_v2i64">;
+
+defm : LoadPatGlobalAddrOffOnly<v4i32, int_wasm_load32_zero, "LOAD_ZERO_v4i32">;
+defm : LoadPatGlobalAddrOffOnly<v2i64, int_wasm_load64_zero, "LOAD_ZERO_v2i64">;

 // Store: v128.store
 let mayStore = 1, UseNamedOperandTable = 1 in {
@ -800,7 +837,7 @@ let isCommutable = 1 in
 defm DOT : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
                  [(set V128:$dst, (int_wasm_dot V128:$lhs, V128:$rhs))],
                  "i32x4.dot_i16x8_s\t$dst, $lhs, $rhs", "i32x4.dot_i16x8_s",
-                  180>;
+                  186>;

 //===----------------------------------------------------------------------===//
 // Floating-point unary arithmetic
@ -1038,20 +1075,21 @@ def : Pat<(t1 (bitconvert (t2 V128:$v))), (t1 V128:$v)>;
 // Quasi-Fused Multiply- Add and Subtract (QFMA/QFMS)
 //===----------------------------------------------------------------------===//

-multiclass SIMDQFM<ValueType vec_t, string vec, bits<32> baseInst> {
+multiclass SIMDQFM<ValueType vec_t, string vec, bits<32> simdopA,
+                   bits<32> simdopS> {
  defm QFMA_#vec_t :
    SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c),
           (outs), (ins),
           [(set (vec_t V128:$dst),
             (int_wasm_qfma (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))],
-           vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", baseInst>;
+           vec#".qfma\t$dst, $a, $b, $c", vec#".qfma", simdopA>;
  defm QFMS_#vec_t :
    SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c),
           (outs), (ins),
           [(set (vec_t V128:$dst),
             (int_wasm_qfms (vec_t V128:$a), (vec_t V128:$b), (vec_t V128:$c)))],
-           vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", !add(baseInst, 1)>;
+           vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", simdopS>;
 }

-defm "" : SIMDQFM<v4f32, "f32x4", 252>;
-defm "" : SIMDQFM<v2f64, "f64x2", 254>;
+defm "" : SIMDQFM<v4f32, "f32x4", 180, 212>;
+defm "" : SIMDQFM<v2f64, "f64x2", 254, 255>;
--- a/llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
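+; Test that the prototype llvm.wasm.load{32,64}.zero intrinsics select to the
+; SIMD v128.load{32,64}_zero instructions, and which address computations are
+; folded into the instruction's offset immediate.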
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare <4 x i32> @llvm.wasm.load32.zero(i32*)
+declare <2 x i64> @llvm.wasm.load64.zero(i64*)
+
+;===----------------------------------------------------------------------------
+; v128.load32_zero
+;===----------------------------------------------------------------------------
+
+define <4 x i32> @load_zero_i32_no_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_no_offset:
+; CHECK:         .functype load_zero_i32_no_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    # fallthrough-return
+  %v = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @load_zero_i32_with_folded_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_offset:
+; CHECK:         .functype load_zero_i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load32_zero 24
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint i32* %p to i32
+  %r = add nuw i32 %q, 24
+  %s = inttoptr i32 %r to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_folded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_gep_offset:
+; CHECK:         .functype load_zero_i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load32_zero 24
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_gep_negative_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_zero_i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -24
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 -6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_offset:
+; CHECK:         .functype load_zero_i32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint i32* %p to i32
+  %r = add nsw i32 %q, 24
+  %s = inttoptr i32 %r to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_gep_offset:
+; CHECK:         .functype load_zero_i32_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load32_zero 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr i32, i32* %p, i32 6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_from_numeric_address() {
+; CHECK-LABEL: load_zero_i32_from_numeric_address:
+; CHECK:         .functype load_zero_i32_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load32_zero 42
+; CHECK-NEXT:    # fallthrough-return
+  %s = inttoptr i32 42 to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+@gv_i32 = global i32 0
+define <4 x i32> @load_zero_i32_from_global_address() {
+; CHECK-LABEL: load_zero_i32_from_global_address:
+; CHECK:         .functype load_zero_i32_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load32_zero gv_i32
+; CHECK-NEXT:    # fallthrough-return
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* @gv_i32)
+  ret <4 x i32> %t
+}
+
+;===----------------------------------------------------------------------------
+; v128.load64_zero
+;===----------------------------------------------------------------------------
+
+define <2 x i64> @load_zero_i64_no_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_no_offset:
+; CHECK:         .functype load_zero_i64_no_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    # fallthrough-return
+  %v = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %p)
+  ret <2 x i64> %v
+}
+
+define <2 x i64> @load_zero_i64_with_folded_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_with_folded_offset:
+; CHECK:         .functype load_zero_i64_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load64_zero 24
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint i64* %p to i32
+  %r = add nuw i32 %q, 24
+  %s = inttoptr i32 %r to i64*
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @load_zero_i64_with_folded_gep_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_with_folded_gep_offset:
+; CHECK:         .functype load_zero_i64_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.load64_zero 48
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds i64, i64* %p, i64 6
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @load_zero_i64_with_unfolded_gep_negative_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_with_unfolded_gep_negative_offset:
+; CHECK:         .functype load_zero_i64_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -48
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds i64, i64* %p, i64 -6
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @load_zero_i64_with_unfolded_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_with_unfolded_offset:
+; CHECK:         .functype load_zero_i64_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 24
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint i64* %p to i32
+  %r = add nsw i32 %q, 24
+  %s = inttoptr i32 %r to i64*
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @load_zero_i64_with_unfolded_gep_offset(i64* %p) {
+; CHECK-LABEL: load_zero_i64_with_unfolded_gep_offset:
+; CHECK:         .functype load_zero_i64_with_unfolded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const 48
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    v128.load64_zero 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr i64, i64* %p, i64 6
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+define <2 x i64> @load_zero_i64_from_numeric_address() {
+; CHECK-LABEL: load_zero_i64_from_numeric_address:
+; CHECK:         .functype load_zero_i64_from_numeric_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load64_zero 42
+; CHECK-NEXT:    # fallthrough-return
+  %s = inttoptr i32 42 to i64*
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* %s)
+  ret <2 x i64> %t
+}
+
+@gv_i64 = global i64 0
+define <2 x i64> @load_zero_i64_from_global_address() {
+; CHECK-LABEL: load_zero_i64_from_global_address:
+; CHECK:         .functype load_zero_i64_from_global_address () -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    v128.load64_zero gv_i64
+; CHECK-NEXT:    # fallthrough-return
+  %t = tail call <2 x i64> @llvm.wasm.load64.zero(i64* @gv_i64)
+  ret <2 x i64> %t
+}
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@ -463,9 +463,6 @@ main:
    # CHECK: i32x4.sub # encoding: [0xfd,0xb1,0x01]
    i32x4.sub

-    # CHECK: i32x4.dot_i16x8_s # encoding: [0xfd,0xb4,0x01]
-    i32x4.dot_i16x8_s
-
    # CHECK: i32x4.mul # encoding: [0xfd,0xb5,0x01]
    i32x4.mul

@ -481,6 +478,9 @@ main:
    # CHECK: i32x4.max_u # encoding: [0xfd,0xb9,0x01]
    i32x4.max_u

+    # CHECK: i32x4.dot_i16x8_s # encoding: [0xfd,0xba,0x01]
+    i32x4.dot_i16x8_s
+
    # CHECK: i64x2.neg # encoding: [0xfd,0xc1,0x01]
    i64x2.neg

@ -610,10 +610,16 @@ main:
    # CHECK: f32x4.convert_i32x4_u # encoding: [0xfd,0xfb,0x01]
    f32x4.convert_i32x4_u

-    # CHECK: f32x4.qfma # encoding: [0xfd,0xfc,0x01]
+    # CHECK: v128.load32_zero 32 # encoding: [0xfd,0xfc,0x01,0x02,0x20]
+    v128.load32_zero 32
+
+    # CHECK: v128.load64_zero 32 # encoding: [0xfd,0xfd,0x01,0x03,0x20]
+    v128.load64_zero 32
+
+    # CHECK: f32x4.qfma # encoding: [0xfd,0xb4,0x01]
    f32x4.qfma

-    # CHECK: f32x4.qfms # encoding: [0xfd,0xfd,0x01]
+    # CHECK: f32x4.qfms # encoding: [0xfd,0xd4,0x01]
    f32x4.qfms

    # CHECK: f64x2.qfma # encoding: [0xfd,0xfe,0x01]