[WebAssembly] Implement truncating vector stores

Rather than expanding truncating stores so that vectors are stored one
lane at a time, lower them to a sequence of instructions using
narrowing operations when possible. Since the narrowing operations
have saturating semantics but truncating stores require truncation,
mask the stored value before narrowing so that the narrow performs a
plain truncation. Also, since narrowing is a binary operation, pass in
the original vector as the unused second argument.

Differential Revision: https://reviews.llvm.org/D84377
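
A minimal C++ sketch (illustration only, not code from this patch) of the reasoning above: narrow_u models one lane of i8x16.narrow_i16x8_u, and only the masked input narrows to the same byte that a truncating store must write.

// Illustration only: unsigned saturating narrow vs. truncation for one lane.
#include <cassert>
#include <cstdint>

// Models one lane of i8x16.narrow_i16x8_u: unsigned saturation from i16 to i8.
static uint8_t narrow_u(uint16_t lane) {
  return lane > 0xFF ? 0xFF : static_cast<uint8_t>(lane);
}

int main() {
  uint16_t lane = 0x1234;
  uint8_t truncated = static_cast<uint8_t>(lane); // 0x34, what truncstore needs
  assert(narrow_u(lane) == 0xFF);                 // saturation alone is wrong
  assert(narrow_u(lane & 0x00FF) == truncated);   // mask first, then narrow
  return 0;
}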
Thomas Lively 2020-07-28 17:46:45 -07:00
parent 068808d102
commit ffd8c23ccb
3 changed files with 418 additions and 5 deletions


@@ -246,6 +246,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal);
}
// And some truncating stores are legal as well
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
}
// Don't do anything clever with build_pairs
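
Marking these combinations Legal keeps SelectionDAG from falling back to the default lane-at-a-time expansion; a rough C++ analogy (illustration only, not LLVM code) of the two lowerings for a v8i16-to-v8i8 truncating store:

// Illustration only: lane-at-a-time expansion vs. the narrowing-based lowering.
#include <cstdint>
#include <cstring>

// Old expansion: eight scalar truncating stores, one per lane.
void store_lane_by_lane(uint8_t *p, const uint16_t v[8]) {
  for (int i = 0; i < 8; ++i)
    p[i] = static_cast<uint8_t>(v[i]);
}

// New lowering: mask, narrow, then write all eight bytes with one 64-bit store.
void store_narrowed(uint8_t *p, const uint16_t v[8]) {
  uint8_t lanes[8];
  for (int i = 0; i < 8; ++i)
    lanes[i] = static_cast<uint8_t>(v[i] & 0x00FF); // v128.and + narrow_..._u
  uint64_t word;
  std::memcpy(&word, lanes, sizeof(word)); // i64x2.extract_lane 0
  std::memcpy(p, &word, sizeof(word));     // i64.store
}

Both functions write the same eight bytes; the second form corresponds to the instruction sequence matched by the patterns added below.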


@@ -885,6 +885,12 @@ defm "" : SIMDConvert<v4i32, v4f32, fp_to_uint, "i32x4.trunc_sat_f32x4_u", 249>;
defm "" : SIMDConvert<v4f32, v4i32, sint_to_fp, "f32x4.convert_i32x4_s", 250>;
defm "" : SIMDConvert<v4f32, v4i32, uint_to_fp, "f32x4.convert_i32x4_u", 251>;
// Lower llvm.wasm.trunc.saturate.* to saturating instructions
def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
(fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>;
def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))),
(fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>;
// Widening operations
multiclass SIMDWiden<ValueType vec_t, string vec, ValueType arg_t, string arg,
bits<32> baseInst> {
@@ -921,11 +927,95 @@ multiclass SIMDNarrow<ValueType vec_t, string vec, ValueType arg_t, string arg,
defm "" : SIMDNarrow<v16i8, "i8x16", v8i16, "i16x8", 101>;
defm "" : SIMDNarrow<v8i16, "i16x8", v4i32, "i32x4", 133>;
// Lower llvm.wasm.trunc.saturate.* to saturating instructions
def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
(fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>;
def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))),
(fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>;
// Use narrowing operations for truncating stores. Since the narrowing
// operations are saturating instead of truncating, we need to mask
// the stored values first.
// TODO: Use consts instead of splats
def store_v8i8_trunc_v8i16 :
OutPatFrag<(ops node:$val),
(EXTRACT_LANE_v2i64
(NARROW_U_v16i8
(AND_v4i32 (SPLAT_v4i32 (CONST_I32 0x00ff00ff)), node:$val),
node:$val // Unused input
),
0
)>;
def store_v4i16_trunc_v4i32 :
OutPatFrag<(ops node:$val),
(EXTRACT_LANE_v2i64
(NARROW_U_v8i16
(AND_v4i32 (SPLAT_v4i32 (CONST_I32 0x0000ffff)), node:$val),
node:$val // Unused input
),
0
)>;
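
The mask is built as a v4i32 splat, so the constant is written per i32 lane: 0x00ff00ff keeps the low byte of both i16 lanes packed into each i32, and 0x0000ffff keeps the low half of each i32 (these show up as 16711935 and 65535 in the tests below). A small C++ check (illustration only, not code from this patch):

// Illustration only: an i32 splat of 0x00FF00FF masks each packed i16 lane to
// its low byte, so one SPLAT_v4i32 stands in for a per-lane v8i16 mask.
#include <cassert>
#include <cstdint>

int main() {
  static_assert(16711935u == 0x00FF00FFu, "constant shown in the tests");
  uint32_t word = 0xABCD1234u;          // two i16 lanes in one i32 lane
  uint32_t masked = word & 0x00FF00FFu; // the v128.and against the splat
  assert((masked & 0xFFFFu) == 0x0034u); // low i16 lane truncated to 0x34
  assert((masked >> 16) == 0x00CDu);     // high i16 lane truncated to 0xCD
  return 0;
}

The second operand passed to NARROW_U_* is the original vector only because the narrow instruction takes two inputs; its lanes land in the upper half of the result, which the following i64x2.extract_lane 0 discards.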
// Store patterns adapted from WebAssemblyInstrMemory.td
multiclass NarrowingStorePatNoOffset<ValueType ty, PatFrag node,
OutPatFrag out> {
def : Pat<(node ty:$val, I32:$addr),
(STORE_I64_A32 0, 0, I32:$addr, (i64 (out ty:$val)))>,
Requires<[HasAddr32]>;
def : Pat<(node ty:$val, I64:$addr),
(STORE_I64_A64 0, 0, I64:$addr, (i64 (out ty:$val)))>,
Requires<[HasAddr64]>;
}
defm : NarrowingStorePatNoOffset<v8i16, truncstorevi8, store_v8i8_trunc_v8i16>;
defm : NarrowingStorePatNoOffset<v4i32, truncstorevi16,
store_v4i16_trunc_v4i32>;
multiclass NarrowingStorePatImmOff<ValueType ty, PatFrag kind,
PatFrag operand, OutPatFrag out> {
def : Pat<(kind ty:$val, (operand I32:$addr, imm:$off)),
(STORE_I64_A32 0, imm:$off, I32:$addr, (i64 (out ty:$val)))>,
Requires<[HasAddr32]>;
def : Pat<(kind ty:$val, (operand I64:$addr, imm:$off)),
(STORE_I64_A64 0, imm:$off, I64:$addr, (i64 (out ty:$val)))>,
Requires<[HasAddr64]>;
}
defm : NarrowingStorePatImmOff<v8i16, truncstorevi8, regPlusImm,
store_v8i8_trunc_v8i16>;
defm : NarrowingStorePatImmOff<v4i32, truncstorevi16, regPlusImm,
store_v4i16_trunc_v4i32>;
defm : NarrowingStorePatImmOff<v8i16, truncstorevi8, or_is_add,
store_v8i8_trunc_v8i16>;
defm : NarrowingStorePatImmOff<v4i32, truncstorevi16, or_is_add,
store_v4i16_trunc_v4i32>;
multiclass NarrowingStorePatOffsetOnly<ValueType ty, PatFrag kind,
OutPatFrag out> {
def : Pat<(kind ty:$val, imm:$off),
(STORE_I64_A32 0, imm:$off, (CONST_I32 0), (i64 (out ty:$val)))>,
Requires<[HasAddr32]>;
def : Pat<(kind ty:$val, imm:$off),
(STORE_I64_A64 0, imm:$off, (CONST_I64 0), (i64 (out ty:$val)))>,
Requires<[HasAddr64]>;
}
defm : NarrowingStorePatOffsetOnly<v8i16, truncstorevi8,
store_v8i8_trunc_v8i16>;
defm : NarrowingStorePatOffsetOnly<v4i32, truncstorevi16,
store_v4i16_trunc_v4i32>;
multiclass NarrowingStorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind,
OutPatFrag out> {
def : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
(STORE_I64_A32
0, tglobaladdr:$off, (CONST_I32 0), (i64 (out ty:$val)))>,
Requires<[IsNotPIC, HasAddr32]>;
def : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
(STORE_I64_A64
0, tglobaladdr:$off, (CONST_I64 0), (i64 (out ty:$val)))>,
Requires<[IsNotPIC, HasAddr64]>;
}
defm : NarrowingStorePatGlobalAddrOffOnly<v8i16, truncstorevi8,
store_v8i8_trunc_v8i16>;
defm : NarrowingStorePatGlobalAddrOffOnly<v4i32, truncstorevi16,
store_v4i16_trunc_v4i32>;
// Bitcasts are nops
// Matching bitcast t1 to t1 causes strange errors, so avoid repeating types


@@ -918,6 +918,24 @@ define void @store_v8i16(<8 x i16> %v, <8 x i16>* %p) {
ret void
}
define void @store_narrowing_v8i16(<8 x i8> %v, <8 x i8>* %p) {
; CHECK-LABEL: store_narrowing_v8i16:
; CHECK: .functype store_narrowing_v8i16 (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 16711935
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.narrow_i16x8_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
store <8 x i8> %v, <8 x i8>* %p
ret void
}
define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
; CHECK-LABEL: store_v8i16_with_folded_offset:
; CHECK: .functype store_v8i16_with_folded_offset (v128, i32) -> ()
@@ -933,6 +951,27 @@ define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
ret void
}
define void @store_narrowing_v8i16_with_folded_offset(<8 x i8> %v, <8 x i8>* %p) {
; CHECK-LABEL: store_narrowing_v8i16_with_folded_offset:
; CHECK: .functype store_narrowing_v8i16_with_folded_offset (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 16711935
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.narrow_i16x8_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 16
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint <8 x i8>* %p to i32
%r = add nuw i32 %q, 16
%s = inttoptr i32 %r to <8 x i8>*
store <8 x i8> %v , <8 x i8>* %s
ret void
}
define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
; CHECK-LABEL: store_v8i16_with_folded_gep_offset:
; CHECK: .functype store_v8i16_with_folded_gep_offset (v128, i32) -> ()
@@ -946,6 +985,25 @@ define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
ret void
}
define void @store_narrowing_v8i16_with_folded_gep_offset(<8 x i8> %v, <8 x i8>* %p) {
; CHECK-LABEL: store_narrowing_v8i16_with_folded_gep_offset:
; CHECK: .functype store_narrowing_v8i16_with_folded_gep_offset (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 16711935
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.narrow_i16x8_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 1
store <8 x i8> %v , <8 x i8>* %s
ret void
}
define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i16>* %p) {
; CHECK-LABEL: store_v8i16_with_unfolded_gep_negative_offset:
; CHECK: .functype store_v8i16_with_unfolded_gep_negative_offset (v128, i32) -> ()
@@ -961,6 +1019,27 @@ define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i1
ret void
}
define void @store_narrowing_v8i16_with_unfolded_gep_negative_offset(<8 x i8> %v, <8 x i8>* %p) {
; CHECK-LABEL: store_narrowing_v8i16_with_unfolded_gep_negative_offset:
; CHECK: .functype store_narrowing_v8i16_with_unfolded_gep_negative_offset (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const -8
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.const 16711935
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.narrow_i16x8_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 -1
store <8 x i8> %v , <8 x i8>* %s
ret void
}
define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) {
; CHECK-LABEL: store_v8i16_with_unfolded_offset:
; CHECK: .functype store_v8i16_with_unfolded_offset (v128, i32) -> ()
@@ -978,6 +1057,29 @@ define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) {
ret void
}
define void @store_narrowing_v8i16_with_unfolded_offset(<8 x i8> %v, <8 x i8>* %p) {
; CHECK-LABEL: store_narrowing_v8i16_with_unfolded_offset:
; CHECK: .functype store_narrowing_v8i16_with_unfolded_offset (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.const 16711935
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.narrow_i16x8_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint <8 x i8>* %p to i32
%r = add nsw i32 %q, 16
%s = inttoptr i32 %r to <8 x i8>*
store <8 x i8> %v , <8 x i8>* %s
ret void
}
define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
; CHECK-LABEL: store_v8i16_with_unfolded_gep_offset:
; CHECK: .functype store_v8i16_with_unfolded_gep_offset (v128, i32) -> ()
@@ -993,6 +1095,27 @@ define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
ret void
}
define void @store_narrowing_v8i16_with_unfolded_gep_offset(<8 x i8> %v, <8 x i8>* %p) {
; CHECK-LABEL: store_narrowing_v8i16_with_unfolded_gep_offset:
; CHECK: .functype store_narrowing_v8i16_with_unfolded_gep_offset (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 8
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.const 16711935
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.narrow_i16x8_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
%s = getelementptr <8 x i8>, <8 x i8>* %p, i32 1
store <8 x i8> %v , <8 x i8>* %s
ret void
}
define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
; CHECK-LABEL: store_v8i16_to_numeric_address:
; CHECK: .functype store_v8i16_to_numeric_address (v128) -> ()
@@ -1006,6 +1129,25 @@ define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
ret void
}
define void @store_narrowing_v8i16_to_numeric_address(<8 x i8> %v, <8 x i8>* %p) {
; CHECK-LABEL: store_narrowing_v8i16_to_numeric_address:
; CHECK: .functype store_narrowing_v8i16_to_numeric_address (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: i32.const 16711935
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.narrow_i16x8_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 32
; CHECK-NEXT: # fallthrough-return
%s = inttoptr i32 32 to <8 x i8>*
store <8 x i8> %v , <8 x i8>* %s
ret void
}
define void @store_v8i16_to_global_address(<8 x i16> %v) {
; CHECK-LABEL: store_v8i16_to_global_address:
; CHECK: .functype store_v8i16_to_global_address (v128) -> ()
@@ -1018,6 +1160,24 @@ define void @store_v8i16_to_global_address(<8 x i16> %v) {
ret void
}
define void @store_narrowing_v8i16_to_global_address(<8 x i8> %v) {
; CHECK-LABEL: store_narrowing_v8i16_to_global_address:
; CHECK: .functype store_narrowing_v8i16_to_global_address (v128) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: i32.const 16711935
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.narrow_i16x8_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store gv_v8i8
; CHECK-NEXT: # fallthrough-return
store <8 x i8> %v , <8 x i8>* @gv_v8i8
ret void
}
; ==============================================================================
; 4 x i32
; ==============================================================================
@@ -1588,6 +1748,24 @@ define void @store_v4i32(<4 x i32> %v, <4 x i32>* %p) {
ret void
}
define void @store_narrowing_v4i32(<4 x i16> %v, <4 x i16>* %p) {
; CHECK-LABEL: store_narrowing_v4i32:
; CHECK: .functype store_narrowing_v4i32 (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 65535
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
store <4 x i16> %v , <4 x i16>* %p
ret void
}
define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
; CHECK-LABEL: store_v4i32_with_folded_offset:
; CHECK: .functype store_v4i32_with_folded_offset (v128, i32) -> ()
@@ -1603,6 +1781,27 @@ define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
ret void
}
define void @store_narrowing_v4i32_with_folded_offset(<4 x i16> %v, <4 x i16>* %p) {
; CHECK-LABEL: store_narrowing_v4i32_with_folded_offset:
; CHECK: .functype store_narrowing_v4i32_with_folded_offset (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 65535
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 16
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint <4 x i16>* %p to i32
%r = add nuw i32 %q, 16
%s = inttoptr i32 %r to <4 x i16>*
store <4 x i16> %v , <4 x i16>* %s
ret void
}
define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
; CHECK-LABEL: store_v4i32_with_folded_gep_offset:
; CHECK: .functype store_v4i32_with_folded_gep_offset (v128, i32) -> ()
@@ -1616,6 +1815,25 @@ define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
ret void
}
define void @store_narrowing_v4i32_with_folded_gep_offset(<4 x i16> %v, <4 x i16>* %p) {
; CHECK-LABEL: store_narrowing_v4i32_with_folded_gep_offset:
; CHECK: .functype store_narrowing_v4i32_with_folded_gep_offset (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 65535
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 8
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 1
store <4 x i16> %v , <4 x i16>* %s
ret void
}
define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i32>* %p) {
; CHECK-LABEL: store_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype store_v4i32_with_unfolded_gep_negative_offset (v128, i32) -> ()
@@ -1631,6 +1849,27 @@ define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i3
ret void
}
define void @store_narrowing_v4i32_with_unfolded_gep_negative_offset(<4 x i16> %v, <4 x i16>* %p) {
; CHECK-LABEL: store_narrowing_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype store_narrowing_v4i32_with_unfolded_gep_negative_offset (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const -8
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.const 65535
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 -1
store <4 x i16> %v , <4 x i16>* %s
ret void
}
define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) {
; CHECK-LABEL: store_v4i32_with_unfolded_offset:
; CHECK: .functype store_v4i32_with_unfolded_offset (v128, i32) -> ()
@@ -1648,6 +1887,29 @@ define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) {
ret void
}
define void @store_narrowing_v4i32_with_unfolded_offset(<4 x i16> %v, <4 x i16>* %p) {
; CHECK-LABEL: store_narrowing_v4i32_with_unfolded_offset:
; CHECK: .functype store_narrowing_v4i32_with_unfolded_offset (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.const 65535
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint <4 x i16>* %p to i32
%r = add nsw i32 %q, 16
%s = inttoptr i32 %r to <4 x i16>*
store <4 x i16> %v , <4 x i16>* %s
ret void
}
define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
; CHECK-LABEL: store_v4i32_with_unfolded_gep_offset:
; CHECK: .functype store_v4i32_with_unfolded_gep_offset (v128, i32) -> ()
@@ -1663,6 +1925,27 @@ define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
ret void
}
define void @store_narrowing_v4i32_with_unfolded_gep_offset(<4 x i16> %v, <4 x i16>* %p) {
; CHECK-LABEL: store_narrowing_v4i32_with_unfolded_gep_offset:
; CHECK: .functype store_narrowing_v4i32_with_unfolded_gep_offset (v128, i32) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i32.const 8
; CHECK-NEXT: i32.add
; CHECK-NEXT: i32.const 65535
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 0
; CHECK-NEXT: # fallthrough-return
%s = getelementptr <4 x i16>, <4 x i16>* %p, i32 1
store <4 x i16> %v , <4 x i16>* %s
ret void
}
define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
; CHECK-LABEL: store_v4i32_to_numeric_address:
; CHECK: .functype store_v4i32_to_numeric_address (v128) -> ()
@@ -1676,6 +1959,25 @@ define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
ret void
}
define void @store_narrowing_v4i32_to_numeric_address(<4 x i16> %v) {
; CHECK-LABEL: store_narrowing_v4i32_to_numeric_address:
; CHECK: .functype store_narrowing_v4i32_to_numeric_address (v128) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: i32.const 65535
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store 32
; CHECK-NEXT: # fallthrough-return
%s = inttoptr i32 32 to <4 x i16>*
store <4 x i16> %v , <4 x i16>* %s
ret void
}
define void @store_v4i32_to_global_address(<4 x i32> %v) {
; CHECK-LABEL: store_v4i32_to_global_address:
; CHECK: .functype store_v4i32_to_global_address (v128) -> ()
@@ -1688,6 +1990,24 @@ define void @store_v4i32_to_global_address(<4 x i32> %v) {
ret void
}
define void @store_narrowing_v4i32_to_global_address(<4 x i16> %v) {
; CHECK-LABEL: store_narrowing_v4i32_to_global_address:
; CHECK: .functype store_narrowing_v4i32_to_global_address (v128) -> ()
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: i32.const 65535
; CHECK-NEXT: i32x4.splat
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.and
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i16x8.narrow_i32x4_u
; CHECK-NEXT: i64x2.extract_lane 0
; CHECK-NEXT: i64.store gv_v4i16
; CHECK-NEXT: # fallthrough-return
store <4 x i16> %v , <4 x i16>* @gv_v4i16
ret void
}
; ==============================================================================
; 2 x i64
; ==============================================================================