[WebAssembly] Improve codegen for loading scalars from memory to v128

Use load32_zero instead of load32_splat to load the low 32 bits from memory
into a v128. Test cases are added to cover this change.
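For illustration, here is a minimal sketch (not one of the commit's test cases; the function name is made up) of the IR pattern this change affects. A scalar i32 is loaded and used only in lane 0 of a v4i32, so the upper lanes can simply be zeroed rather than replicated:

; Sketch: an i32 scalar load feeding lane 0 of a vector.
define <4 x i32> @load_i32_to_lane0(i32* %p) {
  %x = load i32, i32* %p
  %v = insertelement <4 x i32> undef, i32 %x, i32 0
  ret <4 x i32> %v
}
; Before this patch: v128.load32_splat 0
; After this patch:  v128.load32_zero 0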

Reviewed By: tlively

Differential Revision: https://reviews.llvm.org/D134257
Authored by Fanchen Kong on 2022-09-21 21:05:44 -07:00; committed by Thomas Lively
parent 6782d71680
commit 8a2729fea7
2 changed files with 355 additions and 56 deletions

llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td

@@ -285,14 +285,16 @@ defm "" : SIMDLoadZero<I32x4, 0x5c>;
defm "" : SIMDLoadZero<I64x2, 0x5d>;
// Use load_zero to load scalars into vectors as well where possible.
// TODO: i32, i16, and i8 scalars
def load_scalar :
PatFrag<(ops node:$addr), (scalar_to_vector (i64 (load $addr)))>;
defm : LoadPatNoOffset<v2i64, load_scalar, "LOAD_ZERO_I64x2">;
defm : LoadPatImmOff<v2i64, load_scalar, regPlusImm, "LOAD_ZERO_I64x2">;
defm : LoadPatImmOff<v2i64, load_scalar, or_is_add, "LOAD_ZERO_I64x2">;
defm : LoadPatOffsetOnly<v2i64, load_scalar, "LOAD_ZERO_I64x2">;
defm : LoadPatGlobalAddrOffOnly<v2i64, load_scalar, "LOAD_ZERO_I64x2">;
// TODO: i16 and i8 scalars
foreach vec = [I32x4, I64x2] in {
defvar inst = "LOAD_ZERO_"#vec;
defvar pat = PatFrag<(ops node:$addr), (scalar_to_vector (vec.lane_vt (load $addr)))>;
defm : LoadPatNoOffset<vec.vt, pat, inst>;
defm : LoadPatImmOff<vec.vt, pat, regPlusImm, inst>;
defm : LoadPatImmOff<vec.vt, pat, or_is_add, inst>;
defm : LoadPatOffsetOnly<vec.vt, pat, inst>;
defm : LoadPatGlobalAddrOffOnly<vec.vt, pat, inst>;
}
// TODO: f32x4 and f64x2 as well
foreach vec = [I32x4, I64x2] in {

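For reference, a sketch of what the new foreach above expands to for vec = I32x4 (derived from the loop body; I32x4's lane_vt is i32 and its vt is v4i32; the pat_i32 name is invented for illustration):

// Hypothetical spelled-out instantiation of the loop for I32x4.
defvar pat_i32 = PatFrag<(ops node:$addr),
                         (scalar_to_vector (i32 (load $addr)))>;
defm : LoadPatNoOffset<v4i32, pat_i32, "LOAD_ZERO_I32x4">;
defm : LoadPatImmOff<v4i32, pat_i32, regPlusImm, "LOAD_ZERO_I32x4">;
defm : LoadPatImmOff<v4i32, pat_i32, or_is_add, "LOAD_ZERO_I32x4">;
defm : LoadPatOffsetOnly<v4i32, pat_i32, "LOAD_ZERO_I32x4">;
defm : LoadPatGlobalAddrOffOnly<v4i32, pat_i32, "LOAD_ZERO_I32x4">;

The I64x2 instantiation reproduces the five LoadPat lines that were previously written out by hand for LOAD_ZERO_I64x2.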
llvm/test/CodeGen/WebAssembly/simd-offset.ll

@@ -1160,9 +1160,9 @@ define <4 x i32> @load_splat_v4i32(i32* %addr) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i32(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i32:
; CHECK: .functype load_sext_v4i32 (i32) -> (v128)
define <4 x i32> @load_sext_v4i16_to_v4i32(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i16_to_v4i32:
; CHECK: .functype load_sext_v4i16_to_v4i32 (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.load16x4_s 0
@@ -1172,9 +1172,9 @@ define <4 x i32> @load_sext_v4i32(<4 x i16>* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i32(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i32:
; CHECK: .functype load_zext_v4i32 (i32) -> (v128)
define <4 x i32> @load_zext_v4i16_to_v4i32(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i16_to_v4i32:
; CHECK: .functype load_zext_v4i16_to_v4i32 (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.load16x4_u 0
@@ -1184,6 +1184,39 @@ define <4 x i32> @load_zext_v4i32(<4 x i16>* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i8_to_v4i32(<4 x i8>* %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32:
; CHECK: .functype load_sext_v4i8_to_v4i32 (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, <4 x i8>* %p
%v2 = sext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i8_to_v4i32(<4 x i8>* %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32:
; CHECK: .functype load_zext_v4i8_to_v4i32 (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, <4 x i8>* %p
%v2 = zext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i16> @load_ext_v4i32(<4 x i16>* %p) {
; CHECK-LABEL: load_ext_v4i32:
; CHECK: .functype load_ext_v4i32 (i32) -> (v128)
@@ -1225,9 +1258,9 @@ define <4 x i32> @load_splat_v4i32_with_folded_offset(i32* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i32_with_folded_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i32_with_folded_offset:
; CHECK: .functype load_sext_v4i32_with_folded_offset (i32) -> (v128)
define <4 x i32> @load_sext_v4i16_to_v4i32_with_folded_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i16_to_v4i32_with_folded_offset:
; CHECK: .functype load_sext_v4i16_to_v4i32_with_folded_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.load16x4_s 16
@@ -1240,9 +1273,9 @@ define <4 x i32> @load_sext_v4i32_with_folded_offset(<4 x i16>* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i32_with_folded_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i32_with_folded_offset:
; CHECK: .functype load_zext_v4i32_with_folded_offset (i32) -> (v128)
define <4 x i32> @load_zext_v4i16_to_v4i32_with_folded_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i16_to_v4i32_with_folded_offset:
; CHECK: .functype load_zext_v4i16_to_v4i32_with_folded_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.load16x4_u 16
@@ -1255,6 +1288,45 @@ define <4 x i32> @load_zext_v4i32_with_folded_offset(<4 x i16>* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i8_to_v4i32_with_folded_offset(<4 x i8>* %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_folded_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_folded_offset (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 16
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint <4 x i8>* %p to i32
%r = add nuw i32 %q, 16
%s = inttoptr i32 %r to <4 x i8>*
%v = load <4 x i8>, <4 x i8>* %s
%v2 = sext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i8_to_v4i32_with_folded_offset(<4 x i8>* %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_folded_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_folded_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 16
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint <4 x i8>* %p to i32
%r = add nuw i32 %q, 16
%s = inttoptr i32 %r to <4 x i8>*
%v = load <4 x i8>, <4 x i8>* %s
%v2 = zext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i16> @load_ext_v4i32_with_folded_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_ext_v4i32_with_folded_offset:
; CHECK: .functype load_ext_v4i32_with_folded_offset (i32) -> (v128)
@@ -1295,9 +1367,9 @@ define <4 x i32> @load_splat_v4i32_with_folded_gep_offset(i32* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i32_with_folded_gep_offset:
; CHECK: .functype load_sext_v4i32_with_folded_gep_offset (i32) -> (v128)
define <4 x i32> @load_sext_v4i16_to_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i16_to_v4i32_with_folded_gep_offset:
; CHECK: .functype load_sext_v4i16_to_v4i32_with_folded_gep_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.load16x4_s 8
@@ -1308,9 +1380,9 @@ define <4 x i32> @load_sext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i32_with_folded_gep_offset:
; CHECK: .functype load_zext_v4i32_with_folded_gep_offset (i32) -> (v128)
define <4 x i32> @load_zext_v4i16_to_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i16_to_v4i32_with_folded_gep_offset:
; CHECK: .functype load_zext_v4i16_to_v4i32_with_folded_gep_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32x4.load16x4_u 8
@@ -1321,6 +1393,41 @@ define <4 x i32> @load_zext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i8_to_v4i32_with_folded_gep_offset(<4 x i8>* %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_folded_gep_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 4
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 1
%v = load <4 x i8>, <4 x i8>* %s
%v2 = sext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i8_to_v4i32_with_folded_gep_offset(<4 x i8>* %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_folded_gep_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_folded_gep_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: v128.load32_zero 4
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 1
%v = load <4 x i8>, <4 x i8>* %s
%v2 = zext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i16> @load_ext_v4i32_with_folded_gep_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_ext_v4i32_with_folded_gep_offset:
; CHECK: .functype load_ext_v4i32_with_folded_gep_offset (i32) -> (v128)
@@ -1363,9 +1470,9 @@ define <4 x i32> @load_splat_v4i32_with_unfolded_gep_negative_offset(i32* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_sext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
define <4 x i32> @load_sext_v4i16_to_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i16_to_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_sext_v4i16_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const -8
@@ -1378,9 +1485,9 @@ define <4 x i32> @load_sext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_zext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i16_to_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_zext_v4i16_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const -8
@@ -1393,6 +1500,45 @@ define <4 x i32> @load_zext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset(<4 x i8>* %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 -1
%v = load <4 x i8>, <4 x i8>* %s
%v2 = sext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset(<4 x i8>* %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const -4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: # fallthrough-return
%s = getelementptr inbounds <4 x i8>, <4 x i8>* %p, i32 -1
%v = load <4 x i8>, <4 x i8>* %s
%v2 = zext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i16> @load_ext_v4i32_with_unfolded_gep_negative_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_ext_v4i32_with_unfolded_gep_negative_offset:
; CHECK: .functype load_ext_v4i32_with_unfolded_gep_negative_offset (i32) -> (v128)
@@ -1441,9 +1587,9 @@ define <4 x i32> @load_splat_v4i32_with_unfolded_offset(i32* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i32_with_unfolded_offset:
; CHECK: .functype load_sext_v4i32_with_unfolded_offset (i32) -> (v128)
define <4 x i32> @load_sext_v4i16_to_v4i32_with_unfolded_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i16_to_v4i32_with_unfolded_offset:
; CHECK: .functype load_sext_v4i16_to_v4i32_with_unfolded_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 16
@@ -1458,9 +1604,9 @@ define <4 x i32> @load_sext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i32_with_unfolded_offset:
; CHECK: .functype load_zext_v4i32_with_unfolded_offset (i32) -> (v128)
define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i16_to_v4i32_with_unfolded_offset:
; CHECK: .functype load_zext_v4i16_to_v4i32_with_unfolded_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 16
@@ -1475,6 +1621,49 @@ define <4 x i32> @load_zext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_offset(<4 x i8>* %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint <4 x i8>* %p to i32
%r = add nsw i32 %q, 16
%s = inttoptr i32 %r to <4 x i8>*
%v = load <4 x i8>, <4 x i8>* %s
%v2 = sext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_offset(<4 x i8>* %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 16
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: # fallthrough-return
%q = ptrtoint <4 x i8>* %p to i32
%r = add nsw i32 %q, 16
%s = inttoptr i32 %r to <4 x i8>*
%v = load <4 x i8>, <4 x i8>* %s
%v2 = zext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i16> @load_ext_v4i32_with_unfolded_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_ext_v4i32_with_unfolded_offset:
; CHECK: .functype load_ext_v4i32_with_unfolded_offset (i32) -> (v128)
@@ -1521,9 +1710,9 @@ define <4 x i32> @load_splat_v4i32_with_unfolded_gep_offset(i32* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_sext_v4i32_with_unfolded_gep_offset (i32) -> (v128)
define <4 x i32> @load_sext_v4i16_to_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_sext_v4i16_to_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_sext_v4i16_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 8
@@ -1536,9 +1725,9 @@ define <4 x i32> @load_sext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_zext_v4i32_with_unfolded_gep_offset (i32) -> (v128)
define <4 x i32> @load_zext_v4i16_to_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_zext_v4i16_to_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_zext_v4i16_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 8
@@ -1551,6 +1740,45 @@ define <4 x i32> @load_zext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i8_to_v4i32_with_unfolded_gep_offset(<4 x i8>* %p) {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_sext_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: local.get 1
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: # fallthrough-return
%s = getelementptr <4 x i8>, <4 x i8>* %p, i32 1
%v = load <4 x i8>, <4 x i8>* %s
%v2 = sext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i8_to_v4i32_with_unfolded_gep_offset(<4 x i8>* %p) {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_zext_v4i8_to_v4i32_with_unfolded_gep_offset (i32) -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i32.const 4
; CHECK-NEXT: i32.add
; CHECK-NEXT: v128.load32_zero 0
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: # fallthrough-return
%s = getelementptr <4 x i8>, <4 x i8>* %p, i32 1
%v = load <4 x i8>, <4 x i8>* %s
%v2 = zext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i16> @load_ext_v4i32_with_unfolded_gep_offset(<4 x i16>* %p) {
; CHECK-LABEL: load_ext_v4i32_with_unfolded_gep_offset:
; CHECK: .functype load_ext_v4i32_with_unfolded_gep_offset (i32) -> (v128)
@@ -1591,9 +1819,9 @@ define <4 x i32> @load_splat_v4i32_from_numeric_address() {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i32_from_numeric_address() {
; CHECK-LABEL: load_sext_v4i32_from_numeric_address:
; CHECK: .functype load_sext_v4i32_from_numeric_address () -> (v128)
define <4 x i32> @load_sext_v4i16_to_v4i32_from_numeric_address() {
; CHECK-LABEL: load_sext_v4i16_to_v4i32_from_numeric_address:
; CHECK: .functype load_sext_v4i16_to_v4i32_from_numeric_address () -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: i32x4.load16x4_s 32
@@ -1604,9 +1832,9 @@ define <4 x i32> @load_sext_v4i32_from_numeric_address() {
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i32_from_numeric_address() {
; CHECK-LABEL: load_zext_v4i32_from_numeric_address:
; CHECK: .functype load_zext_v4i32_from_numeric_address () -> (v128)
define <4 x i32> @load_zext_v4i16_to_v4i32_from_numeric_address() {
; CHECK-LABEL: load_zext_v4i16_to_v4i32_from_numeric_address:
; CHECK: .functype load_zext_v4i16_to_v4i32_from_numeric_address () -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: i32x4.load16x4_u 32
@@ -1617,6 +1845,41 @@ define <4 x i32> @load_zext_v4i32_from_numeric_address() {
ret <4 x i32> %v2
}
define <4 x i32> @load_sext_v4i8_to_v4i32_from_numeric_address() {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_from_numeric_address:
; CHECK: .functype load_sext_v4i8_to_v4i32_from_numeric_address () -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero 32
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: # fallthrough-return
%s = inttoptr i32 32 to <4 x i8>*
%v = load <4 x i8>, <4 x i8>* %s
%v2 = sext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i8_to_v4i32_from_numeric_address() {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_from_numeric_address:
; CHECK: .functype load_zext_v4i8_to_v4i32_from_numeric_address () -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero 32
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: # fallthrough-return
%s = inttoptr i32 32 to <4 x i8>*
%v = load <4 x i8>, <4 x i8>* %s
%v2 = zext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i16> @load_ext_v4i32_from_numeric_address() {
; CHECK-LABEL: load_ext_v4i32_from_numeric_address:
; CHECK: .functype load_ext_v4i32_from_numeric_address () -> (v128)
@@ -1656,9 +1919,9 @@ define <4 x i32> @load_splat_v4i32_from_global_address() {
}
@gv_v4i16 = global <4 x i16> <i16 42, i16 42, i16 42, i16 42>
define <4 x i32> @load_sext_v4i32_from_global_address() {
; CHECK-LABEL: load_sext_v4i32_from_global_address:
; CHECK: .functype load_sext_v4i32_from_global_address () -> (v128)
define <4 x i32> @load_sext_v4i16_to_v4i32_from_global_address() {
; CHECK-LABEL: load_sext_v4i16_to_v4i32_from_global_address:
; CHECK: .functype load_sext_v4i16_to_v4i32_from_global_address () -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: i32x4.load16x4_s gv_v4i16
@@ -1668,9 +1931,9 @@ define <4 x i32> @load_sext_v4i32_from_global_address() {
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i32_from_global_address() {
; CHECK-LABEL: load_zext_v4i32_from_global_address:
; CHECK: .functype load_zext_v4i32_from_global_address () -> (v128)
define <4 x i32> @load_zext_v4i16_to_v4i32_from_global_address() {
; CHECK-LABEL: load_zext_v4i16_to_v4i32_from_global_address:
; CHECK: .functype load_zext_v4i16_to_v4i32_from_global_address () -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: i32x4.load16x4_u gv_v4i16
@@ -1680,6 +1943,40 @@ define <4 x i32> @load_zext_v4i32_from_global_address() {
ret <4 x i32> %v2
}
@gv_v4i8 = global <4 x i8> <i8 42, i8 42, i8 42, i8 42>
define <4 x i32> @load_sext_v4i8_to_v4i32_from_global_address() {
; CHECK-LABEL: load_sext_v4i8_to_v4i32_from_global_address:
; CHECK: .functype load_sext_v4i8_to_v4i32_from_global_address () -> (v128)
; CHECK-NEXT: .local v128
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero gv_v4i8
; CHECK-NEXT: local.get 0
; CHECK-NEXT: i8x16.shuffle 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shl
; CHECK-NEXT: i32.const 24
; CHECK-NEXT: i32x4.shr_s
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, <4 x i8>* @gv_v4i8
%v2 = sext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i32> @load_zext_v4i8_to_v4i32_from_global_address() {
; CHECK-LABEL: load_zext_v4i8_to_v4i32_from_global_address:
; CHECK: .functype load_zext_v4i8_to_v4i32_from_global_address () -> (v128)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
; CHECK-NEXT: i32.const 0
; CHECK-NEXT: v128.load32_zero gv_v4i8
; CHECK-NEXT: i8x16.shuffle 16, 1, 2, 3, 17, 5, 6, 7, 18, 9, 10, 11, 19, 13, 14, 15
; CHECK-NEXT: # fallthrough-return
%v = load <4 x i8>, <4 x i8>* @gv_v4i8
%v2 = zext <4 x i8> %v to <4 x i32>
ret <4 x i32> %v2
}
define <4 x i16> @load_ext_v4i32_from_global_address() {
; CHECK-LABEL: load_ext_v4i32_from_global_address:
; CHECK: .functype load_ext_v4i32_from_global_address () -> (v128)