diff --git a/llvm/test/CodeGen/ARM/vld1.ll b/llvm/test/CodeGen/ARM/vld1.ll index f5383aafb2bb..c61ea8c9a789 100644 --- a/llvm/test/CodeGen/ARM/vld1.ll +++ b/llvm/test/CodeGen/ARM/vld1.ll @@ -10,28 +10,32 @@ define <8 x i8> @vld1i8(i8* %A) nounwind { define <4 x i16> @vld1i16(i16* %A) nounwind { ;CHECK: vld1i16: ;CHECK: vld1.16 - %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i16* %A) + %tmp0 = bitcast i16* %A to i8* + %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0) ret <4 x i16> %tmp1 } define <2 x i32> @vld1i32(i32* %A) nounwind { ;CHECK: vld1i32: ;CHECK: vld1.32 - %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i32* %A) + %tmp0 = bitcast i32* %A to i8* + %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0) ret <2 x i32> %tmp1 } define <2 x float> @vld1f(float* %A) nounwind { ;CHECK: vld1f: ;CHECK: vld1.32 - %tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32(float* %A) + %tmp0 = bitcast float* %A to i8* + %tmp1 = call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %tmp0) ret <2 x float> %tmp1 } define <1 x i64> @vld1i64(i64* %A) nounwind { ;CHECK: vld1i64: ;CHECK: vld1.64 - %tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64(i64* %A) + %tmp0 = bitcast i64* %A to i8* + %tmp1 = call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %tmp0) ret <1 x i64> %tmp1 } @@ -45,28 +49,32 @@ define <16 x i8> @vld1Qi8(i8* %A) nounwind { define <8 x i16> @vld1Qi16(i16* %A) nounwind { ;CHECK: vld1Qi16: ;CHECK: vld1.16 - %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i16* %A) + %tmp0 = bitcast i16* %A to i8* + %tmp1 = call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %tmp0) ret <8 x i16> %tmp1 } define <4 x i32> @vld1Qi32(i32* %A) nounwind { ;CHECK: vld1Qi32: ;CHECK: vld1.32 - %tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32(i32* %A) + %tmp0 = bitcast i32* %A to i8* + %tmp1 = call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %tmp0) ret <4 x i32> %tmp1 } define <4 x float> @vld1Qf(float* %A) nounwind { ;CHECK: vld1Qf: ;CHECK: vld1.32 - %tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(float* %A) + %tmp0 = bitcast float* %A to i8* + %tmp1 = call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %tmp0) ret <4 x float> %tmp1 } define <2 x i64> @vld1Qi64(i64* %A) nounwind { ;CHECK: vld1Qi64: ;CHECK: vld1.64 - %tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i64* %A) + %tmp0 = bitcast i64* %A to i8* + %tmp1 = call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %tmp0) ret <2 x i64> %tmp1 } diff --git a/llvm/test/CodeGen/ARM/vld2.ll b/llvm/test/CodeGen/ARM/vld2.ll index 23f7d2ca0cd3..f5dc06cb302e 100644 --- a/llvm/test/CodeGen/ARM/vld2.ll +++ b/llvm/test/CodeGen/ARM/vld2.ll @@ -24,7 +24,8 @@ define <8 x i8> @vld2i8(i8* %A) nounwind { define <4 x i16> @vld2i16(i16* %A) nounwind { ;CHECK: vld2i16: ;CHECK: vld2.16 - %tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i16* %A) + %tmp0 = bitcast i16* %A to i8* + %tmp1 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2.v4i16(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp1, 1 %tmp4 = add <4 x i16> %tmp2, %tmp3 @@ -34,7 +35,8 @@ define <4 x i16> @vld2i16(i16* %A) nounwind { define <2 x i32> @vld2i32(i32* %A) nounwind { ;CHECK: vld2i32: ;CHECK: vld2.32 - %tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i32* %A) + %tmp0 = bitcast i32* %A to i8* + %tmp1 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2.v2i32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp1, 1 %tmp4 = add <2 x i32> %tmp2, %tmp3 @@ -44,7 +46,8 @@ define <2 x i32> @vld2i32(i32* %A) nounwind { define <2 x float> @vld2f(float* %A) nounwind { ;CHECK: vld2f: ;CHECK: vld2.32 - %tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(float* %A) + %tmp0 = bitcast float* %A to i8* + %tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1 %tmp4 = add <2 x float> %tmp2, %tmp3 @@ -54,7 +57,8 @@ define <2 x float> @vld2f(float* %A) nounwind { define <1 x i64> @vld2i64(i64* %A) nounwind { ;CHECK: vld2i64: ;CHECK: vld1.64 - %tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i64* %A) + %tmp0 = bitcast i64* %A to i8* + %tmp1 = call %struct.__neon_int64x1x2_t @llvm.arm.neon.vld2.v1i64(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int64x1x2_t %tmp1, 1 %tmp4 = add <1 x i64> %tmp2, %tmp3 @@ -74,7 +78,8 @@ define <16 x i8> @vld2Qi8(i8* %A) nounwind { define <8 x i16> @vld2Qi16(i16* %A) nounwind { ;CHECK: vld2Qi16: ;CHECK: vld2.16 - %tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i16* %A) + %tmp0 = bitcast i16* %A to i8* + %tmp1 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2.v8i16(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp1, 1 %tmp4 = add <8 x i16> %tmp2, %tmp3 @@ -84,7 +89,8 @@ define <8 x i16> @vld2Qi16(i16* %A) nounwind { define <4 x i32> @vld2Qi32(i32* %A) nounwind { ;CHECK: vld2Qi32: ;CHECK: vld2.32 - %tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i32* %A) + %tmp0 = bitcast i32* %A to i8* + %tmp1 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2.v4i32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp1, 1 %tmp4 = add <4 x i32> %tmp2, %tmp3 @@ -94,7 +100,8 @@ define <4 x i32> @vld2Qi32(i32* %A) nounwind { define <4 x float> @vld2Qf(float* %A) nounwind { ;CHECK: vld2Qf: ;CHECK: vld2.32 - %tmp1 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(float* %A) + %tmp0 = bitcast float* %A to i8* + %tmp1 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2.v4f32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp1, 1 %tmp4 = add <4 x float> %tmp2, %tmp3 diff --git a/llvm/test/CodeGen/ARM/vld3.ll b/llvm/test/CodeGen/ARM/vld3.ll index 207dc6a22e45..33c4d37b3ed3 100644 --- a/llvm/test/CodeGen/ARM/vld3.ll +++ b/llvm/test/CodeGen/ARM/vld3.ll @@ -24,7 +24,8 @@ define <8 x i8> @vld3i8(i8* %A) nounwind { define <4 x i16> @vld3i16(i16* %A) nounwind { ;CHECK: vld3i16: ;CHECK: vld3.16 - %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i16* %A) + %tmp0 = bitcast i16* %A to i8* + %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2 %tmp4 = add <4 x i16> %tmp2, %tmp3 @@ -34,7 +35,8 @@ define <4 x i16> @vld3i16(i16* %A) nounwind { define <2 x i32> @vld3i32(i32* %A) nounwind { ;CHECK: vld3i32: ;CHECK: vld3.32 - %tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i32* %A) + %tmp0 = bitcast i32* %A to i8* + %tmp1 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3.v2i32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp1, 2 %tmp4 = add <2 x i32> %tmp2, %tmp3 @@ -44,7 +46,8 @@ define <2 x i32> @vld3i32(i32* %A) nounwind { define <2 x float> @vld3f(float* %A) nounwind { ;CHECK: vld3f: ;CHECK: vld3.32 - %tmp1 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(float* %A) + %tmp0 = bitcast float* %A to i8* + %tmp1 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3.v2f32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp1, 2 %tmp4 = add <2 x float> %tmp2, %tmp3 @@ -54,7 +57,8 @@ define <2 x float> @vld3f(float* %A) nounwind { define <1 x i64> @vld3i64(i64* %A) nounwind { ;CHECK: vld3i64: ;CHECK: vld1.64 - %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i64* %A) + %tmp0 = bitcast i64* %A to i8* + %tmp1 = call %struct.__neon_int64x1x3_t @llvm.arm.neon.vld3.v1i64(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int64x1x3_t %tmp1, 2 %tmp4 = add <1 x i64> %tmp2, %tmp3 @@ -76,7 +80,8 @@ define <8 x i16> @vld3Qi16(i16* %A) nounwind { ;CHECK: vld3Qi16: ;CHECK: vld3.16 ;CHECK: vld3.16 - %tmp1 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i16* %A) + %tmp0 = bitcast i16* %A to i8* + %tmp1 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3.v8i16(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp1, 2 %tmp4 = add <8 x i16> %tmp2, %tmp3 @@ -87,7 +92,8 @@ define <4 x i32> @vld3Qi32(i32* %A) nounwind { ;CHECK: vld3Qi32: ;CHECK: vld3.32 ;CHECK: vld3.32 - %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i32* %A) + %tmp0 = bitcast i32* %A to i8* + %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2 %tmp4 = add <4 x i32> %tmp2, %tmp3 @@ -98,7 +104,8 @@ define <4 x float> @vld3Qf(float* %A) nounwind { ;CHECK: vld3Qf: ;CHECK: vld3.32 ;CHECK: vld3.32 - %tmp1 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(float* %A) + %tmp0 = bitcast float* %A to i8* + %tmp1 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3.v4f32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp1, 2 %tmp4 = add <4 x float> %tmp2, %tmp3 diff --git a/llvm/test/CodeGen/ARM/vld4.ll b/llvm/test/CodeGen/ARM/vld4.ll index 0624f2977ea4..e800cb539aa4 100644 --- a/llvm/test/CodeGen/ARM/vld4.ll +++ b/llvm/test/CodeGen/ARM/vld4.ll @@ -24,7 +24,8 @@ define <8 x i8> @vld4i8(i8* %A) nounwind { define <4 x i16> @vld4i16(i16* %A) nounwind { ;CHECK: vld4i16: ;CHECK: vld4.16 - %tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i16* %A) + %tmp0 = bitcast i16* %A to i8* + %tmp1 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4.v4i16(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp1, 2 %tmp4 = add <4 x i16> %tmp2, %tmp3 @@ -34,7 +35,8 @@ define <4 x i16> @vld4i16(i16* %A) nounwind { define <2 x i32> @vld4i32(i32* %A) nounwind { ;CHECK: vld4i32: ;CHECK: vld4.32 - %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i32* %A) + %tmp0 = bitcast i32* %A to i8* + %tmp1 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4.v2i32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp1, 2 %tmp4 = add <2 x i32> %tmp2, %tmp3 @@ -44,7 +46,8 @@ define <2 x i32> @vld4i32(i32* %A) nounwind { define <2 x float> @vld4f(float* %A) nounwind { ;CHECK: vld4f: ;CHECK: vld4.32 - %tmp1 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(float* %A) + %tmp0 = bitcast float* %A to i8* + %tmp1 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4.v2f32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp1, 2 %tmp4 = add <2 x float> %tmp2, %tmp3 @@ -54,7 +57,8 @@ define <2 x float> @vld4f(float* %A) nounwind { define <1 x i64> @vld4i64(i64* %A) nounwind { ;CHECK: vld4i64: ;CHECK: vld1.64 - %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i64* %A) + %tmp0 = bitcast i64* %A to i8* + %tmp1 = call %struct.__neon_int64x1x4_t @llvm.arm.neon.vld4.v1i64(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int64x1x4_t %tmp1, 2 %tmp4 = add <1 x i64> %tmp2, %tmp3 @@ -76,7 +80,8 @@ define <8 x i16> @vld4Qi16(i16* %A) nounwind { ;CHECK: vld4Qi16: ;CHECK: vld4.16 ;CHECK: vld4.16 - %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i16* %A) + %tmp0 = bitcast i16* %A to i8* + %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2 %tmp4 = add <8 x i16> %tmp2, %tmp3 @@ -87,7 +92,8 @@ define <4 x i32> @vld4Qi32(i32* %A) nounwind { ;CHECK: vld4Qi32: ;CHECK: vld4.32 ;CHECK: vld4.32 - %tmp1 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i32* %A) + %tmp0 = bitcast i32* %A to i8* + %tmp1 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4.v4i32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp1, 2 %tmp4 = add <4 x i32> %tmp2, %tmp3 @@ -98,7 +104,8 @@ define <4 x float> @vld4Qf(float* %A) nounwind { ;CHECK: vld4Qf: ;CHECK: vld4.32 ;CHECK: vld4.32 - %tmp1 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(float* %A) + %tmp0 = bitcast float* %A to i8* + %tmp1 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4.v4f32(i8* %tmp0) %tmp2 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 0 %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp1, 2 %tmp4 = add <4 x float> %tmp2, %tmp3 diff --git a/llvm/test/CodeGen/ARM/vldlane.ll b/llvm/test/CodeGen/ARM/vldlane.ll index 53881a3f924e..46a200220f69 100644 --- a/llvm/test/CodeGen/ARM/vldlane.ll +++ b/llvm/test/CodeGen/ARM/vldlane.ll @@ -23,8 +23,9 @@ define <8 x i8> @vld2lanei8(i8* %A, <8 x i8>* %B) nounwind { define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vld2lanei16: ;CHECK: vld2.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B - %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) + %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1 %tmp5 = add <4 x i16> %tmp3, %tmp4 @@ -34,8 +35,9 @@ define <4 x i16> @vld2lanei16(i16* %A, <4 x i16>* %B) nounwind { define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vld2lanei32: ;CHECK: vld2.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B - %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) + %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1 %tmp5 = add <2 x i32> %tmp3, %tmp4 @@ -45,8 +47,9 @@ define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind { define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind { ;CHECK: vld2lanef: ;CHECK: vld2.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <2 x float>* %B - %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) + %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1 %tmp5 = add <2 x float> %tmp3, %tmp4 @@ -56,8 +59,9 @@ define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind { define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vld2laneQi16: ;CHECK: vld2.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B - %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) + %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1 %tmp5 = add <8 x i16> %tmp3, %tmp4 @@ -67,8 +71,9 @@ define <8 x i16> @vld2laneQi16(i16* %A, <8 x i16>* %B) nounwind { define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vld2laneQi32: ;CHECK: vld2.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B - %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2) + %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2) %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1 %tmp5 = add <4 x i32> %tmp3, %tmp4 @@ -78,8 +83,9 @@ define <4 x i32> @vld2laneQi32(i32* %A, <4 x i32>* %B) nounwind { define <4 x float> @vld2laneQf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vld2laneQf: ;CHECK: vld2.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <4 x float>* %B - %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) + %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1 %tmp5 = add <4 x float> %tmp3, %tmp4 @@ -120,8 +126,9 @@ define <8 x i8> @vld3lanei8(i8* %A, <8 x i8>* %B) nounwind { define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vld3lanei16: ;CHECK: vld3.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B - %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) + %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2 @@ -133,8 +140,9 @@ define <4 x i16> @vld3lanei16(i16* %A, <4 x i16>* %B) nounwind { define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vld3lanei32: ;CHECK: vld3.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B - %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) + %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2 @@ -146,8 +154,9 @@ define <2 x i32> @vld3lanei32(i32* %A, <2 x i32>* %B) nounwind { define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind { ;CHECK: vld3lanef: ;CHECK: vld3.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <2 x float>* %B - %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) + %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2 @@ -159,8 +168,9 @@ define <2 x float> @vld3lanef(float* %A, <2 x float>* %B) nounwind { define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vld3laneQi16: ;CHECK: vld3.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B - %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) + %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2 @@ -172,8 +182,9 @@ define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind { define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vld3laneQi32: ;CHECK: vld3.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B - %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3) + %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3) %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2 @@ -185,8 +196,9 @@ define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind { define <4 x float> @vld3laneQf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vld3laneQf: ;CHECK: vld3.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <4 x float>* %B - %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) + %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2 @@ -231,8 +243,9 @@ define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind { define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vld4lanei16: ;CHECK: vld4.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B - %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) + %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2 @@ -246,8 +259,9 @@ define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind { define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vld4lanei32: ;CHECK: vld4.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B - %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) + %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2 @@ -261,8 +275,9 @@ define <2 x i32> @vld4lanei32(i32* %A, <2 x i32>* %B) nounwind { define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind { ;CHECK: vld4lanef: ;CHECK: vld4.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <2 x float>* %B - %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) + %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2 @@ -276,8 +291,9 @@ define <2 x float> @vld4lanef(float* %A, <2 x float>* %B) nounwind { define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vld4laneQi16: ;CHECK: vld4.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B - %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) + %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2 @@ -291,8 +307,9 @@ define <8 x i16> @vld4laneQi16(i16* %A, <8 x i16>* %B) nounwind { define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vld4laneQi32: ;CHECK: vld4.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B - %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1) + %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2 @@ -306,8 +323,9 @@ define <4 x i32> @vld4laneQi32(i32* %A, <4 x i32>* %B) nounwind { define <4 x float> @vld4laneQf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vld4laneQf: ;CHECK: vld4.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <4 x float>* %B - %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) + %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0 %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1 %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2 diff --git a/llvm/test/CodeGen/ARM/vst1.ll b/llvm/test/CodeGen/ARM/vst1.ll index 602b124ffad9..95414c308914 100644 --- a/llvm/test/CodeGen/ARM/vst1.ll +++ b/llvm/test/CodeGen/ARM/vst1.ll @@ -11,32 +11,36 @@ define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind { define void @vst1i16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst1i16: ;CHECK: vst1.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst1.v4i16(i16* %A, <4 x i16> %tmp1) + call void @llvm.arm.neon.vst1.v4i16(i8* %tmp0, <4 x i16> %tmp1) ret void } define void @vst1i32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst1i32: ;CHECK: vst1.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst1.v2i32(i32* %A, <2 x i32> %tmp1) + call void @llvm.arm.neon.vst1.v2i32(i8* %tmp0, <2 x i32> %tmp1) ret void } define void @vst1f(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst1f: ;CHECK: vst1.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst1.v2f32(float* %A, <2 x float> %tmp1) + call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1) ret void } define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind { ;CHECK: vst1i64: ;CHECK: vst1.64 + %tmp0 = bitcast i64* %A to i8* %tmp1 = load <1 x i64>* %B - call void @llvm.arm.neon.vst1.v1i64(i64* %A, <1 x i64> %tmp1) + call void @llvm.arm.neon.vst1.v1i64(i8* %tmp0, <1 x i64> %tmp1) ret void } @@ -51,32 +55,36 @@ define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind { define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst1Qi16: ;CHECK: vst1.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B - call void @llvm.arm.neon.vst1.v8i16(i16* %A, <8 x i16> %tmp1) + call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1) ret void } define void @vst1Qi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst1Qi32: ;CHECK: vst1.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B - call void @llvm.arm.neon.vst1.v4i32(i32* %A, <4 x i32> %tmp1) + call void @llvm.arm.neon.vst1.v4i32(i8* %tmp0, <4 x i32> %tmp1) ret void } define void @vst1Qf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vst1Qf: ;CHECK: vst1.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <4 x float>* %B - call void @llvm.arm.neon.vst1.v4f32(float* %A, <4 x float> %tmp1) + call void @llvm.arm.neon.vst1.v4f32(i8* %tmp0, <4 x float> %tmp1) ret void } define void @vst1Qi64(i64* %A, <2 x i64>* %B) nounwind { ;CHECK: vst1Qi64: ;CHECK: vst1.64 + %tmp0 = bitcast i64* %A to i8* %tmp1 = load <2 x i64>* %B - call void @llvm.arm.neon.vst1.v2i64(i64* %A, <2 x i64> %tmp1) + call void @llvm.arm.neon.vst1.v2i64(i8* %tmp0, <2 x i64> %tmp1) ret void } diff --git a/llvm/test/CodeGen/ARM/vst2.ll b/llvm/test/CodeGen/ARM/vst2.ll index 17d6bee0f56c..3c98a2cbe60d 100644 --- a/llvm/test/CodeGen/ARM/vst2.ll +++ b/llvm/test/CodeGen/ARM/vst2.ll @@ -11,32 +11,36 @@ define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind { define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst2i16: ;CHECK: vst2.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst2.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1) + call void @llvm.arm.neon.vst2.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1) ret void } define void @vst2i32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst2i32: ;CHECK: vst2.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst2.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1) + call void @llvm.arm.neon.vst2.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1) ret void } define void @vst2f(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst2f: ;CHECK: vst2.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst2.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1) + call void @llvm.arm.neon.vst2.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1) ret void } define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind { ;CHECK: vst2i64: ;CHECK: vst1.64 + %tmp0 = bitcast i64* %A to i8* %tmp1 = load <1 x i64>* %B - call void @llvm.arm.neon.vst2.v1i64(i64* %A, <1 x i64> %tmp1, <1 x i64> %tmp1) + call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1) ret void } @@ -51,24 +55,27 @@ define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind { define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst2Qi16: ;CHECK: vst2.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B - call void @llvm.arm.neon.vst2.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1) + call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1) ret void } define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst2Qi32: ;CHECK: vst2.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B - call void @llvm.arm.neon.vst2.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1) + call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1) ret void } define void @vst2Qf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vst2Qf: ;CHECK: vst2.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <4 x float>* %B - call void @llvm.arm.neon.vst2.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1) + call void @llvm.arm.neon.vst2.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1) ret void } diff --git a/llvm/test/CodeGen/ARM/vst3.ll b/llvm/test/CodeGen/ARM/vst3.ll index a831a0c08ce9..2599bc0db933 100644 --- a/llvm/test/CodeGen/ARM/vst3.ll +++ b/llvm/test/CodeGen/ARM/vst3.ll @@ -11,32 +11,36 @@ define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind { define void @vst3i16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst3i16: ;CHECK: vst3.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst3.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1) + call void @llvm.arm.neon.vst3.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1) ret void } define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst3i32: ;CHECK: vst3.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst3.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1) + call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1) ret void } define void @vst3f(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst3f: ;CHECK: vst3.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst3.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1) + call void @llvm.arm.neon.vst3.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1) ret void } define void @vst3i64(i64* %A, <1 x i64>* %B) nounwind { ;CHECK: vst3i64: ;CHECK: vst1.64 + %tmp0 = bitcast i64* %A to i8* %tmp1 = load <1 x i64>* %B - call void @llvm.arm.neon.vst3.v1i64(i64* %A, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1) + call void @llvm.arm.neon.vst3.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1) ret void } @@ -53,8 +57,9 @@ define void @vst3Qi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst3Qi16: ;CHECK: vst3.16 ;CHECK: vst3.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B - call void @llvm.arm.neon.vst3.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1) + call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1) ret void } @@ -62,8 +67,9 @@ define void @vst3Qi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst3Qi32: ;CHECK: vst3.32 ;CHECK: vst3.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B - call void @llvm.arm.neon.vst3.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1) + call void @llvm.arm.neon.vst3.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1) ret void } @@ -71,8 +77,9 @@ define void @vst3Qf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vst3Qf: ;CHECK: vst3.32 ;CHECK: vst3.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <4 x float>* %B - call void @llvm.arm.neon.vst3.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1) + call void @llvm.arm.neon.vst3.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1) ret void } diff --git a/llvm/test/CodeGen/ARM/vst4.ll b/llvm/test/CodeGen/ARM/vst4.ll index d92c017c30b2..878f0efaa480 100644 --- a/llvm/test/CodeGen/ARM/vst4.ll +++ b/llvm/test/CodeGen/ARM/vst4.ll @@ -11,32 +11,36 @@ define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind { define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst4i16: ;CHECK: vst4.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst4.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1) + call void @llvm.arm.neon.vst4.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1) ret void } define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst4i32: ;CHECK: vst4.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst4.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1) + call void @llvm.arm.neon.vst4.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1) ret void } define void @vst4f(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst4f: ;CHECK: vst4.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst4.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1) + call void @llvm.arm.neon.vst4.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1) ret void } define void @vst4i64(i64* %A, <1 x i64>* %B) nounwind { ;CHECK: vst4i64: ;CHECK: vst1.64 + %tmp0 = bitcast i64* %A to i8* %tmp1 = load <1 x i64>* %B - call void @llvm.arm.neon.vst4.v1i64(i64* %A, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1) + call void @llvm.arm.neon.vst4.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1, <1 x i64> %tmp1) ret void } @@ -53,8 +57,9 @@ define void @vst4Qi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst4Qi16: ;CHECK: vst4.16 ;CHECK: vst4.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B - call void @llvm.arm.neon.vst4.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1) + call void @llvm.arm.neon.vst4.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1) ret void } @@ -62,8 +67,9 @@ define void @vst4Qi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst4Qi32: ;CHECK: vst4.32 ;CHECK: vst4.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B - call void @llvm.arm.neon.vst4.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1) + call void @llvm.arm.neon.vst4.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1) ret void } @@ -71,8 +77,9 @@ define void @vst4Qf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vst4Qf: ;CHECK: vst4.32 ;CHECK: vst4.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <4 x float>* %B - call void @llvm.arm.neon.vst4.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1) + call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1) ret void } diff --git a/llvm/test/CodeGen/ARM/vstlane.ll b/llvm/test/CodeGen/ARM/vstlane.ll index 3bfb14f17b77..cf50756d465e 100644 --- a/llvm/test/CodeGen/ARM/vstlane.ll +++ b/llvm/test/CodeGen/ARM/vstlane.ll @@ -11,48 +11,54 @@ define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind { define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst2lanei16: ;CHECK: vst2.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst2lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) + call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) ret void } define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst2lanei32: ;CHECK: vst2.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst2lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) + call void @llvm.arm.neon.vst2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) ret void } define void @vst2lanef(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst2lanef: ;CHECK: vst2.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst2lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) + call void @llvm.arm.neon.vst2lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) ret void } define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst2laneQi16: ;CHECK: vst2.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B - call void @llvm.arm.neon.vst2lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) + call void @llvm.arm.neon.vst2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) ret void } define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst2laneQi32: ;CHECK: vst2.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B - call void @llvm.arm.neon.vst2lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2) + call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2) ret void } define void @vst2laneQf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vst2laneQf: ;CHECK: vst2.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <4 x float>* %B - call void @llvm.arm.neon.vst2lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, i32 3) + call void @llvm.arm.neon.vst2lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, i32 3) ret void } @@ -76,48 +82,54 @@ define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind { define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst3lanei16: ;CHECK: vst3.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst3lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) + call void @llvm.arm.neon.vst3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) ret void } define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst3lanei32: ;CHECK: vst3.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst3lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) + call void @llvm.arm.neon.vst3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) ret void } define void @vst3lanef(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst3lanef: ;CHECK: vst3.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst3lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) + call void @llvm.arm.neon.vst3lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) ret void } define void @vst3laneQi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst3laneQi16: ;CHECK: vst3.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B - call void @llvm.arm.neon.vst3lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6) + call void @llvm.arm.neon.vst3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6) ret void } define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst3laneQi32: ;CHECK: vst3.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B - call void @llvm.arm.neon.vst3lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0) + call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0) ret void } define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vst3laneQf: ;CHECK: vst3.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <4 x float>* %B - call void @llvm.arm.neon.vst3lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) + call void @llvm.arm.neon.vst3lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) ret void } @@ -142,48 +154,54 @@ define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind { define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind { ;CHECK: vst4lanei16: ;CHECK: vst4.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <4 x i16>* %B - call void @llvm.arm.neon.vst4lane.v4i16(i16* %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) + call void @llvm.arm.neon.vst4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) ret void } define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind { ;CHECK: vst4lanei32: ;CHECK: vst4.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <2 x i32>* %B - call void @llvm.arm.neon.vst4lane.v2i32(i32* %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) + call void @llvm.arm.neon.vst4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) ret void } define void @vst4lanef(float* %A, <2 x float>* %B) nounwind { ;CHECK: vst4lanef: ;CHECK: vst4.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <2 x float>* %B - call void @llvm.arm.neon.vst4lane.v2f32(float* %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) + call void @llvm.arm.neon.vst4lane.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) ret void } define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind { ;CHECK: vst4laneQi16: ;CHECK: vst4.16 + %tmp0 = bitcast i16* %A to i8* %tmp1 = load <8 x i16>* %B - call void @llvm.arm.neon.vst4lane.v8i16(i16* %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7) + call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7) ret void } define void @vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind { ;CHECK: vst4laneQi32: ;CHECK: vst4.32 + %tmp0 = bitcast i32* %A to i8* %tmp1 = load <4 x i32>* %B - call void @llvm.arm.neon.vst4lane.v4i32(i32* %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2) + call void @llvm.arm.neon.vst4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2) ret void } define void @vst4laneQf(float* %A, <4 x float>* %B) nounwind { ;CHECK: vst4laneQf: ;CHECK: vst4.32 + %tmp0 = bitcast float* %A to i8* %tmp1 = load <4 x float>* %B - call void @llvm.arm.neon.vst4lane.v4f32(float* %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) + call void @llvm.arm.neon.vst4lane.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) ret void }