; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-unknown | FileCheck %s
; RUN: opt < %s -cost-model -analyze -mtriple=aarch64-unknown -mattr=slow-misaligned-128store | FileCheck %s --check-prefix=SLOW_MISALIGNED_128_STORE

target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"

; CHECK-LABEL: getMemoryOpCost
; SLOW_MISALIGNED_128_STORE-LABEL: getMemoryOpCost
define void @getMemoryOpCost() {
  ; If FeatureSlowMisaligned128Store is set, we penalize 128-bit stores.
  ; The unlegalized 256-bit stores are further penalized when legalized down
  ; to 128-bit stores.

  ; CHECK: cost of 2 for {{.*}} store <4 x i64>
  ; SLOW_MISALIGNED_128_STORE: cost of 24 for {{.*}} store <4 x i64>
  store <4 x i64> undef, <4 x i64> * undef
  ; CHECK-NEXT: cost of 2 for {{.*}} store <8 x i32>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <8 x i32>
  store <8 x i32> undef, <8 x i32> * undef
  ; CHECK-NEXT: cost of 2 for {{.*}} store <16 x i16>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <16 x i16>
  store <16 x i16> undef, <16 x i16> * undef
  ; CHECK-NEXT: cost of 2 for {{.*}} store <32 x i8>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <32 x i8>
  store <32 x i8> undef, <32 x i8> * undef

  ; CHECK-NEXT: cost of 2 for {{.*}} store <4 x double>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <4 x double>
  store <4 x double> undef, <4 x double> * undef
  ; CHECK-NEXT: cost of 2 for {{.*}} store <8 x float>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <8 x float>
  store <8 x float> undef, <8 x float> * undef
  ; CHECK-NEXT: cost of 2 for {{.*}} store <16 x half>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 24 for {{.*}} store <16 x half>
  store <16 x half> undef, <16 x half> * undef

  ; CHECK-NEXT: cost of 1 for {{.*}} store <2 x i64>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <2 x i64>
  store <2 x i64> undef, <2 x i64> * undef
  ; CHECK-NEXT: cost of 1 for {{.*}} store <4 x i32>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <4 x i32>
  store <4 x i32> undef, <4 x i32> * undef
  ; CHECK-NEXT: cost of 1 for {{.*}} store <8 x i16>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <8 x i16>
  store <8 x i16> undef, <8 x i16> * undef
  ; CHECK-NEXT: cost of 1 for {{.*}} store <16 x i8>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <16 x i8>
  store <16 x i8> undef, <16 x i8> * undef

  ; CHECK-NEXT: cost of 1 for {{.*}} store <2 x double>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <2 x double>
  store <2 x double> undef, <2 x double> * undef
  ; CHECK-NEXT: cost of 1 for {{.*}} store <4 x float>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <4 x float>
  store <4 x float> undef, <4 x float> * undef
  ; CHECK-NEXT: cost of 1 for {{.*}} store <8 x half>
  ; SLOW_MISALIGNED_128_STORE-NEXT: cost of 12 for {{.*}} store <8 x half>
  store <8 x half> undef, <8 x half> * undef

  ; We scalarize the loads/stores because there is no vector register name for
  ; these types (they get extended to v.4h/v.2s).
  ; CHECK: cost of 16 {{.*}} store
  store <2 x i8> undef, <2 x i8> * undef
  ; CHECK: cost of 1 {{.*}} store
  store <4 x i8> undef, <4 x i8> * undef
  ; CHECK: cost of 16 {{.*}} load
  load <2 x i8> , <2 x i8> * undef
  ; CHECK: cost of 64 {{.*}} load
  load <4 x i8> , <4 x i8> * undef

  ret void
}