forked from OSchip/llvm-project
[ARM] Make f16 interleaved accesses expensive.
There are no vldN/vstN f16 variants, even with +fullfp16. We could use the i16 variants instead, but, in practice, even with +fullfp16, the f16 sequence leading to the i16 shuffle usually gets scalarized; we'd need to improve our support for f16 codegen before that approach becomes viable. Until then, teach the cost model to consider f16 interleaved operations as expensive. Otherwise, we are all but guaranteed to end up with a large block of scalarized vector code. llvm-svn: 294819
This commit is contained in:
parent
fc979dc9dd
commit
8425f453ef
|
@ -533,7 +533,8 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
||||||
unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
|
unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
|
||||||
|
|
||||||
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
|
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
|
||||||
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
|
if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128) &&
|
||||||
|
!VecTy->getScalarType()->isHalfTy())
|
||||||
return Factor;
|
return Factor;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -99,3 +99,34 @@ for.body:
|
||||||
for.end:
|
for.end:
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
%half.2 = type {half, half}
|
||||||
|
define void @half_factor_2(%half.2* %data, i64 %n) {
|
||||||
|
entry:
|
||||||
|
br label %for.body
|
||||||
|
|
||||||
|
; VF_4-LABEL: Checking a loop in "half_factor_2"
|
||||||
|
; VF_4: Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
|
||||||
|
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
|
||||||
|
; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
|
||||||
|
; VF_4-NEXT: Found an estimated cost of 40 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
|
||||||
|
; VF_8-LABEL: Checking a loop in "half_factor_2"
|
||||||
|
; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
|
||||||
|
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
|
||||||
|
; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
|
||||||
|
; VF_8-NEXT: Found an estimated cost of 80 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
|
||||||
|
for.body:
|
||||||
|
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
|
||||||
|
%tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0
|
||||||
|
%tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1
|
||||||
|
%tmp2 = load half, half* %tmp0, align 2
|
||||||
|
%tmp3 = load half, half* %tmp1, align 2
|
||||||
|
store half 0., half* %tmp0, align 2
|
||||||
|
store half 0., half* %tmp1, align 2
|
||||||
|
%i.next = add nuw nsw i64 %i, 1
|
||||||
|
%cond = icmp slt i64 %i.next, %n
|
||||||
|
br i1 %cond, label %for.body, label %for.end
|
||||||
|
|
||||||
|
for.end:
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue