forked from OSchip/llvm-project
[AArch64] Cost-model i8 vector loads/stores
Loads of <4 x i8> vectors were modeled as extremely expensive. Although we don't have a load instruction that supports this type directly, it isn't that expensive to create a vector of i8 elements. The codegen for this was fixed/optimised in D105110. This patch now tweaks the cost model to match, which enables SLP vectorisation of the motivating case, loadi8.ll. Differential Revision: https://reviews.llvm.org/D103629
This commit is contained in:
parent
a96911c49b
commit
ee752134ac
|
@ -1423,8 +1423,9 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
|
||||||
unsigned AddressSpace,
|
unsigned AddressSpace,
|
||||||
TTI::TargetCostKind CostKind,
|
TTI::TargetCostKind CostKind,
|
||||||
const Instruction *I) {
|
const Instruction *I) {
|
||||||
|
EVT VT = TLI->getValueType(DL, Ty, true);
|
||||||
// Type legalization can't handle structs
|
// Type legalization can't handle structs
|
||||||
if (TLI->getValueType(DL, Ty, true) == MVT::Other)
|
if (VT == MVT::Other)
|
||||||
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
|
return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
|
||||||
CostKind);
|
CostKind);
|
||||||
|
|
||||||
|
@ -1451,23 +1452,14 @@ InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
|
||||||
return LT.first * 2 * AmortizationCost;
|
return LT.first * 2 * AmortizationCost;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check truncating stores and extending loads.
|
||||||
if (useNeonVector(Ty) &&
|
if (useNeonVector(Ty) &&
|
||||||
cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
|
Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
|
||||||
unsigned ProfitableNumElements;
|
// v4i8 types are lowered to a scalar load/store and sshll/xtn.
|
||||||
if (Opcode == Instruction::Store)
|
if (VT == MVT::v4i8)
|
||||||
// We use a custom trunc store lowering so v.4b should be profitable.
|
return 2;
|
||||||
ProfitableNumElements = 4;
|
// Otherwise we need to scalarize.
|
||||||
else
|
return cast<FixedVectorType>(Ty)->getNumElements() * 2;
|
||||||
// We scalarize the loads because there is not v.4b register and we
|
|
||||||
// have to promote the elements to v.2.
|
|
||||||
ProfitableNumElements = 8;
|
|
||||||
|
|
||||||
if (cast<FixedVectorType>(Ty)->getNumElements() < ProfitableNumElements) {
|
|
||||||
unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
|
|
||||||
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
|
|
||||||
// We generate 2 instructions per vector element.
|
|
||||||
return NumVectorizableInstsToAmortize * NumVecElts * 2;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return LT.first;
|
return LT.first;
|
||||||
|
|
|
@ -701,10 +701,10 @@ define i32 @load_extends() #0 {
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i32 = load <4 x i32>, <4 x i32>* undef
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadv4i32 = load <4 x i32>, <4 x i32>* undef
|
||||||
|
|
|
@ -49,8 +49,8 @@ define void @store8(<8 x i8>* %ptr, <8 x i8> %val) {
|
||||||
|
|
||||||
define <4 x i8> @load4(<4 x i8>* %ptr) {
|
define <4 x i8> @load4(<4 x i8>* %ptr) {
|
||||||
; CHECK: 'Cost Model Analysis' for function 'load4':
|
; CHECK: 'Cost Model Analysis' for function 'load4':
|
||||||
; CHECK-NEON: Cost Model: Found an estimated cost of 64 for instruction:
|
; CHECK-NEON: Cost Model: Found an estimated cost of 2 for instruction:
|
||||||
; CHECK-SVE-128: Cost Model: Found an estimated cost of 64 for instruction:
|
; CHECK-SVE-128: Cost Model: Found an estimated cost of 2 for instruction:
|
||||||
; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
|
; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
|
||||||
; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
|
; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
|
||||||
%out = load <4 x i8>, <4 x i8>* %ptr
|
%out = load <4 x i8>, <4 x i8>* %ptr
|
||||||
|
@ -59,8 +59,8 @@ define <4 x i8> @load4(<4 x i8>* %ptr) {
|
||||||
|
|
||||||
define void @store4(<4 x i8>* %ptr, <4 x i8> %val) {
|
define void @store4(<4 x i8>* %ptr, <4 x i8> %val) {
|
||||||
; CHECK: 'Cost Model Analysis' for function 'store4':
|
; CHECK: 'Cost Model Analysis' for function 'store4':
|
||||||
; CHECK-NEON: Cost Model: Found an estimated cost of 1 for instruction:
|
; CHECK-NEON: Cost Model: Found an estimated cost of 2 for instruction:
|
||||||
; CHECK-SVE-128: Cost Model: Found an estimated cost of 1 for instruction:
|
; CHECK-SVE-128: Cost Model: Found an estimated cost of 2 for instruction:
|
||||||
; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
|
; CHECK-SVE-256: Cost Model: Found an estimated cost of 1 for instruction:
|
||||||
; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
|
; CHECK-SVE-512: Cost Model: Found an estimated cost of 1 for instruction:
|
||||||
store <4 x i8> %val, <4 x i8>* %ptr
|
store <4 x i8> %val, <4 x i8>* %ptr
|
||||||
|
|
|
@ -23,10 +23,10 @@ define void @getMemoryOpCost() {
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, <2 x double>* undef, align 4
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <2 x double> undef, <2 x double>* undef, align 4
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, <4 x float>* undef, align 4
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x float> undef, <4 x float>* undef, align 4
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x half> undef, <8 x half>* undef, align 4
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <8 x half> undef, <8 x half>* undef, align 4
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <2 x i8> undef, <2 x i8>* undef, align 2
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x i8> undef, <2 x i8>* undef, align 2
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, <4 x i8>* undef, align 4
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i8> undef, <4 x i8>* undef, align 4
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %1 = load <2 x i8>, <2 x i8>* undef, align 2
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = load <2 x i8>, <2 x i8>* undef, align 2
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %2 = load <4 x i8>, <4 x i8>* undef, align 4
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = load <4 x i8>, <4 x i8>* undef, align 4
|
||||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||||
;
|
;
|
||||||
; SIZE-LABEL: 'getMemoryOpCost'
|
; SIZE-LABEL: 'getMemoryOpCost'
|
||||||
|
@ -65,10 +65,10 @@ define void @getMemoryOpCost() {
|
||||||
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: store <2 x double> undef, <2 x double>* undef, align 4
|
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: store <2 x double> undef, <2 x double>* undef, align 4
|
||||||
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: store <4 x float> undef, <4 x float>* undef, align 4
|
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: store <4 x float> undef, <4 x float>* undef, align 4
|
||||||
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: store <8 x half> undef, <8 x half>* undef, align 4
|
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: store <8 x half> undef, <8 x half>* undef, align 4
|
||||||
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: store <2 x i8> undef, <2 x i8>* undef, align 2
|
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <2 x i8> undef, <2 x i8>* undef, align 2
|
||||||
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store <4 x i8> undef, <4 x i8>* undef, align 4
|
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i8> undef, <4 x i8>* undef, align 4
|
||||||
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %1 = load <2 x i8>, <2 x i8>* undef, align 2
|
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = load <2 x i8>, <2 x i8>* undef, align 2
|
||||||
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %2 = load <4 x i8>, <4 x i8>* undef, align 4
|
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = load <4 x i8>, <4 x i8>* undef, align 4
|
||||||
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
; SLOW_MISALIGNED_128_STORE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||||
;
|
;
|
||||||
store <4 x i64> undef, <4 x i64> * undef
|
store <4 x i64> undef, <4 x i64> * undef
|
||||||
|
|
|
@ -4,12 +4,11 @@
|
||||||
; are not profitable.
|
; are not profitable.
|
||||||
|
|
||||||
; Test with a loop that contains memory accesses of i8 and i32 types. The
|
; Test with a loop that contains memory accesses of i8 and i32 types. The
|
||||||
; default maximum VF for NEON is 4, but vectorizing 4 x i8 is not
|
; default maximum VF for NEON is 4. And while we don't have an instruction to
|
||||||
; profitable. But we can extend to VF to 8 or 16, at which point the
|
; load 4 x i8, vectorization might still be profitable.
|
||||||
; i8 memory accesses become profitable.
|
|
||||||
define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
|
define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
|
||||||
; CHECK-LABEL: @test_load_i8_store_i32(
|
; CHECK-LABEL: @test_load_i8_store_i32(
|
||||||
; CHECK-NOT: x i8>
|
; CHECK: <4 x i8>
|
||||||
;
|
;
|
||||||
entry:
|
entry:
|
||||||
br label %loop
|
br label %loop
|
||||||
|
|
|
@ -1,20 +1,16 @@
|
||||||
; REQUIRES: asserts
|
; REQUIRES: asserts
|
||||||
; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
|
; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
|
||||||
|
|
||||||
; This test shows extremely high interleaving cost that, probably, should be fixed.
|
|
||||||
; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize
|
|
||||||
; the load instructions.
|
|
||||||
|
|
||||||
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
|
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
|
||||||
target triple = "aarch64--linux-gnu"
|
target triple = "aarch64--linux-gnu"
|
||||||
|
|
||||||
%pair = type { i8, i8 }
|
%pair = type { i8, i8 }
|
||||||
|
|
||||||
; CHECK-LABEL: test
|
; CHECK-LABEL: test
|
||||||
; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8
|
; CHECK: Found an estimated cost of 17 for VF 2 For instruction: {{.*}} load i8
|
||||||
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
|
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
|
||||||
; CHECK: vector.body
|
; CHECK: vector.body
|
||||||
; CHECK: load i8
|
; CHECK: load <4 x i8>
|
||||||
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
|
; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
|
||||||
|
|
||||||
define void @test(%pair* %p, i64 %n) {
|
define void @test(%pair* %p, i64 %n) {
|
||||||
|
|
|
@ -171,10 +171,8 @@ define void @PR32038(i32 %n) {
|
||||||
;
|
;
|
||||||
; MAX-COST-LABEL: @PR32038(
|
; MAX-COST-LABEL: @PR32038(
|
||||||
; MAX-COST-NEXT: entry:
|
; MAX-COST-NEXT: entry:
|
||||||
; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
|
; MAX-COST-NEXT: [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <4 x i8>*), align 1
|
||||||
; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
|
; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer
|
||||||
; MAX-COST-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3) to <2 x i8>*), align 1
|
|
||||||
; MAX-COST-NEXT: [[TMP3:%.*]] = icmp eq <2 x i8> [[TMP2]], zeroinitializer
|
|
||||||
; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
|
; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
|
||||||
; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0
|
; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0
|
||||||
; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
|
; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
|
||||||
|
@ -186,21 +184,17 @@ define void @PR32038(i32 %n) {
|
||||||
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
|
; MAX-COST-NEXT: br label [[FOR_BODY:%.*]]
|
||||||
; MAX-COST: for.body:
|
; MAX-COST: for.body:
|
||||||
; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
|
||||||
; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
|
; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
|
||||||
; MAX-COST-NEXT: [[TMP5:%.*]] = shufflevector <2 x i1> [[TMP1]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
; MAX-COST-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
|
||||||
; MAX-COST-NEXT: [[TMP6:%.*]] = shufflevector <4 x i1> poison, <4 x i1> [[TMP5]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
|
; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
|
||||||
; MAX-COST-NEXT: [[TMP7:%.*]] = shufflevector <2 x i1> [[TMP3]], <2 x i1> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
; MAX-COST-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
|
||||||
; MAX-COST-NEXT: [[TMP8:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> [[TMP7]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
; MAX-COST-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
|
||||||
; MAX-COST-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
|
|
||||||
; MAX-COST-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
|
|
||||||
; MAX-COST-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
|
|
||||||
; MAX-COST-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
|
|
||||||
; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
|
; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80
|
||||||
; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
|
; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80
|
||||||
; MAX-COST-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]])
|
; MAX-COST-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
|
||||||
; MAX-COST-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[P27]]
|
; MAX-COST-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[P27]]
|
||||||
; MAX-COST-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[P29]]
|
; MAX-COST-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[P29]]
|
||||||
; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP15]], -5
|
; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP9]], -5
|
||||||
; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
|
; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80
|
||||||
; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]]
|
; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]]
|
||||||
; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
|
; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80
|
||||||
|
|
|
@ -13,52 +13,32 @@ define void @f_noalias(i8* noalias nocapture %dst, i8* noalias nocapture readonl
|
||||||
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16
|
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SCALE]], align 16
|
||||||
; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1
|
; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr inbounds [[STRUCT_WEIGHT_T]], %struct.weight_t* [[W]], i64 0, i32 1
|
||||||
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4
|
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[OFFSET]], align 4
|
||||||
; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[SRC:%.*]], align 1
|
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 1
|
||||||
; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP2]] to i32
|
; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 1
|
||||||
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], [[CONV]]
|
|
||||||
; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], [[TMP1]]
|
|
||||||
; CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp ult i32 [[ADD]], 256
|
|
||||||
; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[ADD]], 0
|
|
||||||
; CHECK-NEXT: [[SHR_I:%.*]] = sext i1 [[TMP3]] to i32
|
|
||||||
; CHECK-NEXT: [[COND_I:%.*]] = select i1 [[TOBOOL_NOT_I]], i32 [[ADD]], i32 [[SHR_I]]
|
|
||||||
; CHECK-NEXT: [[CONV_I:%.*]] = trunc i32 [[COND_I]] to i8
|
|
||||||
; CHECK-NEXT: store i8 [[CONV_I]], i8* [[DST:%.*]], align 1
|
|
||||||
; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 1
|
|
||||||
; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
|
|
||||||
; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP4]] to i32
|
|
||||||
; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[TMP0]], [[CONV_1]]
|
|
||||||
; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i32 [[MUL_1]], [[TMP1]]
|
|
||||||
; CHECK-NEXT: [[TOBOOL_NOT_I_1:%.*]] = icmp ult i32 [[ADD_1]], 256
|
|
||||||
; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[ADD_1]], 0
|
|
||||||
; CHECK-NEXT: [[SHR_I_1:%.*]] = sext i1 [[TMP5]] to i32
|
|
||||||
; CHECK-NEXT: [[COND_I_1:%.*]] = select i1 [[TOBOOL_NOT_I_1]], i32 [[ADD_1]], i32 [[SHR_I_1]]
|
|
||||||
; CHECK-NEXT: [[CONV_I_1:%.*]] = trunc i32 [[COND_I_1]] to i8
|
|
||||||
; CHECK-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 1
|
|
||||||
; CHECK-NEXT: store i8 [[CONV_I_1]], i8* [[ARRAYIDX2_1]], align 1
|
|
||||||
; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2
|
; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2
|
||||||
; CHECK-NEXT: [[TMP6:%.*]] = load i8, i8* [[ARRAYIDX_2]], align 1
|
|
||||||
; CHECK-NEXT: [[CONV_2:%.*]] = zext i8 [[TMP6]] to i32
|
|
||||||
; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[TMP0]], [[CONV_2]]
|
|
||||||
; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i32 [[MUL_2]], [[TMP1]]
|
|
||||||
; CHECK-NEXT: [[TOBOOL_NOT_I_2:%.*]] = icmp ult i32 [[ADD_2]], 256
|
|
||||||
; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[ADD_2]], 0
|
|
||||||
; CHECK-NEXT: [[SHR_I_2:%.*]] = sext i1 [[TMP7]] to i32
|
|
||||||
; CHECK-NEXT: [[COND_I_2:%.*]] = select i1 [[TOBOOL_NOT_I_2]], i32 [[ADD_2]], i32 [[SHR_I_2]]
|
|
||||||
; CHECK-NEXT: [[CONV_I_2:%.*]] = trunc i32 [[COND_I_2]] to i8
|
|
||||||
; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2
|
; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2
|
||||||
; CHECK-NEXT: store i8 [[CONV_I_2]], i8* [[ARRAYIDX2_2]], align 1
|
|
||||||
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3
|
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3
|
||||||
; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX_3]], align 1
|
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to <4 x i8>*
|
||||||
; CHECK-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP8]] to i32
|
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1
|
||||||
; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[TMP0]], [[CONV_3]]
|
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32>
|
||||||
; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i32 [[MUL_3]], [[TMP1]]
|
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
|
||||||
; CHECK-NEXT: [[TOBOOL_NOT_I_3:%.*]] = icmp ult i32 [[ADD_3]], 256
|
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP0]], i32 1
|
||||||
; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i32 [[ADD_3]], 0
|
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP0]], i32 2
|
||||||
; CHECK-NEXT: [[SHR_I_3:%.*]] = sext i1 [[TMP9]] to i32
|
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3
|
||||||
; CHECK-NEXT: [[COND_I_3:%.*]] = select i1 [[TOBOOL_NOT_I_3]], i32 [[ADD_3]], i32 [[SHR_I_3]]
|
; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP4]]
|
||||||
; CHECK-NEXT: [[CONV_I_3:%.*]] = trunc i32 [[COND_I_3]] to i8
|
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
|
||||||
|
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP1]], i32 1
|
||||||
|
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP1]], i32 2
|
||||||
|
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP1]], i32 3
|
||||||
|
; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP9]], [[TMP13]]
|
||||||
|
; CHECK-NEXT: [[TMP15:%.*]] = icmp ult <4 x i32> [[TMP14]], <i32 256, i32 256, i32 256, i32 256>
|
||||||
|
; CHECK-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[TMP14]], zeroinitializer
|
||||||
|
; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
|
||||||
|
; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP14]], <4 x i32> [[TMP17]]
|
||||||
|
; CHECK-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP18]] to <4 x i8>
|
||||||
; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3
|
; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3
|
||||||
; CHECK-NEXT: store i8 [[CONV_I_3]], i8* [[ARRAYIDX2_3]], align 1
|
; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[DST]] to <4 x i8>*
|
||||||
|
; CHECK-NEXT: store <4 x i8> [[TMP19]], <4 x i8>* [[TMP20]], align 1
|
||||||
; CHECK-NEXT: ret void
|
; CHECK-NEXT: ret void
|
||||||
;
|
;
|
||||||
entry:
|
entry:
|
||||||
|
|
Loading…
Reference in New Issue