From d1460c88a6d8739920f86383ff7d17be3dc517f6 Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Sat, 2 Oct 2021 13:39:15 +0300
Subject: [PATCH] [X86][Costmodel] Load/store i8 Stride=3 VF=8 interleaving
 costs

While we already model this tuple, the values are divergent from reality, so fix them.

The only sched models for CPUs that support AVX2
but not AVX512 are: haswell, broadwell, skylake, zen1-3

For load we have:
https://godbolt.org/z/1jeocxj55 - for Intel CPUs, `Block RThroughput: =6.0`; for Ryzen CPUs, `Block RThroughput: <=3.0`
So pick cost of `6`.

For store we have:
https://godbolt.org/z/fr7xfa3K5 - for Intel CPUs, `Block RThroughput: =6.0`; for Ryzen CPUs, `Block RThroughput: <=2.0`
So pick cost of `6`.

I'm directly using the shuffling asm that llc produced,
without any of the manual fixups that might be needed
to ensure sequential execution.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D110960
---
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp                | 4 ++--
 .../Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll    | 2 +-
 .../Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index f2e46956cfd9..c21fcdfc2e86 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5088,7 +5088,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2(
 
       {3, MVT::v2i8, 3},  // (load 6i8 and) deinterleave into 3 x 2i8
       {3, MVT::v4i8, 3},   // (load 12i8 and) deinterleave into 3 x 4i8
-      {3, MVT::v8i8, 9},   // (load 24i8 and) deinterleave into 3 x 8i8
+      {3, MVT::v8i8, 6},   // (load 24i8 and) deinterleave into 3 x 8i8
       {3, MVT::v16i8, 11}, // (load 48i8 and) deinterleave into 3 x 16i8
       {3, MVT::v32i8, 13}, // (load 96i8 and) deinterleave into 3 x 32i8
 
@@ -5140,7 +5140,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX2(
 
       {3, MVT::v2i8, 4},   // interleave 3 x 2i8 into 6i8 (and store)
       {3, MVT::v4i8, 4},   // interleave 3 x 4i8 into 12i8 (and store)
-      {3, MVT::v8i8, 11},  // interleave 3 x 8i8 into 24i8 (and store)
+      {3, MVT::v8i8, 6},  // interleave 3 x 8i8 into 24i8 (and store)
       {3, MVT::v16i8, 11}, // interleave 3 x 16i8 into 48i8 (and store)
       {3, MVT::v32i8, 13}, // interleave 3 x 32i8 into 96i8 (and store)
 
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
index e8feb4f54443..9ca08f22ccb3 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-load-i8-stride-3.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 6 for VF 2 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 6 for VF 4 For instruction:   %v0 = load i8, i8* %in0, align 1
-; AVX2: LV: Found an estimated cost of 12 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
+; AVX2: LV: Found an estimated cost of 9 for VF 8 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 13 for VF 16 For instruction:   %v0 = load i8, i8* %in0, align 1
 ; AVX2: LV: Found an estimated cost of 16 for VF 32 For instruction:   %v0 = load i8, i8* %in0, align 1
 ;
diff --git a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
index f939d05b44b2..281f2dcd2600 100644
--- a/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
+++ b/llvm/test/Analysis/CostModel/X86/interleaved-store-i8-stride-3.ll
@@ -28,7 +28,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; AVX2: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %v2, i8* %out2, align 1
 ; AVX2: LV: Found an estimated cost of 7 for VF 2 For instruction:   store i8 %v2, i8* %out2, align 1
 ; AVX2: LV: Found an estimated cost of 7 for VF 4 For instruction:   store i8 %v2, i8* %out2, align 1
-; AVX2: LV: Found an estimated cost of 14 for VF 8 For instruction:   store i8 %v2, i8* %out2, align 1
+; AVX2: LV: Found an estimated cost of 9 for VF 8 For instruction:   store i8 %v2, i8* %out2, align 1
 ; AVX2: LV: Found an estimated cost of 13 for VF 16 For instruction:   store i8 %v2, i8* %out2, align 1
 ; AVX2: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %v2, i8* %out2, align 1
 ;