[X86][AMX] Split greedy RA for tile register

When we fill the shape into the tile configure memory, the shape is taken
from the AMX pseudo instruction. However, the register that holds the shape
may be split or spilled by the greedy RA. In that case the shape is written
into the config memory after ldtilecfg has already executed, so the shape
configuration would be wrong.
This patch splits tile register allocation out of greedy register
allocation, so that after the tile registers are allocated the shape
registers are still virtual registers. The shape registers may only be
redefined or multiply defined by the phi elimination pass and the
two-address pass, which does not affect the tile register configuration.
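In terms of the pass pipeline, the change boils down to the sketch below, condensed from the X86 pass-config hunk further down (all names are taken from that hunk): a first greedy run restricted to the TILE register class, the tile configure pass while the shape registers are still virtual, and then the default optimized allocation.

static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI,
                                      const TargetRegisterClass &RC) {
  // Restrict the first greedy run to the AMX tile register class only.
  return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(&RC);
}

bool X86PassConfig::addRegAssignAndRewriteOptimized() {
  // Skip the split when an allocator is forced via -regalloc=...
  if (!isCustomizedRegAlloc() && EnableTileRAPass) {
    // 1. Allocate tile registers; shape registers remain virtual.
    addPass(createGreedyRegisterAllocator(onlyAllocateTileRegisters));
    // 2. Emit the shape configuration while the shapes are still virtual.
    addPass(createX86TileConfigPass());
  }
  // 3. Fall back to the normal optimized allocation for everything else.
  return TargetPassConfig::addRegAssignAndRewriteOptimized();
}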

Differential Revision: https://reviews.llvm.org/D128584
Luo, Yuanke 2022-06-25 22:00:44 +08:00
parent fc2d96c334
commit 5cb0979870
13 changed files with 334 additions and 33 deletions


@@ -345,6 +345,9 @@ protected:
// Helper to verify the analysis is really immutable.
void setOpt(bool &Opt, bool Val);
/// Return true if register allocator is specified by -regalloc=override.
bool isCustomizedRegAlloc();
/// Methods with trivial inline returns are convenient points in the common
/// codegen pass pipeline where targets may insert passes. Methods with
/// out-of-line standard implementations are major CodeGen stages called by


@@ -1405,6 +1405,11 @@ FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) {
return createTargetRegisterAllocator(Optimized);
}
bool TargetPassConfig::isCustomizedRegAlloc() {
return RegAlloc !=
(RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator;
}
bool TargetPassConfig::addRegAssignAndRewriteFast() {
if (RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&useDefaultRegisterAllocator &&
RegAlloc != (RegisterRegAlloc::FunctionPassCtor)&createFastRegisterAllocator)


@@ -676,6 +676,10 @@ bool X86RegisterInfo::isFixedRegister(const MachineFunction &MF,
return X86GenRegisterInfo::isFixedRegister(MF, PhysReg);
}
bool X86RegisterInfo::isTileRegisterClass(const TargetRegisterClass *RC) const {
return RC->getID() == X86::TILERegClassID;
}
void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
// Check if the EFLAGS register is marked as live-out. This shouldn't happen,
// because the calling convention defines the EFLAGS register as NOT


@@ -120,6 +120,9 @@ public:
bool isArgumentRegister(const MachineFunction &MF,
MCRegister Reg) const override;
/// Return true if it is tile register class.
bool isTileRegisterClass(const TargetRegisterClass *RC) const;
/// Returns true if PhysReg is a fixed register.
bool isFixedRegister(const MachineFunction &MF,
MCRegister PhysReg) const override;


@@ -36,6 +36,7 @@
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
@@ -58,6 +59,11 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);
static cl::opt<bool>
EnableTileRAPass("x86-tile-ra",
cl::desc("Enable the tile register allocation pass"),
cl::init(true), cl::Hidden);
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
// Register the target.
RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
@@ -386,7 +392,7 @@ public:
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreSched2() override;
bool addPreRewrite() override;
bool addRegAssignAndRewriteOptimized() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
@@ -612,11 +618,21 @@ bool X86PassConfig::addPostFastRegAllocRewrite() {
return true;
}
bool X86PassConfig::addPreRewrite() {
addPass(createX86TileConfigPass());
return true;
}
std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
return getStandardCSEConfigForOpt(TM->getOptLevel());
}
static bool onlyAllocateTileRegisters(const TargetRegisterInfo &TRI,
const TargetRegisterClass &RC) {
return static_cast<const X86RegisterInfo &>(TRI).isTileRegisterClass(&RC);
}
bool X86PassConfig::addRegAssignAndRewriteOptimized() {
// Don't support tile RA when RA is specified by command line "-regalloc".
if (!isCustomizedRegAlloc() && EnableTileRAPass) {
// Allocate tile register first.
addPass(createGreedyRegisterAllocator(onlyAllocateTileRegisters));
addPass(createX86TileConfigPass());
}
return TargetPassConfig::addRegAssignAndRewriteOptimized();
}


@@ -36,7 +36,7 @@
using namespace llvm;
#define DEBUG_TYPE "tile-config"
#define DEBUG_TYPE "tileconfig"
namespace {
@@ -70,11 +70,11 @@ struct X86TileConfig : public MachineFunctionPass {
char X86TileConfig::ID = 0;
INITIALIZE_PASS_BEGIN(X86TileConfig, "tileconfig", "Tile Register Configure",
INITIALIZE_PASS_BEGIN(X86TileConfig, DEBUG_TYPE, "Tile Register Configure",
false, false)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(X86TileConfig, "tileconfig", "Tile Register Configure",
false, false)
INITIALIZE_PASS_END(X86TileConfig, DEBUG_TYPE, "Tile Register Configure", false,
false)
bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
@@ -123,6 +123,8 @@ bool X86TileConfig::runOnMachineFunction(MachineFunction &MF) {
continue;
if (MRI.getRegClass(VirtReg)->getID() != X86::TILERegClassID)
continue;
if (VRM.getPhys(VirtReg) == VirtRegMap::NO_PHYS_REG)
continue;
unsigned Index = VRM.getPhys(VirtReg) - X86::TMM0;
if (!Phys2Virt[Index])
Phys2Virt[Index] = VirtReg;


@@ -485,14 +485,14 @@ define dso_local void @test_loop2(i32 %0) nounwind {
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: subq $1088, %rsp # imm = 0x440
; CHECK-NEXT: movl %edi, %ebx
; CHECK-NEXT: movl %edi, %r15d
; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vmovups %zmm0, (%rsp)
; CHECK-NEXT: movb $1, (%rsp)
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movl $buf, %r14d
; CHECK-NEXT: movl $32, %r15d
; CHECK-NEXT: movl $32, %ebx
; CHECK-NEXT: movw $8, %bp
; CHECK-NEXT: movl $buf+2048, %r12d
; CHECK-NEXT: .p2align 4, 0x90
@@ -500,17 +500,17 @@ define dso_local void @test_loop2(i32 %0) nounwind {
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg (%rsp)
; CHECK-NEXT: testl %ebx, %ebx
; CHECK-NEXT: testl %r15d, %r15d
; CHECK-NEXT: jle .LBB3_3
; CHECK-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0
; CHECK-NEXT: tileloadd (%r14,%rbx), %tmm0
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tilestored %tmm0, 64(%rsp,%rax) # 1024-byte Folded Spill
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg (%rsp)
; CHECK-NEXT: movabsq $64, %rax
; CHECK-NEXT: tileloadd 64(%rsp,%rax), %tmm0 # 1024-byte Folded Reload
; CHECK-NEXT: tilestored %tmm0, (%r12,%r15)
; CHECK-NEXT: tilestored %tmm0, (%r12,%rbx)
; CHECK-NEXT: callq foo
; CHECK-NEXT: jmp .LBB3_1
; CHECK-NEXT: .LBB3_3:


@@ -0,0 +1,227 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -stop-before virtregrewriter | FileCheck %s
define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row_from, i32 %c_row_to, i32 %c_row_tile, i32 %c_col_from, i32 %c_col_to, i32 %c_col_tile) {
; CHECK-LABEL: name: foo
; CHECK: bb.0.entry:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.4(0x40000000)
; CHECK-NEXT: liveins: $esi, $edx, $rcx, $r8, $r9
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $r9
; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $r8
; CHECK-NEXT: MOV64mr %stack.1, 1, $noreg, 0, $noreg, $rcx :: (store (s64) into %stack.1)
; CHECK-NEXT: undef %82.sub_32bit:gr64_with_sub_8bit = COPY $edx
; CHECK-NEXT: undef %84.sub_32bit:gr64_nosp = COPY $esi
; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
; CHECK-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.4, align 8)
; CHECK-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.5, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.5, align 16)
; CHECK-NEXT: [[LEA64_32r:%[0-9]+]]:gr32 = LEA64_32r %82, 1, $noreg, 63, $noreg
; CHECK-NEXT: TEST32rr %82.sub_32bit, %82.sub_32bit, implicit-def $eflags
; CHECK-NEXT: [[CMOV32rr:%[0-9]+]]:gr32 = CMOV32rr [[CMOV32rr]], %82.sub_32bit, 9, implicit killed $eflags
; CHECK-NEXT: CMP32rr [[MOV32rm1]], [[MOV32rm]], implicit-def $eflags
; CHECK-NEXT: JCC_1 %bb.4, 13, implicit killed $eflags
; CHECK-NEXT: JMP_1 %bb.1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1.for.cond14.preheader.lr.ph:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef %88.sub_32bit:gr64_nosp = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.0, align 8)
; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, %88.sub_16bit :: (store (s512) into %stack.0 + 16, align 4)
; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16)
; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 49, $noreg, [[MOV32rm2]].sub_8bit :: (store (s512) into %stack.0 + 49, align 1, basealign 4)
; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, [[MOV32rm2]].sub_8bit :: (store (s512) into %stack.0 + 48, align 4)
; CHECK-NEXT: [[AND32ri8_:%[0-9]+]]:gr32 = AND32ri8 [[AND32ri8_]], -64, implicit-def dead $eflags
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr32 = COPY %82.sub_32bit
; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 18, $noreg, [[COPY2]].sub_16bit :: (store (s512) into %stack.0 + 18, align 2, basealign 4)
; CHECK-NEXT: [[SUB32rr:%[0-9]+]]:gr32 = SUB32rr [[SUB32rr]], [[AND32ri8_]], implicit-def dead $eflags
; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 18, $noreg, [[SUB32rr]].sub_16bit :: (store (s512) into %stack.0 + 18, align 2, basealign 4)
; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[SUB32rr]].sub_16bit
; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 50, $noreg, [[MOVZX32rr16_]].sub_8bit :: (store (s512) into %stack.0 + 50, align 2, basealign 4)
; CHECK-NEXT: [[SHR32ri:%[0-9]+]]:gr32 = SHR32ri [[SHR32ri]], 2, implicit-def dead $eflags
; CHECK-NEXT: MOV32mr %stack.2, 1, $noreg, 0, $noreg, [[SHR32ri]] :: (store (s32) into %stack.2)
; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 50, $noreg, [[SHR32ri]].sub_8bit :: (store (s512) into %stack.0 + 50, align 2, basealign 4)
; CHECK-NEXT: [[LEA64_32r:%[0-9]+]]:gr32 = LEA64_32r $noreg, 4, %88, 0, $noreg
; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 20, $noreg, [[LEA64_32r]].sub_16bit :: (store (s512) into %stack.0 + 20, align 4)
; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = MOVSX64rr32 %82.sub_32bit
; CHECK-NEXT: %82.sub_32bit:gr64_with_sub_8bit = nsw SUB32rr %82.sub_32bit, [[SUB32rr]], implicit-def dead $eflags
; CHECK-NEXT: undef %102.sub_32bit:gr64_with_sub_8bit = MOVZX32rr16 %82.sub_16bit
; CHECK-NEXT: MOV64mr %stack.3, 1, $noreg, 0, $noreg, %102 :: (store (s64) into %stack.3)
; CHECK-NEXT: undef %61.sub_32bit:gr64_with_sub_8bit = COPY %102.sub_32bit
; CHECK-NEXT: %61.sub_32bit:gr64_with_sub_8bit = IMUL32rr %61.sub_32bit, %84.sub_32bit, implicit-def dead $eflags
; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = LEA64_32r $noreg, 4, %84, 0, $noreg
; CHECK-NEXT: [[MOVSX64rr32_1:%[0-9]+]]:gr64 = MOVSX64rr32 [[LEA64_32r1]]
; CHECK-NEXT: MOV64mr %stack.4, 1, $noreg, 0, $noreg, [[MOVSX64rr32_1]] :: (store (s64) into %stack.4)
; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = MOVSX64rr32 %84.sub_32bit
; CHECK-NEXT: [[MOVSX64rm32_:%[0-9]+]]:gr64_nosp = MOVSX64rm32 %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = MOVSX64rr32 %88.sub_32bit
; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
; CHECK-NEXT: [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]]
; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm2]]
; CHECK-NEXT: [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]]
; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8)
; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]]
; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64_nosp = IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_2]], implicit-def dead $eflags
; CHECK-NEXT: [[ADD64rr:%[0-9]+]]:gr64_nosp = ADD64rr [[ADD64rr]], [[MOVSX64rm32_]], implicit-def dead $eflags
; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[ADD64rr]], 0, $noreg
; CHECK-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9)
; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7)
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]]
; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64 = IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_2]], implicit-def dead $eflags
; CHECK-NEXT: [[SHL64ri:%[0-9]+]]:gr64 = SHL64ri [[SHL64ri]], 2, implicit-def dead $eflags
; CHECK-NEXT: MOV64mr %stack.10, 1, $noreg, 0, $noreg, [[SHL64ri]] :: (store (s64) into %stack.10)
; CHECK-NEXT: [[LEA64r1:%[0-9]+]]:gr64 = LEA64r $noreg, 4, [[MOVSX64rr32_3]], 0, $noreg
; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_]] :: (store (s64) into %stack.5)
; CHECK-NEXT: [[LEA64_32r2:%[0-9]+]]:gr32 = LEA64_32r %61, 4, [[MOVSX64rm32_]], 0, $noreg
; CHECK-NEXT: MOV32mr %stack.11, 1, $noreg, 0, $noreg, [[LEA64_32r2]] :: (store (s32) into %stack.11)
; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[LEA64r1]] :: (store (s64) into %stack.13)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2.for.cond14.preheader:
; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[MOV32rm3:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
; CHECK-NEXT: CMP32rm [[MOV32rm3]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16)
; CHECK-NEXT: JCC_1 %bb.5, 13, implicit killed $eflags
; CHECK-NEXT: JMP_1 %bb.3
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3.for.body17.lr.ph:
; CHECK-NEXT: successors: %bb.6(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
; CHECK-NEXT: [[IMUL64rr:%[0-9]+]]:gr64 = nsw IMUL64rr [[IMUL64rr]], [[MOVSX64rr32_]], implicit-def dead $eflags
; CHECK-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm]], %stack.3, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.3)
; CHECK-NEXT: [[ADD64rm1:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm1]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1)
; CHECK-NEXT: MOV64mr %stack.12, 1, $noreg, 0, $noreg, [[ADD64rm1]] :: (store (s64) into %stack.12)
; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %stack.11, 1, $noreg, 0, $noreg :: (load (s32) from %stack.11)
; CHECK-NEXT: undef %68.sub_32bit:gr64_nosp = COPY [[MOV32rm4]]
; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
; CHECK-NEXT: JMP_1 %bb.6
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.4.for.cond.cleanup:
; CHECK-NEXT: RET 0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.5.for.cond.cleanup16:
; CHECK-NEXT: successors: %bb.2(0x7c000000), %bb.4(0x04000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
; CHECK-NEXT: [[ADD64rm1:%[0-9]+]]:gr64 = ADD64rm [[ADD64rm1]], %stack.7, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.7)
; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.10, 1, $noreg, 0, $noreg :: (load (s64) from %stack.10)
; CHECK-NEXT: ADD64mr %stack.9, 1, $noreg, 0, $noreg, [[MOV64rm2]], implicit-def dead $eflags :: (store (s64) into %stack.9)
; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[ADD64rm1]] :: (store (s64) into %stack.6)
; CHECK-NEXT: CMP64rm [[ADD64rm1]], %stack.8, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s64) from %stack.8)
; CHECK-NEXT: JCC_1 %bb.2, 12, implicit killed $eflags
; CHECK-NEXT: JMP_1 %bb.4
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.6.for.body17:
; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[MOV32rm2]].sub_16bit, %88.sub_16bit
; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.12, 1, $noreg, 0, $noreg :: (load (s64) from %stack.12)
; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm2]].sub_16bit, [[SUB32rr]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_]], 0, $noreg
; CHECK-NEXT: [[MOVSX64rr32_7:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[MOVSX64rr32_7]].sub_32bit
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr64 = COPY %88
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]]
; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]]
; CHECK-NEXT: [[COPY5:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]]
; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]]
; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr32 = COPY [[MOV32rm2]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr32 = COPY [[SUB32rr]]
; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64 = COPY [[COPY1]]
; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY9]], 1, [[MOVSX64rr32_7]], 0, $noreg
; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64_nosp = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
; Check LEA64_32r register is split to COPY10
; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r]]
; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2)
; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm4]], 0, $noreg
; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr32 = COPY [[COPY10]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64 = COPY [[COPY9]]
; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY8]]
; CHECK-NEXT: [[MOV64rm5:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13)
; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY7]]
; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[COPY6]]
; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64_nosp = COPY [[COPY5]]
; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64_nosp = COPY [[COPY3]]
; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64_nosp = COPY [[COPY2]]
; CHECK-NEXT: [[PTDPBSSDV:%[0-9]+]]:tile = PTDPBSSDV [[COPY13]].sub_16bit, [[COPY11]].sub_16bit, [[COPY12]].sub_16bit, [[PTDPBSSDV]], [[PTILELOADDV]], [[PTILELOADDV1]]
; CHECK-NEXT: PTILESTOREDV [[COPY13]].sub_16bit, [[COPY18]].sub_16bit, [[MOV64rm]], 1, [[COPY16]], 0, $noreg, [[PTDPBSSDV]]
; CHECK-NEXT: [[ADD64rr1:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr1]], [[COPY15]], implicit-def dead $eflags
; CHECK-NEXT: [[ADD64rr2:%[0-9]+]]:gr64 = ADD64rr [[ADD64rr2]], [[MOV64rm5]], implicit-def dead $eflags
; CHECK-NEXT: [[MOVSX64rr32_7]].sub_32bit:gr64_nosp = ADD32rr [[MOVSX64rr32_7]].sub_32bit, [[COPY11]], implicit-def dead $eflags
; CHECK-NEXT: CMP64rr [[ADD64rr1]], [[COPY14]], implicit-def $eflags
; CHECK-NEXT: JCC_1 %bb.6, 12, implicit killed $eflags
; CHECK-NEXT: JMP_1 %bb.5
entry:
%rem = srem i32 %K, 64
%conv3 = trunc i32 %rem to i16
%conv4 = trunc i32 %c_row_tile to i16
%conv5 = trunc i32 %c_col_tile to i16
%0 = lshr i16 %conv3, 2
%conv13 = shl i16 %conv5, 2
%cmp83 = icmp slt i32 %c_row_from, %c_row_to
br i1 %cmp83, label %for.cond14.preheader.lr.ph, label %for.cond.cleanup
for.cond14.preheader.lr.ph: ; preds = %entry
%sub = sub nsw i32 %K, %rem
%conv1 = and i32 %sub, 65535
%cmp1581 = icmp slt i32 %c_col_from, %c_col_to
%conv20 = sext i32 %K to i64
%mul22 = mul nsw i32 %conv1, %N
%mul27 = shl nsw i32 %N, 2
%conv28 = sext i32 %mul27 to i64
%conv34 = sext i32 %N to i64
%1 = sext i32 %c_col_from to i64
%2 = sext i32 %c_col_tile to i64
%3 = sext i32 %c_col_to to i64
%4 = sext i32 %c_row_from to i64
%5 = sext i32 %c_row_tile to i64
%6 = zext i32 %conv1 to i64
%7 = sext i32 %c_row_to to i64
br label %for.cond14.preheader
for.cond14.preheader: ; preds = %for.cond.cleanup16, %for.cond14.preheader.lr.ph
%indvars.iv87 = phi i64 [ %4, %for.cond14.preheader.lr.ph ], [ %indvars.iv.next88, %for.cond.cleanup16 ]
br i1 %cmp1581, label %for.body17.lr.ph, label %for.cond.cleanup16
for.body17.lr.ph: ; preds = %for.cond14.preheader
%8 = mul nsw i64 %indvars.iv87, %conv20
%9 = add nsw i64 %8, %6
%arrayidx = getelementptr inbounds i8, ptr %A, i64 %9
%10 = mul nsw i64 %indvars.iv87, %conv34
br label %for.body17
for.cond.cleanup: ; preds = %for.cond.cleanup16, %entry
ret void
for.cond.cleanup16: ; preds = %for.body17, %for.cond14.preheader
%indvars.iv.next88 = add i64 %indvars.iv87, %5
%cmp = icmp slt i64 %indvars.iv.next88, %7
br i1 %cmp, label %for.cond14.preheader, label %for.cond.cleanup
for.body17: ; preds = %for.body17, %for.body17.lr.ph
%indvars.iv = phi i64 [ %1, %for.body17.lr.ph ], [ %indvars.iv.next, %for.body17 ]
%11 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %conv4, i16 %conv5)
%12 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %conv4, i16 %conv3, ptr %arrayidx, i64 %conv20)
%13 = trunc i64 %indvars.iv to i32
%mul23 = shl nsw i32 %13, 2
%add24 = add nsw i32 %mul23, %mul22
%idxprom25 = sext i32 %add24 to i64
%arrayidx26 = getelementptr inbounds i8, ptr %B_rcr4, i64 %idxprom25
%14 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %conv13, ptr %arrayidx26, i64 %conv28)
%15 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %conv4, i16 %conv13, i16 %conv3, x86_amx %11, x86_amx %12, x86_amx %14)
%16 = add nsw i64 %indvars.iv, %10
%arrayidx33 = getelementptr inbounds i32, ptr %C, i64 %16
tail call void @llvm.x86.tilestored64.internal(i16 %conv4, i16 %conv5, ptr %arrayidx33, i64 %conv34, x86_amx %15)
%indvars.iv.next = add i64 %indvars.iv, %2
%cmp15 = icmp slt i64 %indvars.iv.next, %3
br i1 %cmp15, label %for.body17, label %for.cond.cleanup16
}
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, ptr, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, ptr, i64, x86_amx)


@@ -0,0 +1,40 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -stop-after tileconfig | FileCheck %s
; Test that the tile register is allocated in a separate pass.
define i16 @foo(i32 noundef %t, i16 %row, i16 %col) nounwind {
; CHECK-LABEL: name: foo
; CHECK: bb.0.entry:
; CHECK-NEXT: liveins: $esi, $edx
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: undef %12.sub_32bit:gr64_nosp = COPY $edx
; CHECK-NEXT: undef %13.sub_32bit:gr64_with_sub_8bit = COPY $esi
; CHECK-NEXT: [[AVX512_512_SET0_:%[0-9]+]]:vr512 = AVX512_512_SET0
; CHECK-NEXT: VMOVUPSZmr %stack.0, 1, $noreg, 0, $noreg, [[AVX512_512_SET0_]] :: (store (s512) into %stack.0, align 4)
; CHECK-NEXT: MOV8mi %stack.0, 1, $noreg, 0, $noreg, 1 :: (store (s512) into %stack.0, align 4)
; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 20, $noreg, %12.sub_16bit :: (store (s512) into %stack.0 + 20, align 4)
; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 50, $noreg, %13.sub_8bit :: (store (s512) into %stack.0 + 50, align 2, basealign 4)
; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 18, $noreg, %12.sub_16bit :: (store (s512) into %stack.0 + 18, align 2, basealign 4)
; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 49, $noreg, %13.sub_8bit :: (store (s512) into %stack.0 + 49, align 1, basealign 4)
; CHECK-NEXT: MOV16mr %stack.0, 1, $noreg, 16, $noreg, %12.sub_16bit :: (store (s512) into %stack.0 + 16, align 4)
; CHECK-NEXT: MOV8mr %stack.0, 1, $noreg, 48, $noreg, %13.sub_8bit :: (store (s512) into %stack.0 + 48, align 4)
; CHECK-NEXT: PLDTILECFGV %stack.0, 1, $noreg, 0, $noreg, implicit-def dead $tmm0, implicit-def dead $tmm1, implicit-def dead $tmm2, implicit-def dead $tmm3, implicit-def dead $tmm4, implicit-def dead $tmm5, implicit-def dead $tmm6, implicit-def dead $tmm7 :: (load (s512) from %stack.0, align 4)
; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV %13.sub_16bit, %12.sub_16bit
; CHECK-NEXT: [[PTILEZEROV1:%[0-9]+]]:tile = PTILEZEROV %13.sub_16bit, %12.sub_16bit
; CHECK-NEXT: [[PTILEZEROV2:%[0-9]+]]:tile = PTILEZEROV %13.sub_16bit, %12.sub_16bit
; CHECK-NEXT: dead [[PTILEZEROV2]]:tile = PTDPBSSDV %13.sub_16bit, %12.sub_16bit, %12.sub_16bit, [[PTILEZEROV2]], [[PTILEZEROV]], [[PTILEZEROV1]]
; CHECK-NEXT: [[LEA64_32r:%[0-9]+]]:gr32 = LEA64_32r %13, 1, %12, 0, $noreg
; CHECK-NEXT: $ax = COPY [[LEA64_32r]].sub_16bit
; CHECK-NEXT: RET 0, killed $ax
entry:
%0 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
%1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
%2 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %row, i16 %col)
%3 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 %col, x86_amx %2, x86_amx %0, x86_amx %1)
%4 = add i16 %row, %col
ret i16 %4
}
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)


@@ -106,14 +106,14 @@ define dso_local void @test2(ptr%buf) nounwind {
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, %r14w
; CHECK-NEXT: movw $8, %bp
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_3
; CHECK-NEXT: # %bb.1: # %loop.header.preheader
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: xorl %ebp, %ebp
; CHECK-NEXT: movq %rdi, %r14
; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: movl $32, %r15d
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %loop.header
@@ -123,12 +123,12 @@ define dso_local void @test2(ptr%buf) nounwind {
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: tilezero %tmm2
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r15), %tmm1
; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0
; CHECK-NEXT: tileloadd (%r14,%r15), %tmm1
; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT: tilestored %tmm2, (%rbx,%r15)
; CHECK-NEXT: incl %ebp
; CHECK-NEXT: cmpw $100, %bp
; CHECK-NEXT: tilestored %tmm2, (%r14,%r15)
; CHECK-NEXT: incl %ebx
; CHECK-NEXT: cmpw $100, %bx
; CHECK-NEXT: jl .LBB1_2
; CHECK-NEXT: .LBB1_3: # %exit
; CHECK-NEXT: addq $72, %rsp


@@ -131,32 +131,32 @@ define dso_local void @test3(ptr%buf) nounwind {
; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: movw $8, %r15w
; CHECK-NEXT: movw $8, %bp
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: testb %al, %al
; CHECK-NEXT: jne .LBB1_3
; CHECK-NEXT: # %bb.1: # %loop.header.preheader
; CHECK-NEXT: movq %rdi, %rbx
; CHECK-NEXT: movq %rdi, %r15
; CHECK-NEXT: movl $32, %r14d
; CHECK-NEXT: xorl %ebp, %ebp
; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB1_2: # %loop.header
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
; CHECK-NEXT: tilestored %tmm0, (%r15,%r14)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1
; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2
; CHECK-NEXT: tileloadd (%r15,%r14), %tmm1
; CHECK-NEXT: tileloadd (%r15,%r14), %tmm2
; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT: tilestored %tmm0, (%rbx,%r14)
; CHECK-NEXT: tilestored %tmm0, (%r15,%r14)
; CHECK-NEXT: tilezero %tmm0
; CHECK-NEXT: incl %ebp
; CHECK-NEXT: cmpw $100, %bp
; CHECK-NEXT: incl %ebx
; CHECK-NEXT: cmpw $100, %bx
; CHECK-NEXT: jl .LBB1_2
; CHECK-NEXT: .LBB1_3: # %exit
; CHECK-NEXT: addq $72, %rsp


@@ -146,6 +146,7 @@
; CHECK-NEXT: Machine Optimization Remark Emitter
; CHECK-NEXT: Greedy Register Allocator
; CHECK-NEXT: Tile Register Configure
; CHECK-NEXT: Greedy Register Allocator
; CHECK-NEXT: Virtual Register Rewriter
; CHECK-NEXT: Register Allocation Pass Scoring
; CHECK-NEXT: Stack Slot Coloring


@@ -1,4 +1,4 @@
; RUN: llc -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true -pass-remarks-filter=regalloc -pass-remarks-output=%t.yaml -stop-after=greedy -o - < %s 2>&1 | FileCheck %s
; RUN: llc -x86-tile-ra=0 -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true -pass-remarks-filter=regalloc -pass-remarks-output=%t.yaml -stop-after=greedy -o - < %s 2>&1 | FileCheck %s
; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
target triple = "x86_64-unknown-linux-gnu"