[MCA] Correctly check pipeline availability for partially overlapping resource groups.

This patch mostly reverts commit 70b37f4c03 which fixed PR50725.

In case of explicit consumption of multiple partially overlapping group
resources, the ResourceManager was not correctly checking pipeline
resources availability.

The fix for PR50725 only partially addressed a few instances of that issue.
This is a more general (although, technically slower) fix for that same issue.

It also fixes Issue #57548.

Thanks to Haohai Wen for the small reproducer.
This commit is contained in:
Andrea Di Biagio 2022-09-07 11:27:22 +01:00
parent 8019b46bc7
commit 3262794804
4 changed files with 88 additions and 56 deletions
llvm
include/llvm/MCA
lib/MCA
test/tools/llvm-mca/X86/AlderlakeP

View File

@ -458,9 +458,6 @@ struct InstrDesc {
// A bitmask of used processor resource units.
uint64_t UsedProcResUnits;
// A bitmask of implicit uses of processor resource units.
uint64_t ImplicitlyUsedProcResUnits;
// A bitmask of used processor resource groups.
uint64_t UsedProcResGroups;
@ -481,6 +478,9 @@ struct InstrDesc {
// recycled.
unsigned IsRecyclable : 1;
// True if some of the consumed group resources are partially overlapping.
unsigned HasPartiallyOverlappingGroups : 1;
// A zero latency instruction doesn't consume any scheduler resources.
bool isZeroLatency() const { return !MaxLatency && Resources.empty(); }

View File

@ -281,26 +281,67 @@ void ResourceManager::releaseBuffers(uint64_t ConsumedBuffers) {
uint64_t ResourceManager::checkAvailability(const InstrDesc &Desc) const {
uint64_t BusyResourceMask = 0;
uint64_t ConsumedResourceMask = 0;
DenseMap<uint64_t, unsigned> AvailableUnits;
for (const std::pair<uint64_t, ResourceUsage> &E : Desc.Resources) {
unsigned NumUnits = E.second.isReserved() ? 0U : E.second.NumUnits;
unsigned Index = getResourceStateIndex(E.first);
if (!Resources[Index]->isReady(NumUnits))
const ResourceState &RS = *Resources[getResourceStateIndex(E.first)];
if (!RS.isReady(NumUnits)) {
BusyResourceMask |= E.first;
}
continue;
}
uint64_t ImplicitUses = Desc.ImplicitlyUsedProcResUnits;
while (ImplicitUses) {
uint64_t Use = ImplicitUses & -ImplicitUses;
ImplicitUses ^= Use;
unsigned Index = getResourceStateIndex(Use);
if (!Resources[Index]->isReady(/* NumUnits */ 1))
BusyResourceMask |= Index;
if (Desc.HasPartiallyOverlappingGroups && !RS.isAResourceGroup()) {
unsigned NumAvailableUnits = countPopulation(RS.getReadyMask());
NumAvailableUnits -= NumUnits;
AvailableUnits[E.first] = NumAvailableUnits;
if (!NumAvailableUnits)
ConsumedResourceMask |= E.first;
}
}
BusyResourceMask &= ProcResUnitMask;
if (BusyResourceMask)
return BusyResourceMask;
return Desc.UsedProcResGroups & ReservedResourceGroups;
BusyResourceMask = Desc.UsedProcResGroups & ReservedResourceGroups;
if (!Desc.HasPartiallyOverlappingGroups || BusyResourceMask)
return BusyResourceMask;
// If this instruction has overlapping groups, make sure that we can
// select at least one unit per group.
for (const std::pair<uint64_t, ResourceUsage> &E : Desc.Resources) {
const ResourceState &RS = *Resources[getResourceStateIndex(E.first)];
if (!E.second.isReserved() && RS.isAResourceGroup()) {
uint64_t ReadyMask = RS.getReadyMask() & ~ConsumedResourceMask;
if (!ReadyMask) {
BusyResourceMask |= RS.getReadyMask();
continue;
}
uint64_t ResourceMask = PowerOf2Floor(ReadyMask);
auto it = AvailableUnits.find(ResourceMask);
if (it == AvailableUnits.end()) {
unsigned Index = getResourceStateIndex(ResourceMask);
unsigned NumUnits = countPopulation(Resources[Index]->getReadyMask());
it =
AvailableUnits.insert(std::make_pair(ResourceMask, NumUnits)).first;
}
if (!it->second) {
BusyResourceMask |= it->first;
continue;
}
it->second--;
if (!it->second)
ConsumedResourceMask |= it->first;
}
}
return BusyResourceMask;
}
void ResourceManager::issueInstruction(

View File

@ -112,13 +112,12 @@ static void initializeUsedResources(InstrDesc &ID,
uint64_t UsedResourceUnits = 0;
uint64_t UsedResourceGroups = 0;
auto GroupIt = find_if(Worklist, [](const ResourcePlusCycles &Elt) {
return countPopulation(Elt.first) > 1;
});
unsigned FirstGroupIdx = std::distance(Worklist.begin(), GroupIt);
uint64_t ImpliedUsesOfResourceUnits = 0;
uint64_t UnitsFromResourceGroups = 0;
// Remove cycles contributed by smaller resources, and check if there
// are partially overlapping resource groups.
ID.HasPartiallyOverlappingGroups = false;
// Remove cycles contributed by smaller resources.
for (unsigned I = 0, E = Worklist.size(); I < E; ++I) {
ResourcePlusCycles &A = Worklist[I];
if (!A.second.size()) {
@ -129,21 +128,17 @@ static void initializeUsedResources(InstrDesc &ID,
ID.Resources.emplace_back(A);
uint64_t NormalizedMask = A.first;
if (countPopulation(A.first) == 1) {
UsedResourceUnits |= A.first;
} else {
// Remove the leading 1 from the resource group mask.
NormalizedMask ^= PowerOf2Floor(NormalizedMask);
UsedResourceGroups |= (A.first ^ NormalizedMask);
if (UnitsFromResourceGroups & NormalizedMask)
ID.HasPartiallyOverlappingGroups = true;
uint64_t AvailableMask = NormalizedMask & ~UsedResourceUnits;
if ((NormalizedMask != AvailableMask) &&
countPopulation(AvailableMask) == 1) {
// At simulation time, this resource group use will decay into a simple
// use of the resource unit identified by `AvailableMask`.
ImpliedUsesOfResourceUnits |= AvailableMask;
UsedResourceUnits |= AvailableMask;
}
UnitsFromResourceGroups |= NormalizedMask;
UsedResourceGroups |= (A.first ^ NormalizedMask);
}
for (unsigned J = I + 1; J < E; ++J) {
@ -156,31 +151,6 @@ static void initializeUsedResources(InstrDesc &ID,
}
}
// Look for implicit uses of processor resource units. These are resource
// units which are indirectly consumed by resource groups, and that must be
// always available on instruction issue.
while (ImpliedUsesOfResourceUnits) {
ID.ImplicitlyUsedProcResUnits |= ImpliedUsesOfResourceUnits;
ImpliedUsesOfResourceUnits = 0;
for (unsigned I = FirstGroupIdx, E = Worklist.size(); I < E; ++I) {
ResourcePlusCycles &A = Worklist[I];
if (!A.second.size())
continue;
uint64_t NormalizedMask = A.first;
assert(countPopulation(NormalizedMask) > 1);
// Remove the leading 1 from the resource group mask.
NormalizedMask ^= PowerOf2Floor(NormalizedMask);
uint64_t AvailableMask = NormalizedMask & ~UsedResourceUnits;
if ((NormalizedMask != AvailableMask) &&
countPopulation(AvailableMask) != 1)
continue;
UsedResourceUnits |= AvailableMask;
ImpliedUsesOfResourceUnits |= AvailableMask;
}
}
// A SchedWrite may specify a number of cycles in which a resource group
// is reserved. For example (on target x86; cpu Haswell):
//
@ -240,10 +210,10 @@ static void initializeUsedResources(InstrDesc &ID,
BufferIDs ^= Current;
}
dbgs() << "\t\t Used Units=" << format_hex(ID.UsedProcResUnits, 16) << '\n';
dbgs() << "\t\tImplicitly Used Units="
<< format_hex(ID.ImplicitlyUsedProcResUnits, 16) << '\n';
dbgs() << "\t\tUsed Groups=" << format_hex(ID.UsedProcResGroups, 16)
<< '\n';
dbgs() << "\t\tHasPartiallyOverlappingGroups="
<< ID.HasPartiallyOverlappingGroups << '\n';
});
}

View File

@ -0,0 +1,21 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=alderlake -all-views=false -summary-view < %s | FileCheck %s
# Issue #57548
# Do not crash when simulating instructions that consume partially overlapping
# resource groups.
# The RUN line turns every view off and then re-enables only the summary view,
# so the CHECK lines below match the summary report alone.
# Both shifts read their count from %xmm1; the second shift also writes %xmm1,
# and the final vpand reads %ymm1 and %ymm0.
vpsllw %xmm1, %ymm0, %ymm0
vpsllw %xmm1, %xmm2, %xmm1
vpand %ymm1, %ymm0, %ymm0
# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 300
# CHECK-NEXT: Total Cycles: 503
# CHECK-NEXT: Total uOps: 500
# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.99
# CHECK-NEXT: IPC: 0.60
# CHECK-NEXT: Block RThroughput: 1.0