[MachineOutliner][NFC] Traverse suffix tree using a RepeatedSubstring iterator

This takes the traversal methods introduced in r346269 and adapts them
into an iterator. This allows the outliner to iterate over repeated substrings
within the suffix tree directly without having to initially find all of the
substrings and then iterate over them after you've found them.

llvm-svn: 346345
This commit is contained in:
Jessica Paquette 2018-11-07 19:20:55 +00:00
parent 23f7106ecc
commit a409cc959b
1 changed files with 111 additions and 53 deletions

View File

@ -479,44 +479,6 @@ private:
return SuffixesToAdd; return SuffixesToAdd;
} }
/// Helper function for findRepeatedSubstrings.
/// Traverses the suffix tree that finds all nodes associated with a repeated
/// substring. That is, all internal non-root nodes. If the given node has
/// more than one leaf child, store the repeated strings in Substrings.
void
findRepeatedSubstringsHelper(SuffixTreeNode &Curr,
std::vector<RepeatedSubstring> &Substrings,
const unsigned MinLength = 1) {
assert(!Curr.isLeaf() && "Visited a leaf?");
std::vector<SuffixTreeNode *> LeafChildren;
unsigned Length = Curr.ConcatLen;
for (auto &ChildPair : Curr.Children) {
if (!ChildPair.second->isLeaf())
findRepeatedSubstringsHelper(*ChildPair.second, Substrings, MinLength);
else if (Length >= MinLength)
LeafChildren.push_back(ChildPair.second);
}
// The root node never has repeats. Quit here.
if (Curr.isRoot())
return;
// If there are no occurrences of the minimum length, then quit.
if (LeafChildren.empty() || LeafChildren.size() < 2)
return;
// We have a node associated with a repeated substring. Store that in
// Substrings and move on.
RepeatedSubstring RS;
RS.Length = Length;
// Each occurrence starts at a suffix given by a leaf child.
for (SuffixTreeNode *Leaf : LeafChildren)
RS.StartIndices.push_back(Leaf->SuffixIdx);
Substrings.push_back(RS);
}
public: public:
/// Construct a suffix tree from a sequence of unsigned integers. /// Construct a suffix tree from a sequence of unsigned integers.
/// ///
@ -545,14 +507,115 @@ public:
setSuffixIndices(*Root, 0); setSuffixIndices(*Root, 0);
} }
/// Finds all repeated substrings with an optionally-provided minimum length
/// and stores them in \p Substrings. /// Iterator for finding all repeated substrings in the suffix tree.
/// If \p MinLength is provided, only return those with a given minimum struct RepeatedSubstringIterator {
/// length. private:
void findRepeatedSubstrings(std::vector<RepeatedSubstring> &Substrings, /// The current node we're visiting.
const unsigned MinLength = 1) { SuffixTreeNode *N = nullptr;
findRepeatedSubstringsHelper(*Root, Substrings, MinLength);
} /// The repeated substring associated with this node.
RepeatedSubstring RS;
/// The nodes left to visit.
std::vector<SuffixTreeNode *> ToVisit;
/// The minimum length of a repeated substring to find.
/// Since we're outlining, we want at least two instructions in the range.
/// FIXME: This may not be true for targets like X86 which support many
/// instruction lengths.
const unsigned MinLength = 2;
/// Move the iterator to the next repeated substring.
void advance() {
// Clear the current state. If we're at the end of the range, then this
// is the state we want to be in.
RS = RepeatedSubstring();
N = nullptr;
// Continue visiting nodes until we find one which repeats more than once.
while (!ToVisit.empty()) {
SuffixTreeNode *Curr = ToVisit.back();
ToVisit.pop_back();
// Keep track of the length of the string associated with the node. If
// it's too short, we'll quit.
unsigned Length = Curr->ConcatLen;
// Each leaf node represents a repeat of a string.
std::vector<SuffixTreeNode *> LeafChildren;
// Iterate over each child, saving internal nodes for visiting, and
// leaf nodes in LeafChildren. Internal nodes represent individual
// strings, which may repeat.
for (auto &ChildPair : Curr->Children) {
// Save all of this node's children for processing.
if (!ChildPair.second->isLeaf())
ToVisit.push_back(ChildPair.second);
// It's not an internal node, so it must be a leaf. If we have a
// long enough string, then save the leaf children.
else if (Length >= MinLength)
LeafChildren.push_back(ChildPair.second);
}
// The root never represents a repeated substring. If we're looking at
// that, then skip it.
if (Curr->isRoot())
continue;
// Do we have any repeated substrings?
if (LeafChildren.size() >= 2) {
// Yes. Update the state to reflect this, and then bail out.
N = Curr;
RS.Length = Length;
for (SuffixTreeNode *Leaf : LeafChildren)
RS.StartIndices.push_back(Leaf->SuffixIdx);
break;
}
}
// At this point, either NewRS is an empty RepeatedSubstring, or it was
// set in the above loop. Similarly, N is either nullptr, or the node
// associated with NewRS.
}
public:
/// Return the current repeated substring.
RepeatedSubstring &operator*() { return RS; }
RepeatedSubstringIterator &operator++() {
advance();
return *this;
}
RepeatedSubstringIterator operator++(int I) {
RepeatedSubstringIterator It(*this);
advance();
return It;
}
bool operator==(const RepeatedSubstringIterator &Other) {
return N == Other.N;
}
bool operator!=(const RepeatedSubstringIterator &Other) {
return !(*this == Other);
}
RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
// Do we have a non-null node?
if (N) {
// Yes. At the first step, we need to visit all of N's children.
// Note: This means that we visit N last.
ToVisit.push_back(N);
advance();
}
}
};
typedef RepeatedSubstringIterator iterator;
iterator begin() { return iterator(Root); }
iterator end() { return iterator(nullptr); }
}; };
/// Maps \p MachineInstrs to unsigned integers and stores the mappings. /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
@ -963,13 +1026,8 @@ unsigned MachineOutliner::findCandidates(
// First, find dall of the repeated substrings in the tree of minimum length // First, find dall of the repeated substrings in the tree of minimum length
// 2. // 2.
// FIXME: 2 is an approximation which isn't necessarily true for, say, X86. for (auto It = ST.begin(), Et = ST.end(); It != Et; ++It) {
// If we factor in instruction lengths, we need more information than this. SuffixTree::RepeatedSubstring RS = *It;
// FIXME: It'd be nice if we could just have a repeated substring iterator.
std::vector<SuffixTree::RepeatedSubstring> RepeatedSubstrings;
ST.findRepeatedSubstrings(RepeatedSubstrings, 2);
for (SuffixTree::RepeatedSubstring &RS : RepeatedSubstrings) {
std::vector<Candidate> CandidatesForRepeatedSeq; std::vector<Candidate> CandidatesForRepeatedSeq;
unsigned StringLen = RS.Length; unsigned StringLen = RS.Length;
for (const unsigned &StartIdx : RS.StartIndices) { for (const unsigned &StartIdx : RS.StartIndices) {