Add a bad char heuristic to StringRef::find.

Based on Horspool's simplified version of Boyer-Moore. We use a constant-sized table of
uint8_ts to keep cache thrashing low; needles bigger than 255 bytes are uncommon anyway.

The worst case is still O(n*m) but we do a lot better on the average case now.

llvm-svn: 142061
This commit is contained in:
Benjamin Kramer 2011-10-15 10:08:31 +00:00
parent ebe13bc3f1
commit 4d681d7dc4
2 changed files with 32 additions and 3 deletions

View File

@ -144,9 +144,32 @@ size_t StringRef::find(StringRef Str, size_t From) const {
size_t N = Str.size();
if (N > Length)
return npos;
for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
if (substr(i, N).equals(Str))
return i;
// For short haystacks or unsupported needles fall back to the naive algorithm
if (Length < 16 || N > 255 || N == 0) {
for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
if (substr(i, N).equals(Str))
return i;
return npos;
}
// Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
uint8_t BadCharSkip[256];
std::memset(BadCharSkip, N, 256);
for (unsigned i = 0; i != N-1; ++i)
BadCharSkip[(uint8_t)Str[i]] = N-1-i;
unsigned Len = Length, Pos = min(From, Length);
while (Len >= N) {
if (substr(Pos, N).equals(Str)) // See if this is the correct substring.
return Pos;
// Otherwise skip the appropriate number of bytes.
uint8_t Skip = BadCharSkip[(uint8_t)Data[Pos+N-1]];
Len -= Skip;
Pos += Skip;
}
return npos;
}

View File

@ -245,6 +245,12 @@ TEST(StringRefTest, Find) {
EXPECT_EQ(StringRef::npos, Str.find("zz"));
EXPECT_EQ(2U, Str.find("ll", 2));
EXPECT_EQ(StringRef::npos, Str.find("ll", 3));
EXPECT_EQ(0U, Str.find(""));
StringRef LongStr("hellx xello hell ello world foo bar hello");
EXPECT_EQ(36U, LongStr.find("hello"));
EXPECT_EQ(28U, LongStr.find("foo"));
EXPECT_EQ(12U, LongStr.find("hell", 2));
EXPECT_EQ(0U, LongStr.find(""));
EXPECT_EQ(3U, Str.rfind('l'));
EXPECT_EQ(StringRef::npos, Str.rfind('z'));