2012-02-19 05:00:49 +08:00
|
|
|
//===- llvm/unittest/ADT/HashingTest.cpp ----------------------------------===//
|
|
|
|
//
|
2019-01-19 16:50:56 +08:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2012-02-19 05:00:49 +08:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// Hashing.h unit tests.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "llvm/ADT/Hashing.h"
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
#include "llvm/Support/DataTypes.h"
|
2017-06-06 19:06:56 +08:00
|
|
|
#include "gtest/gtest.h"
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
#include <deque>
|
|
|
|
#include <list>
|
|
|
|
#include <map>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
namespace llvm {
|
|
|
|
|
|
|
|
// Helper for test code to print hash codes.
|
|
|
|
void PrintTo(const hash_code &code, std::ostream *os) {
|
|
|
|
*os << static_cast<size_t>(code);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fake an object that is recognized as hashable data to test super large
|
|
|
|
// objects.
|
|
|
|
struct LargeTestInteger { uint64_t arr[8]; };
|
|
|
|
|
2012-03-03 17:39:54 +08:00
|
|
|
struct NonPOD {
|
|
|
|
uint64_t x, y;
|
|
|
|
NonPOD(uint64_t x, uint64_t y) : x(x), y(y) {}
|
|
|
|
friend hash_code hash_value(const NonPOD &obj) {
|
|
|
|
return hash_combine(obj.x, obj.y);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
namespace hashing {
|
|
|
|
namespace detail {
|
2014-03-07 22:42:25 +08:00
|
|
|
template <> struct is_hashable_data<LargeTestInteger> : std::true_type {};
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
} // namespace detail
|
|
|
|
} // namespace hashing
|
|
|
|
|
|
|
|
} // namespace llvm
|
2012-02-19 05:00:49 +08:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
2012-03-07 17:32:32 +08:00
|
|
|
enum TestEnumeration {
|
|
|
|
TE_Foo = 42,
|
|
|
|
TE_Bar = 43
|
|
|
|
};
|
2012-03-02 18:56:40 +08:00
|
|
|
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
TEST(HashingTest, HashValueBasicTest) {
|
|
|
|
int x = 42, y = 43, c = 'x';
|
2014-06-09 06:29:17 +08:00
|
|
|
void *p = nullptr;
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
uint64_t i = 71;
|
|
|
|
const unsigned ci = 71;
|
|
|
|
volatile int vi = 71;
|
|
|
|
const volatile int cvi = 71;
|
|
|
|
uintptr_t addr = reinterpret_cast<uintptr_t>(&y);
|
2012-03-03 17:39:54 +08:00
|
|
|
EXPECT_EQ(hash_value(42), hash_value(x));
|
2012-03-07 17:32:32 +08:00
|
|
|
EXPECT_EQ(hash_value(42), hash_value(TE_Foo));
|
2012-03-03 17:39:54 +08:00
|
|
|
EXPECT_NE(hash_value(42), hash_value(y));
|
2012-03-07 17:32:32 +08:00
|
|
|
EXPECT_NE(hash_value(42), hash_value(TE_Bar));
|
2012-03-03 17:39:54 +08:00
|
|
|
EXPECT_NE(hash_value(42), hash_value(p));
|
|
|
|
EXPECT_EQ(hash_value(71), hash_value(i));
|
|
|
|
EXPECT_EQ(hash_value(71), hash_value(ci));
|
|
|
|
EXPECT_EQ(hash_value(71), hash_value(vi));
|
|
|
|
EXPECT_EQ(hash_value(71), hash_value(cvi));
|
|
|
|
EXPECT_EQ(hash_value(c), hash_value('x'));
|
|
|
|
EXPECT_EQ(hash_value('4'), hash_value('0' + 4));
|
|
|
|
EXPECT_EQ(hash_value(addr), hash_value(&y));
|
2012-03-04 18:23:11 +08:00
|
|
|
}
|
2012-03-03 17:39:54 +08:00
|
|
|
|
2012-03-04 18:23:11 +08:00
|
|
|
TEST(HashingTest, HashValueStdPair) {
|
2012-03-03 17:39:54 +08:00
|
|
|
EXPECT_EQ(hash_combine(42, 43), hash_value(std::make_pair(42, 43)));
|
|
|
|
EXPECT_NE(hash_combine(43, 42), hash_value(std::make_pair(42, 43)));
|
|
|
|
EXPECT_NE(hash_combine(42, 43), hash_value(std::make_pair(42ull, 43ull)));
|
|
|
|
EXPECT_NE(hash_combine(42, 43), hash_value(std::make_pair(42, 43ull)));
|
|
|
|
EXPECT_NE(hash_combine(42, 43), hash_value(std::make_pair(42ull, 43)));
|
2012-03-02 17:26:36 +08:00
|
|
|
|
|
|
|
// Note that pairs are implicitly flattened to a direct sequence of data and
|
|
|
|
// hashed efficiently as a consequence.
|
2012-03-03 17:39:54 +08:00
|
|
|
EXPECT_EQ(hash_combine(42, 43, 44),
|
|
|
|
hash_value(std::make_pair(42, std::make_pair(43, 44))));
|
|
|
|
EXPECT_EQ(hash_value(std::make_pair(42, std::make_pair(43, 44))),
|
|
|
|
hash_value(std::make_pair(std::make_pair(42, 43), 44)));
|
2012-03-02 18:56:40 +08:00
|
|
|
|
|
|
|
// Ensure that pairs which have padding bytes *inside* them don't get treated
|
|
|
|
// this way.
|
2012-03-03 17:39:54 +08:00
|
|
|
EXPECT_EQ(hash_combine('0', hash_combine(1ull, '2')),
|
|
|
|
hash_value(std::make_pair('0', std::make_pair(1ull, '2'))));
|
2012-03-02 18:56:40 +08:00
|
|
|
|
|
|
|
// Ensure that non-POD pairs don't explode the traits used.
|
|
|
|
NonPOD obj1(1, 2), obj2(3, 4), obj3(5, 6);
|
2012-03-03 17:39:54 +08:00
|
|
|
EXPECT_EQ(hash_combine(obj1, hash_combine(obj2, obj3)),
|
|
|
|
hash_value(std::make_pair(obj1, std::make_pair(obj2, obj3))));
|
2012-02-19 05:00:49 +08:00
|
|
|
}
|
|
|
|
|
2020-07-17 00:48:34 +08:00
|
|
|
TEST(HashingTest, HashValueStdTuple) {
|
|
|
|
EXPECT_EQ(hash_combine(), hash_value(std::make_tuple()));
|
|
|
|
EXPECT_EQ(hash_combine(42), hash_value(std::make_tuple(42)));
|
|
|
|
EXPECT_EQ(hash_combine(42, 'c'), hash_value(std::make_tuple(42, 'c')));
|
|
|
|
|
|
|
|
EXPECT_NE(hash_combine(43, 42), hash_value(std::make_tuple(42, 43)));
|
|
|
|
EXPECT_NE(hash_combine(42, 43), hash_value(std::make_tuple(42ull, 43ull)));
|
|
|
|
EXPECT_NE(hash_combine(42, 43), hash_value(std::make_tuple(42, 43ull)));
|
|
|
|
EXPECT_NE(hash_combine(42, 43), hash_value(std::make_tuple(42ull, 43)));
|
|
|
|
}
|
|
|
|
|
2012-03-04 18:23:15 +08:00
|
|
|
TEST(HashingTest, HashValueStdString) {
|
|
|
|
std::string s = "Hello World!";
|
|
|
|
EXPECT_EQ(hash_combine_range(s.c_str(), s.c_str() + s.size()), hash_value(s));
|
|
|
|
EXPECT_EQ(hash_combine_range(s.c_str(), s.c_str() + s.size() - 1),
|
|
|
|
hash_value(s.substr(0, s.size() - 1)));
|
|
|
|
EXPECT_EQ(hash_combine_range(s.c_str() + 1, s.c_str() + s.size() - 1),
|
|
|
|
hash_value(s.substr(1, s.size() - 2)));
|
|
|
|
|
|
|
|
std::wstring ws = L"Hello Wide World!";
|
|
|
|
EXPECT_EQ(hash_combine_range(ws.c_str(), ws.c_str() + ws.size()),
|
|
|
|
hash_value(ws));
|
|
|
|
EXPECT_EQ(hash_combine_range(ws.c_str(), ws.c_str() + ws.size() - 1),
|
|
|
|
hash_value(ws.substr(0, ws.size() - 1)));
|
|
|
|
EXPECT_EQ(hash_combine_range(ws.c_str() + 1, ws.c_str() + ws.size() - 1),
|
|
|
|
hash_value(ws.substr(1, ws.size() - 2)));
|
|
|
|
}
|
|
|
|
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
template <typename T, size_t N> T *begin(T (&arr)[N]) { return arr; }
|
|
|
|
template <typename T, size_t N> T *end(T (&arr)[N]) { return arr + N; }
|
|
|
|
|
|
|
|
// Provide a dummy, hashable type designed for easy verification: its hash is
|
|
|
|
// the same as its value.
|
|
|
|
struct HashableDummy { size_t value; };
|
|
|
|
hash_code hash_value(HashableDummy dummy) { return dummy.value; }
|
|
|
|
|
|
|
|
TEST(HashingTest, HashCombineRangeBasicTest) {
|
|
|
|
// Leave this uninitialized in the hope that valgrind will catch bad reads.
|
|
|
|
int dummy;
|
|
|
|
hash_code dummy_hash = hash_combine_range(&dummy, &dummy);
|
2012-03-02 08:48:38 +08:00
|
|
|
EXPECT_NE(hash_code(0), dummy_hash);
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
|
|
|
|
const int arr1[] = { 1, 2, 3 };
|
|
|
|
hash_code arr1_hash = hash_combine_range(begin(arr1), end(arr1));
|
|
|
|
EXPECT_NE(dummy_hash, arr1_hash);
|
|
|
|
EXPECT_EQ(arr1_hash, hash_combine_range(begin(arr1), end(arr1)));
|
2012-02-19 05:00:49 +08:00
|
|
|
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
const std::vector<int> vec(begin(arr1), end(arr1));
|
|
|
|
EXPECT_EQ(arr1_hash, hash_combine_range(vec.begin(), vec.end()));
|
|
|
|
|
|
|
|
const std::list<int> list(begin(arr1), end(arr1));
|
|
|
|
EXPECT_EQ(arr1_hash, hash_combine_range(list.begin(), list.end()));
|
|
|
|
|
|
|
|
const std::deque<int> deque(begin(arr1), end(arr1));
|
|
|
|
EXPECT_EQ(arr1_hash, hash_combine_range(deque.begin(), deque.end()));
|
|
|
|
|
|
|
|
const int arr2[] = { 3, 2, 1 };
|
|
|
|
hash_code arr2_hash = hash_combine_range(begin(arr2), end(arr2));
|
|
|
|
EXPECT_NE(dummy_hash, arr2_hash);
|
|
|
|
EXPECT_NE(arr1_hash, arr2_hash);
|
|
|
|
|
|
|
|
const int arr3[] = { 1, 1, 2, 3 };
|
|
|
|
hash_code arr3_hash = hash_combine_range(begin(arr3), end(arr3));
|
|
|
|
EXPECT_NE(dummy_hash, arr3_hash);
|
|
|
|
EXPECT_NE(arr1_hash, arr3_hash);
|
|
|
|
|
|
|
|
const int arr4[] = { 1, 2, 3, 3 };
|
|
|
|
hash_code arr4_hash = hash_combine_range(begin(arr4), end(arr4));
|
|
|
|
EXPECT_NE(dummy_hash, arr4_hash);
|
|
|
|
EXPECT_NE(arr1_hash, arr4_hash);
|
|
|
|
|
|
|
|
const size_t arr5[] = { 1, 2, 3 };
|
|
|
|
const HashableDummy d_arr5[] = { {1}, {2}, {3} };
|
|
|
|
hash_code arr5_hash = hash_combine_range(begin(arr5), end(arr5));
|
|
|
|
hash_code d_arr5_hash = hash_combine_range(begin(d_arr5), end(d_arr5));
|
|
|
|
EXPECT_EQ(arr5_hash, d_arr5_hash);
|
2012-02-19 05:00:49 +08:00
|
|
|
}
|
|
|
|
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
TEST(HashingTest, HashCombineRangeLengthDiff) {
|
|
|
|
// Test that as only the length varies, we compute different hash codes for
|
|
|
|
// sequences.
|
|
|
|
std::map<size_t, size_t> code_to_size;
|
|
|
|
std::vector<char> all_one_c(256, '\xff');
|
|
|
|
for (unsigned Idx = 1, Size = all_one_c.size(); Idx < Size; ++Idx) {
|
|
|
|
hash_code code = hash_combine_range(&all_one_c[0], &all_one_c[0] + Idx);
|
|
|
|
std::map<size_t, size_t>::iterator
|
|
|
|
I = code_to_size.insert(std::make_pair(code, Idx)).first;
|
|
|
|
EXPECT_EQ(Idx, I->second);
|
|
|
|
}
|
|
|
|
code_to_size.clear();
|
|
|
|
std::vector<char> all_zero_c(256, '\0');
|
|
|
|
for (unsigned Idx = 1, Size = all_zero_c.size(); Idx < Size; ++Idx) {
|
|
|
|
hash_code code = hash_combine_range(&all_zero_c[0], &all_zero_c[0] + Idx);
|
|
|
|
std::map<size_t, size_t>::iterator
|
|
|
|
I = code_to_size.insert(std::make_pair(code, Idx)).first;
|
|
|
|
EXPECT_EQ(Idx, I->second);
|
|
|
|
}
|
|
|
|
code_to_size.clear();
|
|
|
|
std::vector<unsigned> all_one_int(512, -1);
|
|
|
|
for (unsigned Idx = 1, Size = all_one_int.size(); Idx < Size; ++Idx) {
|
|
|
|
hash_code code = hash_combine_range(&all_one_int[0], &all_one_int[0] + Idx);
|
|
|
|
std::map<size_t, size_t>::iterator
|
|
|
|
I = code_to_size.insert(std::make_pair(code, Idx)).first;
|
|
|
|
EXPECT_EQ(Idx, I->second);
|
|
|
|
}
|
|
|
|
code_to_size.clear();
|
|
|
|
std::vector<unsigned> all_zero_int(512, 0);
|
|
|
|
for (unsigned Idx = 1, Size = all_zero_int.size(); Idx < Size; ++Idx) {
|
|
|
|
hash_code code = hash_combine_range(&all_zero_int[0], &all_zero_int[0] + Idx);
|
|
|
|
std::map<size_t, size_t>::iterator
|
|
|
|
I = code_to_size.insert(std::make_pair(code, Idx)).first;
|
|
|
|
EXPECT_EQ(Idx, I->second);
|
|
|
|
}
|
2012-02-19 05:00:49 +08:00
|
|
|
}
|
|
|
|
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
TEST(HashingTest, HashCombineRangeGoldenTest) {
|
|
|
|
struct { const char *s; uint64_t hash; } golden_data[] = {
|
2018-09-01 03:24:37 +08:00
|
|
|
#if SIZE_MAX == UINT64_MAX || SIZE_MAX == UINT32_MAX
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
{ "a", 0xaeb6f9d5517c61f8ULL },
|
|
|
|
{ "ab", 0x7ab1edb96be496b4ULL },
|
|
|
|
{ "abc", 0xe38e60bf19c71a3fULL },
|
|
|
|
{ "abcde", 0xd24461a66de97f6eULL },
|
|
|
|
{ "abcdefgh", 0x4ef872ec411dec9dULL },
|
|
|
|
{ "abcdefghijklm", 0xe8a865539f4eadfeULL },
|
|
|
|
{ "abcdefghijklmnopqrstu", 0x261cdf85faaf4e79ULL },
|
|
|
|
{ "abcdefghijklmnopqrstuvwxyzabcdef", 0x43ba70e4198e3b2aULL },
|
|
|
|
{ "abcdefghijklmnopqrstuvwxyzabcdef"
|
|
|
|
"abcdefghijklmnopqrstuvwxyzghijkl"
|
|
|
|
"abcdefghijklmnopqrstuvwxyzmnopqr"
|
|
|
|
"abcdefghijklmnopqrstuvwxyzstuvwx"
|
|
|
|
"abcdefghijklmnopqrstuvwxyzyzabcd", 0xdcd57fb2afdf72beULL },
|
|
|
|
{ "a", 0xaeb6f9d5517c61f8ULL },
|
|
|
|
{ "aa", 0xf2b3b69a9736a1ebULL },
|
|
|
|
{ "aaa", 0xf752eb6f07b1cafeULL },
|
|
|
|
{ "aaaaa", 0x812bd21e1236954cULL },
|
|
|
|
{ "aaaaaaaa", 0xff07a2cff08ac587ULL },
|
|
|
|
{ "aaaaaaaaaaaaa", 0x84ac949d54d704ecULL },
|
|
|
|
{ "aaaaaaaaaaaaaaaaaaaaa", 0xcb2c8fb6be8f5648ULL },
|
|
|
|
{ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0xcc40ab7f164091b6ULL },
|
|
|
|
{ "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
|
|
|
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
|
|
|
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
|
|
|
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
|
|
|
|
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 0xc58e174c1e78ffe9ULL },
|
|
|
|
{ "z", 0x1ba160d7e8f8785cULL },
|
|
|
|
{ "zz", 0x2c5c03172f1285d7ULL },
|
|
|
|
{ "zzz", 0x9d2c4f4b507a2ac3ULL },
|
|
|
|
{ "zzzzz", 0x0f03b9031735693aULL },
|
|
|
|
{ "zzzzzzzz", 0xe674147c8582c08eULL },
|
|
|
|
{ "zzzzzzzzzzzzz", 0x3162d9fa6938db83ULL },
|
|
|
|
{ "zzzzzzzzzzzzzzzzzzzzz", 0x37b9a549e013620cULL },
|
|
|
|
{ "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", 0x8921470aff885016ULL },
|
|
|
|
{ "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
|
|
|
|
"zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
|
|
|
|
"zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
|
|
|
|
"zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"
|
|
|
|
"zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz", 0xf60fdcd9beb08441ULL },
|
|
|
|
{ "a", 0xaeb6f9d5517c61f8ULL },
|
|
|
|
{ "ab", 0x7ab1edb96be496b4ULL },
|
|
|
|
{ "aba", 0x3edb049950884d0aULL },
|
|
|
|
{ "ababa", 0x8f2de9e73a97714bULL },
|
|
|
|
{ "abababab", 0xee14a29ddf0ce54cULL },
|
|
|
|
{ "ababababababa", 0x38b3ddaada2d52b4ULL },
|
|
|
|
{ "ababababababababababa", 0xd3665364219f2b85ULL },
|
2012-03-02 18:01:29 +08:00
|
|
|
{ "abababababababababababababababab", 0xa75cd6afbf1bc972ULL },
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
{ "abababababababababababababababab"
|
|
|
|
"abababababababababababababababab"
|
|
|
|
"abababababababababababababababab"
|
|
|
|
"abababababababababababababababab"
|
|
|
|
"abababababababababababababababab", 0x840192d129f7a22bULL }
|
2012-03-02 07:06:19 +08:00
|
|
|
#else
|
|
|
|
#error This test only supports 64-bit and 32-bit systems.
|
|
|
|
#endif
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
};
|
|
|
|
for (unsigned i = 0; i < sizeof(golden_data)/sizeof(*golden_data); ++i) {
|
|
|
|
StringRef str = golden_data[i].s;
|
|
|
|
hash_code hash = hash_combine_range(str.begin(), str.end());
|
2012-03-02 07:20:45 +08:00
|
|
|
#if 0 // Enable this to generate paste-able text for the above structure.
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
std::string member_str = "\"" + str.str() + "\",";
|
2012-03-02 07:06:19 +08:00
|
|
|
fprintf(stderr, " { %-35s 0x%016llxULL },\n",
|
|
|
|
member_str.c_str(), static_cast<uint64_t>(hash));
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
#endif
|
|
|
|
EXPECT_EQ(static_cast<size_t>(golden_data[i].hash),
|
|
|
|
static_cast<size_t>(hash));
|
|
|
|
}
|
2012-02-19 05:00:49 +08:00
|
|
|
}
|
|
|
|
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
TEST(HashingTest, HashCombineBasicTest) {
|
|
|
|
// Hashing a sequence of homogenous types matches range hashing.
|
|
|
|
const int i1 = 42, i2 = 43, i3 = 123, i4 = 999, i5 = 0, i6 = 79;
|
|
|
|
const int arr1[] = { i1, i2, i3, i4, i5, i6 };
|
|
|
|
EXPECT_EQ(hash_combine_range(arr1, arr1 + 1), hash_combine(i1));
|
|
|
|
EXPECT_EQ(hash_combine_range(arr1, arr1 + 2), hash_combine(i1, i2));
|
|
|
|
EXPECT_EQ(hash_combine_range(arr1, arr1 + 3), hash_combine(i1, i2, i3));
|
|
|
|
EXPECT_EQ(hash_combine_range(arr1, arr1 + 4), hash_combine(i1, i2, i3, i4));
|
|
|
|
EXPECT_EQ(hash_combine_range(arr1, arr1 + 5),
|
|
|
|
hash_combine(i1, i2, i3, i4, i5));
|
|
|
|
EXPECT_EQ(hash_combine_range(arr1, arr1 + 6),
|
|
|
|
hash_combine(i1, i2, i3, i4, i5, i6));
|
|
|
|
|
2012-06-02 18:20:22 +08:00
|
|
|
// Hashing a sequence of heterogeneous types which *happen* to all produce the
|
Rewrite LLVM's generalized support library for hashing to follow the API
of the proposed standard hashing interfaces (N3333), and to use
a modified and tuned version of the CityHash algorithm.
Some of the highlights of this change:
-- Significantly higher quality hashing algorithm with very well
distributed results, and extremely few collisions. Should be close to
a checksum for up to 64-bit keys. Very little clustering or clumping of
hash codes, to better distribute load on probed hash tables.
-- Built-in support for reserved values.
-- Simplified API that composes cleanly with other C++ idioms and APIs.
-- Better scaling performance as keys grow. This is the fastest
algorithm I've found and measured for moderately sized keys (such as
show up in some of the uniquing and folding use cases)
-- Support for enabling per-execution seeds to prevent table ordering
or other artifacts of hashing algorithms to impact the output of
LLVM. The seeding would make each run different and highlight these
problems during bootstrap.
This implementation was tested extensively using the SMHasher test
suite, and pased with flying colors, doing better than the original
CityHash algorithm even.
I've included a unittest, although it is somewhat minimal at the moment.
I've also added (or refactored into the proper location) type traits
necessary to implement this, and converted users of GeneralHash over.
My only immediate concerns with this implementation is the performance
of hashing small keys. I've already started working to improve this, and
will continue to do so. Currently, the only algorithms faster produce
lower quality results, but it is likely there is a better compromise
than the current one.
Many thanks to Jeffrey Yasskin who did most of the work on the N3333
paper, pair-programmed some of this code, and reviewed much of it. Many
thanks also go to Geoff Pike Pike and Jyrki Alakuijala, the original
authors of CityHash on which this is heavily based, and Austin Appleby
who created MurmurHash and the SMHasher test suite.
Also thanks to Nadav, Tobias, Howard, Jay, Nick, Ahmed, and Duncan for
all of the review comments! If there are further comments or concerns,
please let me know and I'll jump on 'em.
llvm-svn: 151822
2012-03-02 02:55:25 +08:00
|
|
|
// same data for hashing produces the same as a range-based hash of the
|
|
|
|
// fundamental values.
|
|
|
|
const size_t s1 = 1024, s2 = 8888, s3 = 9000000;
|
|
|
|
const HashableDummy d1 = { 1024 }, d2 = { 8888 }, d3 = { 9000000 };
|
|
|
|
const size_t arr2[] = { s1, s2, s3 };
|
|
|
|
EXPECT_EQ(hash_combine_range(begin(arr2), end(arr2)),
|
|
|
|
hash_combine(s1, s2, s3));
|
|
|
|
EXPECT_EQ(hash_combine(s1, s2, s3), hash_combine(s1, s2, d3));
|
|
|
|
EXPECT_EQ(hash_combine(s1, s2, s3), hash_combine(s1, d2, s3));
|
|
|
|
EXPECT_EQ(hash_combine(s1, s2, s3), hash_combine(d1, s2, s3));
|
|
|
|
EXPECT_EQ(hash_combine(s1, s2, s3), hash_combine(d1, d2, s3));
|
|
|
|
EXPECT_EQ(hash_combine(s1, s2, s3), hash_combine(d1, d2, d3));
|
|
|
|
|
|
|
|
// Permuting values causes hashes to change.
|
|
|
|
EXPECT_NE(hash_combine(i1, i1, i1), hash_combine(i1, i1, i2));
|
|
|
|
EXPECT_NE(hash_combine(i1, i1, i1), hash_combine(i1, i2, i1));
|
|
|
|
EXPECT_NE(hash_combine(i1, i1, i1), hash_combine(i2, i1, i1));
|
|
|
|
EXPECT_NE(hash_combine(i1, i1, i1), hash_combine(i2, i2, i1));
|
|
|
|
EXPECT_NE(hash_combine(i1, i1, i1), hash_combine(i2, i2, i2));
|
|
|
|
EXPECT_NE(hash_combine(i2, i1, i1), hash_combine(i1, i1, i2));
|
|
|
|
EXPECT_NE(hash_combine(i1, i1, i2), hash_combine(i1, i2, i1));
|
|
|
|
EXPECT_NE(hash_combine(i1, i2, i1), hash_combine(i2, i1, i1));
|
|
|
|
|
|
|
|
// Changing type w/o changing value causes hashes to change.
|
|
|
|
EXPECT_NE(hash_combine(i1, i2, i3), hash_combine((char)i1, i2, i3));
|
|
|
|
EXPECT_NE(hash_combine(i1, i2, i3), hash_combine(i1, (char)i2, i3));
|
|
|
|
EXPECT_NE(hash_combine(i1, i2, i3), hash_combine(i1, i2, (char)i3));
|
|
|
|
|
|
|
|
// This is array of uint64, but it should have the exact same byte pattern as
|
|
|
|
// an array of LargeTestIntegers.
|
|
|
|
const uint64_t bigarr[] = {
|
|
|
|
0xaaaaaaaaababababULL, 0xacacacacbcbcbcbcULL, 0xccddeeffeeddccbbULL,
|
|
|
|
0xdeadbeafdeadbeefULL, 0xfefefefededededeULL, 0xafafafafededededULL,
|
|
|
|
0xffffeeeeddddccccULL, 0xaaaacbcbffffababULL,
|
|
|
|
0xaaaaaaaaababababULL, 0xacacacacbcbcbcbcULL, 0xccddeeffeeddccbbULL,
|
|
|
|
0xdeadbeafdeadbeefULL, 0xfefefefededededeULL, 0xafafafafededededULL,
|
|
|
|
0xffffeeeeddddccccULL, 0xaaaacbcbffffababULL,
|
|
|
|
0xaaaaaaaaababababULL, 0xacacacacbcbcbcbcULL, 0xccddeeffeeddccbbULL,
|
|
|
|
0xdeadbeafdeadbeefULL, 0xfefefefededededeULL, 0xafafafafededededULL,
|
|
|
|
0xffffeeeeddddccccULL, 0xaaaacbcbffffababULL
|
|
|
|
};
|
|
|
|
// Hash a preposterously large integer, both aligned with the buffer and
|
|
|
|
// misaligned.
|
|
|
|
const LargeTestInteger li = { {
|
|
|
|
0xaaaaaaaaababababULL, 0xacacacacbcbcbcbcULL, 0xccddeeffeeddccbbULL,
|
|
|
|
0xdeadbeafdeadbeefULL, 0xfefefefededededeULL, 0xafafafafededededULL,
|
|
|
|
0xffffeeeeddddccccULL, 0xaaaacbcbffffababULL
|
|
|
|
} };
|
|
|
|
// Rotate the storage from 'li'.
|
|
|
|
const LargeTestInteger l2 = { {
|
|
|
|
0xacacacacbcbcbcbcULL, 0xccddeeffeeddccbbULL, 0xdeadbeafdeadbeefULL,
|
|
|
|
0xfefefefededededeULL, 0xafafafafededededULL, 0xffffeeeeddddccccULL,
|
|
|
|
0xaaaacbcbffffababULL, 0xaaaaaaaaababababULL
|
|
|
|
} };
|
|
|
|
const LargeTestInteger l3 = { {
|
|
|
|
0xccddeeffeeddccbbULL, 0xdeadbeafdeadbeefULL, 0xfefefefededededeULL,
|
|
|
|
0xafafafafededededULL, 0xffffeeeeddddccccULL, 0xaaaacbcbffffababULL,
|
|
|
|
0xaaaaaaaaababababULL, 0xacacacacbcbcbcbcULL
|
|
|
|
} };
|
|
|
|
EXPECT_EQ(hash_combine_range(begin(bigarr), end(bigarr)),
|
|
|
|
hash_combine(li, li, li));
|
|
|
|
EXPECT_EQ(hash_combine_range(bigarr, bigarr + 9),
|
|
|
|
hash_combine(bigarr[0], l2));
|
|
|
|
EXPECT_EQ(hash_combine_range(bigarr, bigarr + 10),
|
|
|
|
hash_combine(bigarr[0], bigarr[1], l3));
|
|
|
|
EXPECT_EQ(hash_combine_range(bigarr, bigarr + 17),
|
|
|
|
hash_combine(li, bigarr[0], l2));
|
|
|
|
EXPECT_EQ(hash_combine_range(bigarr, bigarr + 18),
|
|
|
|
hash_combine(li, bigarr[0], bigarr[1], l3));
|
|
|
|
EXPECT_EQ(hash_combine_range(bigarr, bigarr + 18),
|
|
|
|
hash_combine(bigarr[0], l2, bigarr[9], l3));
|
|
|
|
EXPECT_EQ(hash_combine_range(bigarr, bigarr + 20),
|
|
|
|
hash_combine(bigarr[0], l2, bigarr[9], l3, bigarr[18], bigarr[19]));
|
2012-02-19 05:00:49 +08:00
|
|
|
}
|
|
|
|
|
2015-02-10 07:21:05 +08:00
|
|
|
TEST(HashingTest, HashCombineArgs18) {
|
|
|
|
// This tests that we can pass in up to 18 args.
|
|
|
|
#define CHECK_SAME(...) \
|
|
|
|
EXPECT_EQ(hash_combine(__VA_ARGS__), hash_combine(__VA_ARGS__))
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8, 9);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7, 8);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6, 7);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5, 6);
|
|
|
|
CHECK_SAME(1, 2, 3, 4, 5);
|
|
|
|
CHECK_SAME(1, 2, 3, 4);
|
|
|
|
CHECK_SAME(1, 2, 3);
|
|
|
|
CHECK_SAME(1, 2);
|
|
|
|
CHECK_SAME(1);
|
|
|
|
#undef CHECK_SAME
|
|
|
|
}
|
|
|
|
|
2012-02-19 05:00:49 +08:00
|
|
|
}
|