initial commit

This commit is contained in:
ducle.canh 2023-01-05 14:19:18 +08:00
parent 23676dbc0a
commit b8c76c1aab
15791 changed files with 462744 additions and 654826 deletions

View File

@ -1,4 +1,4 @@
I hereby agree to the terms of the CLA available at: https://yandex.ru/legal/cla/?lang=en
I hereby agree to the terms of the CLA available at: [placeholder]
Changelog category (leave one):
- New Feature
@ -21,9 +21,6 @@ Detailed description / Documentation draft:
...
By adding documentation, you'll allow users to try your new feature immediately, not when someone else will have time to document it later. Documentation is necessary for all features that affect user experience in any way. You can add brief documentation draft above, or add documentation right into your patch as Markdown files in [docs](https://github.com/ClickHouse/ClickHouse/tree/master/docs) folder.
If you are doing this for the first time, it's recommended to read the lightweight [Contributing to ClickHouse Documentation](https://github.com/ClickHouse/ClickHouse/tree/master/docs/README.md) guide first.
If you are doing this for the first time, it's recommended to read the lightweight [Contributing to ByConity Documentation](https://github.com/ByConity/ByConity/blob/master/CONTRIBUTING.md) guide first.
Information about CI checks: https://clickhouse.tech/docs/en/development/continuous-integration/

23
.github/workflows/ci.yml vendored Normal file
View File

@ -0,0 +1,23 @@
name: CI
on:
# Triggers the workflow on push or pull request events, but only for the "master" branch
push:
branches: [ "master" ]
pull_request:
branches: [ "master" ]
jobs:
scm_build:
name: Build binary
container: vivekscl/test:latest
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v2
- name: Build binary with build_bin.sh
env:
CUSTOM_CMAKE_BUILD_TYPE: "Release"
run: |
git -C "$GITHUB_WORKSPACE" submodule sync
git -C "$GITHUB_WORKSPACE" submodule update --init --recursive
./build_bin.sh

13
.gitignore vendored
View File

@ -10,8 +10,7 @@
*.logrt
/build
/build_*
/build-*
/cnch_test
/tests/venv
# logs
@ -144,8 +143,18 @@ website/package-lock.json
*.iml
package.json
package-lock.json
# data store
/programs/server/data
/programs/server/metadata
/programs/server/store
/output
.tags
# generating code
/src/Protos/*.pb.cc
/src/Protos/*.pb.h

68
.gitmodules vendored
View File

@ -8,9 +8,6 @@
[submodule "contrib/lz4"]
path = contrib/lz4
url = https://github.com/lz4/lz4.git
[submodule "contrib/librdkafka"]
path = contrib/librdkafka
url = https://github.com/ClickHouse-Extras/librdkafka.git
[submodule "contrib/cctz"]
path = contrib/cctz
url = https://github.com/ClickHouse-Extras/cctz.git
@ -46,9 +43,6 @@
path = contrib/protobuf
url = https://github.com/ClickHouse-Extras/protobuf.git
branch = v3.13.0.1
[submodule "contrib/boost"]
path = contrib/boost
url = https://github.com/ClickHouse-Extras/boost.git
[submodule "contrib/base64"]
path = contrib/base64
url = https://github.com/ClickHouse-Extras/Turbo-Base64.git
@ -59,9 +53,6 @@
[submodule "contrib/thrift"]
path = contrib/thrift
url = https://github.com/apache/thrift.git
[submodule "contrib/libhdfs3"]
path = contrib/libhdfs3
url = https://github.com/ClickHouse-Extras/libhdfs3.git
[submodule "contrib/libxml2"]
path = contrib/libxml2
url = https://github.com/GNOME/libxml2.git
@ -148,6 +139,7 @@
[submodule "contrib/msgpack-c"]
path = contrib/msgpack-c
url = https://github.com/msgpack/msgpack-c
branch = cpp_master
[submodule "contrib/libcpuid"]
path = contrib/libcpuid
url = https://github.com/ClickHouse-Extras/libcpuid.git
@ -157,10 +149,6 @@
[submodule "contrib/AMQP-CPP"]
path = contrib/AMQP-CPP
url = https://github.com/ClickHouse-Extras/AMQP-CPP.git
[submodule "contrib/cassandra"]
path = contrib/cassandra
url = https://github.com/ClickHouse-Extras/cpp-driver.git
branch = clickhouse
[submodule "contrib/libuv"]
path = contrib/libuv
url = https://github.com/ClickHouse-Extras/libuv.git
@ -184,16 +172,12 @@
path = contrib/cyrus-sasl
url = https://github.com/ClickHouse-Extras/cyrus-sasl
branch = cyrus-sasl-2.1
[submodule "contrib/croaring"]
path = contrib/croaring
url = https://github.com/RoaringBitmap/CRoaring
branch = v0.2.66
[submodule "contrib/miniselect"]
path = contrib/miniselect
url = https://github.com/danlark1/miniselect
[submodule "contrib/rocksdb"]
path = contrib/rocksdb
url = https://github.com/ClickHouse-Extras/rocksdb.git
[submodule "contrib/xz"]
path = contrib/xz
url = https://github.com/xz-mirror/xz
@ -221,10 +205,56 @@
url = https://github.com/ClickHouse-Extras/nanodbc.git
[submodule "contrib/datasketches-cpp"]
path = contrib/datasketches-cpp
url = https://github.com/ClickHouse-Extras/datasketches-cpp.git
url = https://github.com/apache/datasketches-cpp.git
[submodule "contrib/yaml-cpp"]
path = contrib/yaml-cpp
url = https://github.com/ClickHouse-Extras/yaml-cpp.git
[submodule "contrib/libpqxx"]
path = contrib/libpqxx
url = https://github.com/ClickHouse-Extras/libpqxx.git
[submodule "contrib/croaring"]
path = contrib/croaring
url = https://github.com/ByConity/CRoaring.git
[submodule "contrib/librdkafka"]
path = contrib/librdkafka
url = https://github.com/ClickHouse/librdkafka.git
[submodule "contrib/cassandra"]
path = contrib/cassandra
url = https://github.com/ByConity/cassandra.git
[submodule "contrib/incubator-brpc"]
path = contrib/incubator-brpc
url = https://github.com/ByConity/incubator-brpc.git
branch = brpc1p3
[submodule "contrib/gflags"]
path = contrib/gflags
url = https://github.com/gflags/gflags.git
[submodule "contrib/libhdfs3-open"]
path = contrib/libhdfs3-open
url = https://github.com/ByConity/libhdfs3-open.git
[submodule "contrib/boost"]
path = contrib/boost
url = https://github.com/ClickHouse-Extras/boost.git
[submodule "contrib/breakpad"]
path = contrib/breakpad
url = https://github.com/ByConity/breakpad.git
[submodule "contrib/ipdb"]
path = contrib/ipdb
url = https://github.com/ByConity/ipdb-c.git
[submodule "contrib/maxminddb"]
path = contrib/maxminddb
url = https://github.com/ByConity/libmaxminddb.git
[submodule "contrib/json-c"]
path = contrib/json-c
url = https://github.com/json-c/json-c.git
[submodule "contrib/numactl"]
path = contrib/numactl
url = https://github.com/ByConity/numactl.git
[submodule "contrib/benchmark"]
path = contrib/benchmark
url = https://github.com/google/benchmark.git
[submodule "contrib/hivemetastore"]
path = contrib/hivemetastore
url = https://github.com/ClickHouse/hive-metastore.git
[submodule "contrib/udns"]
path = contrib/udns
url = https://github.com/ortclib/udns.git

BIN
ByConity-architecture.png Normal file

Binary file not shown (new image, 414 KiB).

File diff suppressed because it is too large

View File

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.3)
cmake_minimum_required(VERSION 3.17)
foreach(policy
CMP0023
@ -333,14 +333,22 @@ endif()
set(COMPILER_FLAGS "${COMPILER_FLAGS}")
set (CMAKE_BUILD_COLOR_MAKEFILE ON)
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMPILER_FLAGS} ${PLATFORM_EXTRA_CXX_FLAG} ${COMMON_WARNING_FLAGS} ${CXX_WARNING_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer ${COMPILER_FLAGS} ${PLATFORM_EXTRA_CXX_FLAG} ${COMMON_WARNING_FLAGS} ${CXX_WARNING_FLAGS}")
set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g3 -ggdb3 -fno-inline ${CMAKE_CXX_FLAGS_ADD}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMPILER_FLAGS} ${COMMON_WARNING_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-omit-frame-pointer ${COMPILER_FLAGS} ${COMMON_WARNING_FLAGS} ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 ${CMAKE_C_FLAGS_ADD}")
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 -g3 -ggdb3 -fno-inline ${CMAKE_C_FLAGS_ADD}")
# Ignored if `lldb` is used
option(NO_LIMIT_DEBUG_INFO "Add no-limit-debug-info to compile flags.")
if (NO_LIMIT_DEBUG_INFO)
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fno-limit-debug-info")
set (CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -fno-limit-debug-info")
endif()
if (COMPILER_CLANG)
if (OS_DARWIN)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")
@ -440,6 +448,12 @@ elseif (OS_FREEBSD)
include(cmake/freebsd/default_libs.cmake)
endif ()
# Determine whether to use community map implementation
option (USE_COMMUNITY_MAP "Use clickhouse community map implementation" OFF)
if (USE_COMMUNITY_MAP)
add_definitions(-DUSE_COMMUNITY_MAP)
endif (USE_COMMUNITY_MAP)
######################################
### Add targets below this comment ###
######################################
@ -499,6 +513,7 @@ include (GNUInstallDirs)
include (cmake/contrib_finder.cmake)
find_contrib_lib(double-conversion) # Must be before parquet
include (cmake/lib_name.cmake)
include (cmake/find/ssl.cmake)
include (cmake/find/ldap.cmake) # after ssl
include (cmake/find/icu.cmake)
@ -540,6 +555,8 @@ include (cmake/find/rocksdb.cmake)
include (cmake/find/libpqxx.cmake)
include (cmake/find/nuraft.cmake)
include (cmake/find/yaml-cpp.cmake)
include (cmake/find/json-c.cmake)
include (cmake/find/libnuma.cmake)
if(NOT USE_INTERNAL_PARQUET_LIBRARY)
set (ENABLE_ORC OFF CACHE INTERNAL "")
@ -552,11 +569,18 @@ include (cmake/find/cassandra.cmake)
include (cmake/find/sentry.cmake)
include (cmake/find/stats.cmake)
include (cmake/find/datasketches.cmake)
include (cmake/find_gflags.cmake)
include (cmake/find/brpc.cmake)
include (cmake/find/breakpad.cmake)
include (cmake/find/benchmark.cmake)
include (cmake/find/hivemetastore.cmake)
set (USE_INTERNAL_CITYHASH_LIBRARY ON CACHE INTERNAL "")
find_contrib_lib(cityhash)
find_contrib_lib(farmhash)
find_contrib_lib(minizip)
find_contrib_lib(udns)
if (ENABLE_TESTS)
include (cmake/find/gtest.cmake)

View File

@ -1,21 +1,12 @@
# Contributing to ClickHouse
# Contributing to ByConity
ClickHouse is an open project, and you can contribute to it in many ways. You can help with ideas, code, or documentation. We appreciate any efforts that help us to make the project better.
ByConity is an open project, and you can contribute to it in many ways. You can help with ideas, code, or documentation. We appreciate any efforts that help us to make the project better.
Thank you.
## Technical Info
We have a [developer's guide](https://clickhouse.yandex/docs/en/development/developer_instruction/) for writing code for ClickHouse. Besides this guide, you can find [Overview of ClickHouse Architecture](https://clickhouse.yandex/docs/en/development/architecture/) and instructions on how to build ClickHouse in different environments.
If you want to contribute to documentation, read the [Contributing to ClickHouse Documentation](docs/README.md) guide.
## Legal Info
In order for us (YANDEX LLC) to accept patches and other contributions from you, you may adopt our Yandex Contributor License Agreement (the "**CLA**"). The current version of the CLA you may find here:
1) https://yandex.ru/legal/cla/?lang=en (in English) and
2) https://yandex.ru/legal/cla/?lang=ru (in Russian).
In order for us to accept patches and other contributions from you, you may adopt our Contributor License Agreement (the "CLA").
By adopting the CLA, you state the following:
* You obviously wish and are willingly licensing your contributions to us for our open source projects under the terms of the CLA,
@ -29,11 +20,10 @@ If you agree with these principles, please read and adopt our CLA. By providing
If you have already adopted the terms and conditions of the CLA, you are able to provide your contributions. When you submit your pull request, please add the following information to it:
```
I hereby agree to the terms of the CLA available at: [link].
I hereby agree to the terms of the CLA available at:
```
Replace the bracketed text as follows:
* [link] is the link at the current version of the CLA (you may add here a link https://yandex.ru/legal/cla/?lang=en (in English) or a link https://yandex.ru/legal/cla/?lang=ru (in Russian).
It is enough to provide us such notification once.

130
README.md
View File

@ -1,15 +1,121 @@
[![ClickHouse — open source distributed column-oriented DBMS](https://github.com/ClickHouse/ClickHouse/raw/master/website/images/logo-400x240.png)](https://clickhouse.tech)
# ByConity
ClickHouse® is an open-source column-oriented database management system that allows generating analytical data reports in real time.
<p align="center">
<img src="ByConity-architecture.png" alt="ByConity-architecture" width="800"/>
</p>
## Useful Links
ByConity is a data warehouse designed for modern cloud architecture. It adopts a cloud-native architecture design to meet the requirements of data warehouse users for flexible scaling, separation of reads and writes, resource isolation, and strong data consistency. At the same time, it provides excellent query and write performance.
* [Official website](https://clickhouse.tech/) has quick high-level overview of ClickHouse on main page.
* [Tutorial](https://clickhouse.tech/docs/en/getting_started/tutorial/) shows how to set up and query small ClickHouse cluster.
* [Documentation](https://clickhouse.tech/docs/en/) provides more in-depth information.
* [YouTube channel](https://www.youtube.com/c/ClickHouseDB) has a lot of content about ClickHouse in video format.
* [Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-rxm3rdrk-lIUmhLC3V8WTaL0TGxsOmg) and [Telegram](https://telegram.me/clickhouse_en) allow to chat with ClickHouse users in real-time.
* [Blog](https://clickhouse.yandex/blog/en/) contains various ClickHouse-related articles, as well as announcements and reports about events.
* [Code Browser](https://clickhouse.tech/codebrowser/html_report/ClickHouse/index.html) with syntax highlight and navigation.
* [Contacts](https://clickhouse.tech/#contacts) can help to get your questions answered if there are any.
* You can also [fill this form](https://clickhouse.tech/#meet) to meet Yandex ClickHouse team in person.
ByConity uses many mature OLAP technologies, such as a column-oriented storage engine, MPP execution, intelligent query optimization, vectorized execution, codegen, indexing, and data compression; it also introduces targeted innovations for cloud scenarios and the storage-computing separation architecture.
ByConity is built on top of [ClickHouse](https://github.com/ClickHouse/ClickHouse). We appreciate the excellent work of the ClickHouse team.
## Try ByConity
You can quickly bring up a ByConity playground by following this simple [guide](https://github.com/ByConity/byconity-docker).
A minimal ByConity cluster includes:
- A [FoundationDB](https://www.foundationdb.org/) database cluster to store metadata.
- An [HDFS](https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html) cluster to store data.
- A ByConity server to receive requests from clients.
- A ByConity read worker to execute read requests forwarded from the server.
- A ByConity write worker to execute write requests forwarded from the server.
- A ByConity TSO server to provide timestamps.
- A ByConity daemon manager to manage background jobs that run in the server.
## Build ByConity
The easiest way to build ByConity is in [Docker](https://github.com/ByConity/ByConity/tree/master/docker/builder).
It can also be built on the following operating systems:
- Linux
### 1. Prepare Prerequisites
The following packages are required:
- Git
- CMake 3.17 or newer
- Ninja
- C++ compiler: clang-11 or clang-12
- Linker: lld
```
sudo apt-get update
sudo apt-get install git cmake ccache python3 ninja-build libssl-dev libsnappy-dev apt-transport-https
# install llvm 12
sudo apt install lsb-release wget software-properties-common gnupg # pre-requisites of llvm.sh
wget https://apt.llvm.org/llvm.sh
chmod +x llvm.sh
sudo ./llvm.sh 12
```
### 2. Checkout Source Code
```
git clone --recursive https://github.com/ByConity/ByConity.git byconity
```
### 3. Build
```
cd byconity
mkdir build && cd build
export CC=clang-12
export CXX=clang++-12
cmake ..
ninja
```
Then you can find the binaries in the programs folder:
```
clickhouse-client # byconity client
clickhouse-server # byconity server
clickhouse-worker # byconity worker
tso_server # byconity tso
daemon_manager # byconity daemon manager
resource_manager # byconity resource manager
```
## Run ByConity Locally
Assuming you have [FoundationDB](https://apple.github.io/foundationdb/local-dev.html) and [HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/ClusterSetup.html) set up and running locally:
1. Modify the template config
2. Run the local deployment script to run all the components
### Modify the template config
The config templates can be found in deploy/template. You should replace the following in `byconity-server.xml` and `byconity-worker.xml`:
1. `Path_To_FDB` with the path to your FoundationDB `fdb.cluster` file
2. `HOST:PORT` with the host and port of the name node in your HDFS cluster
```xml
<catalog_service>
<type>fdb</type>
<fdb>
<cluster_file>/Path_To_FDB/fdb.cluster</cluster_file>
</fdb>
</catalog_service>
...
<tso_service>
<port>49963</port>
<type>fdb</type>
<fdb>
<cluster_file>/Path_To_FDB/fdb.cluster</cluster_file>
</fdb>
<tso_window_ms>3000</tso_window_ms>
<tso_max_retry_count>3</tso_max_retry_count>
</tso_service>
...
<hdfs_nnproxy>hdfs://HOST:PORT</hdfs_nnproxy>
```
### Run the local deployment script
1. Make sure you have `python3.9` and `tmux` installed
2. Install missing libraries if any. For example:
1. `pip3.9 install psutil`
3. Run tmux in another terminal
4. Run the deploy script in a separate terminal. `template_paths` and `program_dir` args are compulsory
1. `cd ByConity/deploy`
2. `python3.9 deploy.py --template_paths template/byconity-server.xml template/byconity-worker.xml --program_dir /home/ByConity/build/programs`
3. There are other arguments for the script. For example, you can run 2 servers with argument `-s 2`

View File

@ -1,28 +1,3 @@
# Security Policy
## Supported Versions
The following versions of ClickHouse server are
currently being supported with security updates:
| Version | Supported |
| ------- | ------------------ |
| 1.x | :x: |
| 18.x | :x: |
| 19.x | :x: |
| 20.1 | :x: |
| 20.3 | :white_check_mark: |
| 20.4 | :x: |
| 20.5 | :x: |
| 20.6 | :x: |
| 20.7 | :x: |
| 20.8 | :white_check_mark: |
| 20.9 | :x: |
| 20.10 | :x: |
| 20.11 | :white_check_mark: |
| 20.12 | :white_check_mark: |
| 21.1 | :white_check_mark: |
## Reporting a Vulnerability
To report a potential vulnerability in ClickHouse please send the details about it to [clickhouse-feedback@yandex-team.com](mailto:clickhouse-feedback@yandex-team.com).
Stay tuned.

View File

@ -9,6 +9,7 @@ add_subdirectory (pcg-random)
add_subdirectory (widechar_width)
add_subdirectory (readpassphrase)
add_subdirectory (bridge)
add_subdirectory (libbiginteger)
if (USE_MYSQL)
add_subdirectory (mysqlxx)

View File

@ -128,7 +128,15 @@ void IBridge::defineOptions(Poco::Util::OptionSet & options)
void IBridge::initialize(Application & self)
{
BaseDaemon::closeFDs();
/// BaseDaemon will close inheritable file descriptors from the parent process to avoid
/// security vulnerabilities and file resource reuse issues such as TCP port reuse.
/// But closing inheritable fds here may be too late, since global variables initialized
/// before the main entry may have opened some fds already, which leads to implicit problems
/// such as closing an fd that was already closed by BaseDaemon before.
/// For example, brpc will create bvars as global variables and will open some files under /proc
/// before closeFDs is called.
/// For our situation, just ignoring inheritable fds is OK.
// BaseDaemon::closeFDs();
is_help = config().has("help");
if (is_help)

View File

@ -60,6 +60,7 @@ DateLUTImpl::DateLUTImpl(const std::string & time_zone_)
offset_at_start_of_epoch = cctz_time_zone.lookup(cctz_time_zone.lookup(epoch).pre).offset;
offset_at_start_of_lut = cctz_time_zone.lookup(cctz_time_zone.lookup(lut_start).pre).offset;
offset_is_whole_number_of_hours_during_epoch = true;
offset_is_whole_number_of_minutes_during_epoch = true;
cctz::civil_day date = lut_start;
@ -108,6 +109,9 @@ DateLUTImpl::DateLUTImpl(const std::string & time_zone_)
if (offset_is_whole_number_of_hours_during_epoch && start_of_day > 0 && start_of_day % 3600)
offset_is_whole_number_of_hours_during_epoch = false;
if (offset_is_whole_number_of_minutes_during_epoch && start_of_day > 0 && start_of_day % 60)
offset_is_whole_number_of_minutes_during_epoch = false;
/// If UTC offset was changed this day.
/// Change in time zone without transition is possible, e.g. Moscow 1991 Sun, 31 Mar, 02:00 MSK to EEST
cctz::time_zone::civil_transition transition{};
@ -170,6 +174,20 @@ DateLUTImpl::DateLUTImpl(const std::string & time_zone_)
{
years_months_lut[year_months_lut_index] = first_day_of_last_month;
}
/// Fill saturated LUT.
{
ssize_t day = DATE_LUT_SIZE - 1;
for (; day >= 0; --day)
{
if (lut[day].date >= 0)
lut_saturated[day] = lut[day];
else
break;
}
for (; day >= 0; --day)
lut_saturated[day] = lut_saturated[day + 1];
}
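/// (Sketch of the loop above.) Scanning backwards, entries with a non-negative date are
/// copied verbatim; once the first pre-1970 entry is hit, every remaining day inherits
/// its successor's value, so all dates before the epoch saturate to 1970-01-01.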
}

View File

@ -10,18 +10,23 @@
#include <type_traits>
#define DATE_LUT_MIN_YEAR 1925 /// 1925 since the vast majority of timezones changed to 15-minute aligned offsets somewhere in 1924 or earlier.
#define DATE_LUT_MAX_YEAR 2283 /// Last supported year (complete)
#define DATE_LUT_MIN_YEAR 1900 /// 1900, since the majority of financial organizations consider 1900 the initial year.
#define DATE_LUT_MAX_YEAR 2299 /// Last supported year (complete)
#define DATE_LUT_YEARS (1 + DATE_LUT_MAX_YEAR - DATE_LUT_MIN_YEAR) /// Number of years in lookup table
#define DATE_LUT_SIZE 0x20000
#define DATE_LUT_SIZE 0x23AB1
#define DATE_LUT_MAX (0xFFFFFFFFU - 86400)
#define DATE_LUT_MAX_DAY_NUM 0xFFFF
#define DAYNUM_OFFSET_EPOCH 25567
/// Max int value of Date32, DATE LUT cache size minus daynum_offset_epoch
#define DATE_LUT_MAX_EXTEND_DAY_NUM (DATE_LUT_SIZE - DAYNUM_OFFSET_EPOCH)
/// A constant to add to time_t so every supported time point becomes non-negative and still has the same remainder of division by 3600.
/// If we treat "remainder of division" operation in the sense of modular arithmetic (not like in C++).
#define DATE_LUT_ADD ((1970 - DATE_LUT_MIN_YEAR) * 366 * 86400)
#define DATE_LUT_ADD ((1970 - DATE_LUT_MIN_YEAR) * 366L * 86400)
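/// (Sketch.) Sanity check of the constants above: the LUT now spans the 400 Gregorian
/// years 1900..2299, and a 400-year cycle has 400 * 365 + 97 = 146097 = 0x23AB1 days,
/// matching DATE_LUT_SIZE. Likewise, DAYNUM_OFFSET_EPOCH = 70 * 365 + 17 = 25567
/// (there are 17 leap years between 1900 and 1969; 1900 itself is not a leap year).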
#if defined(__PPC__)
@ -59,63 +64,81 @@ private:
// has to be a separate type to support overloading
// TODO: make sure that any arithmetic on LUTIndex actually results in valid LUTIndex.
STRONG_TYPEDEF(UInt32, LUTIndex)
// Same as above but select different function overloads for zero saturation.
STRONG_TYPEDEF(UInt32, LUTIndexWithSaturation)
static inline LUTIndex normalizeLUTIndex(UInt32 index)
{
if (index >= DATE_LUT_SIZE)
return LUTIndex(DATE_LUT_SIZE - 1);
return LUTIndex{index};
}
static inline LUTIndex normalizeLUTIndex(Int64 index)
{
if (unlikely(index < 0))
return LUTIndex(0);
if (index >= DATE_LUT_SIZE)
return LUTIndex(DATE_LUT_SIZE - 1);
return LUTIndex{index};
}
template <typename T>
friend inline LUTIndex operator+(const LUTIndex & index, const T v)
{
return LUTIndex{(index.toUnderType() + UInt32(v)) & date_lut_mask};
return normalizeLUTIndex(index.toUnderType() + UInt32(v));
}
template <typename T>
friend inline LUTIndex operator+(const T v, const LUTIndex & index)
{
return LUTIndex{(v + index.toUnderType()) & date_lut_mask};
return normalizeLUTIndex(static_cast<Int64>(v + index.toUnderType()));
}
friend inline LUTIndex operator+(const LUTIndex & index, const LUTIndex & v)
{
return LUTIndex{(index.toUnderType() + v.toUnderType()) & date_lut_mask};
return normalizeLUTIndex(static_cast<UInt32>(index.toUnderType() + v.toUnderType()));
}
template <typename T>
friend inline LUTIndex operator-(const LUTIndex & index, const T v)
{
return LUTIndex{(index.toUnderType() - UInt32(v)) & date_lut_mask};
return normalizeLUTIndex(static_cast<Int64>(index.toUnderType() - UInt32(v)));
}
template <typename T>
friend inline LUTIndex operator-(const T v, const LUTIndex & index)
{
return LUTIndex{(v - index.toUnderType()) & date_lut_mask};
return normalizeLUTIndex(static_cast<Int64>(v - index.toUnderType()));
}
friend inline LUTIndex operator-(const LUTIndex & index, const LUTIndex & v)
{
return LUTIndex{(index.toUnderType() - v.toUnderType()) & date_lut_mask};
return normalizeLUTIndex(static_cast<Int64>(index.toUnderType() - v.toUnderType()));
}
template <typename T>
friend inline LUTIndex operator*(const LUTIndex & index, const T v)
{
return LUTIndex{(index.toUnderType() * UInt32(v)) & date_lut_mask};
return normalizeLUTIndex(index.toUnderType() * UInt32(v));
}
template <typename T>
friend inline LUTIndex operator*(const T v, const LUTIndex & index)
{
return LUTIndex{(v * index.toUnderType()) & date_lut_mask};
return normalizeLUTIndex(v * index.toUnderType());
}
template <typename T>
friend inline LUTIndex operator/(const LUTIndex & index, const T v)
{
return LUTIndex{(index.toUnderType() / UInt32(v)) & date_lut_mask};
return normalizeLUTIndex(index.toUnderType() / UInt32(v));
}
template <typename T>
friend inline LUTIndex operator/(const T v, const LUTIndex & index)
{
return LUTIndex{(UInt32(v) / index.toUnderType()) & date_lut_mask};
return normalizeLUTIndex(UInt32(v) / index.toUnderType());
}
public:
@ -164,14 +187,9 @@ public:
static_assert(sizeof(Values) == 16);
private:
/// Mask is all-ones to allow efficient protection against overflow.
static constexpr UInt32 date_lut_mask = 0x1ffff;
static_assert(date_lut_mask == DATE_LUT_SIZE - 1);
/// Offset to epoch in days (ExtendedDayNum) of the first day in LUT.
/// "epoch" is the Unix Epoch (starts at unix timestamp zero)
static constexpr UInt32 daynum_offset_epoch = 16436;
static constexpr UInt32 daynum_offset_epoch = 25567;
static_assert(daynum_offset_epoch == (1970 - DATE_LUT_MIN_YEAR) * 365 + (1970 - DATE_LUT_MIN_YEAR / 4 * 4) / 4);
/// Lookup table is indexed by LUTIndex.
@ -180,6 +198,9 @@ private:
/// In comparison to std::vector, plain array is cheaper by one indirection.
Values lut[DATE_LUT_SIZE + 1];
/// Same as above but with dates < 1970-01-01 saturated to 1970-01-01.
Values lut_saturated[DATE_LUT_SIZE + 1];
/// Year number after DATE_LUT_MIN_YEAR -> LUTIndex in lut for start of year.
LUTIndex years_lut[DATE_LUT_YEARS];
@ -191,6 +212,7 @@ private:
/// UTC offset at the beginning of the first supported year.
Time offset_at_start_of_lut;
bool offset_is_whole_number_of_hours_during_epoch;
bool offset_is_whole_number_of_minutes_during_epoch;
/// Time zone name.
std::string time_zone;
@ -224,12 +246,12 @@ private:
inline LUTIndex toLUTIndex(DayNum d) const
{
return LUTIndex{(d + daynum_offset_epoch) & date_lut_mask};
return normalizeLUTIndex(d + daynum_offset_epoch);
}
inline LUTIndex toLUTIndex(ExtendedDayNum d) const
{
return LUTIndex{static_cast<UInt32>(d + daynum_offset_epoch) & date_lut_mask};
return normalizeLUTIndex(static_cast<UInt32>(d + daynum_offset_epoch));
}
inline LUTIndex toLUTIndex(Time t) const
@ -248,19 +270,32 @@ private:
return lut[toLUTIndex(v)];
}
template <typename T, typename Divisor>
static inline T roundDown(T x, Divisor divisor)
template <typename DateOrTime, typename Divisor>
inline DateOrTime roundDown(DateOrTime x, Divisor divisor) const
{
static_assert(std::is_integral_v<T> && std::is_integral_v<Divisor>);
static_assert(std::is_integral_v<DateOrTime> && std::is_integral_v<Divisor>);
assert(divisor > 0);
if (likely(x >= 0))
return x / divisor * divisor;
if (likely(offset_is_whole_number_of_hours_during_epoch))
{
if (likely(x >= 0))
return x / divisor * divisor;
/// Integer division for negative numbers rounds them towards zero (up).
/// We will shift the number so it will be rounded towards -inf (down).
return (x + 1 - divisor) / divisor * divisor;
}
return (x + 1 - divisor) / divisor * divisor;
Time date = find(x).date;
Time res = date + (x - date) / divisor * divisor;
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
{
if (unlikely(res < 0))
return 0;
return res;
}
else
return res;
}
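/// (Sketch.) For whole-hour zones, negative times round toward -inf instead of toward
/// zero: roundDown(-1, 60) == -60, whereas plain integer division would yield 0. For
/// other zones, rounding is performed relative to the start of the day that contains x.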
public:
@ -270,17 +305,39 @@ public:
auto getOffsetAtStartOfEpoch() const { return offset_at_start_of_epoch; }
auto getTimeOffsetAtStartOfLUT() const { return offset_at_start_of_lut; }
static auto getDayNumOffsetEpoch() { return daynum_offset_epoch; }
/// All functions below are thread-safe; arguments are not checked.
inline ExtendedDayNum toDayNum(ExtendedDayNum d) const
static ExtendedDayNum toDayNum(ExtendedDayNum d)
{
return d;
}
template <typename DateOrTime>
inline ExtendedDayNum toDayNum(DateOrTime v) const
static UInt32 saturateMinus(UInt32 x, UInt32 y)
{
return ExtendedDayNum{static_cast<ExtendedDayNum::UnderlyingType>(toLUTIndex(v).toUnderType() - daynum_offset_epoch)};
UInt32 res = x - y;
res &= -Int32(res <= x);
return res;
}
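/// (Sketch of the branchless saturation above.) When x >= y, (res <= x) is true, and
/// -Int32(true) is an all-ones mask that keeps res; on underflow the mask is zero, so
/// the result saturates to 0. E.g. saturateMinus(5, 3) == 2, saturateMinus(3, 5) == 0.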
static ExtendedDayNum toDayNum(LUTIndex d)
{
return ExtendedDayNum{static_cast<ExtendedDayNum::UnderlyingType>(d.toUnderType() - daynum_offset_epoch)};
}
static DayNum toDayNum(LUTIndexWithSaturation d)
{
return DayNum{static_cast<DayNum::UnderlyingType>(saturateMinus(d.toUnderType(), daynum_offset_epoch))};
}
template <typename DateOrTime>
inline auto toDayNum(DateOrTime v) const
{
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return DayNum{static_cast<DayNum::UnderlyingType>(saturateMinus(toLUTIndex(v).toUnderType(), daynum_offset_epoch))};
else
return ExtendedDayNum{static_cast<ExtendedDayNum::UnderlyingType>(toLUTIndex(v).toUnderType() - daynum_offset_epoch)};
}
/// Round down to start of monday.
@ -288,14 +345,20 @@ public:
inline Time toFirstDayOfWeek(DateOrTime v) const
{
const LUTIndex i = toLUTIndex(v);
return lut[i - (lut[i].day_of_week - 1)].date;
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return lut_saturated[i - (lut[i].day_of_week - 1)].date;
else
return lut[i - (lut[i].day_of_week - 1)].date;
}
template <typename DateOrTime>
inline ExtendedDayNum toFirstDayNumOfWeek(DateOrTime v) const
inline auto toFirstDayNumOfWeek(DateOrTime v) const
{
const LUTIndex i = toLUTIndex(v);
return toDayNum(i - (lut[i].day_of_week - 1));
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return toDayNum(LUTIndexWithSaturation(i - (lut[i].day_of_week - 1)));
else
return toDayNum(LUTIndex(i - (lut[i].day_of_week - 1)));
}
/// Round down to start of month.
@ -303,21 +366,57 @@ public:
inline Time toFirstDayOfMonth(DateOrTime v) const
{
const LUTIndex i = toLUTIndex(v);
return lut[i - (lut[i].day_of_month - 1)].date;
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return lut_saturated[i - (lut[i].day_of_month - 1)].date;
else
return lut[i - (lut[i].day_of_month - 1)].date;
}
template <typename DateOrTime>
inline ExtendedDayNum toFirstDayNumOfMonth(DateOrTime v) const
inline auto toFirstDayNumOfMonth(DateOrTime v) const
{
const LUTIndex i = toLUTIndex(v);
return toDayNum(i - (lut[i].day_of_month - 1));
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return toDayNum(LUTIndexWithSaturation(i - (lut[i].day_of_month - 1)));
else
return toDayNum(LUTIndex(i - (lut[i].day_of_month - 1)));
}
/// Round down to start of bi-month
template <typename DateOrTime>
inline auto toFirstDayNumOfBiMonth(DateOrTime v) const
{
const LUTIndex i = toLUTIndex(v);
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
{
if (lut[i].month & 1)
return toDayNum(LUTIndexWithSaturation(i - (lut[i].day_of_month - 1)));
/// January, March, May, and July have 31 days
else if (lut[i].month <= 8)
return toDayNum(LUTIndexWithSaturation(i - lut[i].day_of_month - 30));
else
return toDayNum(LUTIndexWithSaturation(i - lut[i].day_of_month - 29));
}
else
{
if (lut[i].month & 1)
return toDayNum(LUTIndex(i - (lut[i].day_of_month - 1)));
/// January, March, May, and July have 31 days
else if (lut[i].month <= 8)
return toDayNum(LUTIndex(i - lut[i].day_of_month - 30));
else
return toDayNum(LUTIndex(i - lut[i].day_of_month - 29));
}
}
/// Round down to start of quarter.
template <typename DateOrTime>
inline ExtendedDayNum toFirstDayNumOfQuarter(DateOrTime v) const
inline auto toFirstDayNumOfQuarter(DateOrTime v) const
{
return toDayNum(toFirstDayOfQuarterIndex(v));
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return toDayNum(LUTIndexWithSaturation(toFirstDayOfQuarterIndex(v)));
else
return toDayNum(LUTIndex(toFirstDayOfQuarterIndex(v)));
}
template <typename DateOrTime>
@ -355,9 +454,12 @@ public:
}
template <typename DateOrTime>
inline ExtendedDayNum toFirstDayNumOfYear(DateOrTime v) const
inline auto toFirstDayNumOfYear(DateOrTime v) const
{
return toDayNum(toFirstDayNumOfYearIndex(v));
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return toDayNum(LUTIndexWithSaturation(toFirstDayNumOfYearIndex(v)));
else
return toDayNum(LUTIndex(toFirstDayNumOfYearIndex(v)));
}
inline Time toFirstDayOfNextMonth(Time t) const
@ -455,10 +557,21 @@ public:
inline unsigned toSecond(Time t) const
{
auto res = t % 60;
if (likely(res >= 0))
return res;
return res + 60;
if (likely(offset_is_whole_number_of_minutes_during_epoch))
{
Time res = t % 60;
if (likely(res >= 0))
return res;
return res + 60;
}
LUTIndex index = findIndex(t);
Time time = t - lut[index].date;
if (time >= lut[index].time_at_offset_change())
time += lut[index].amount_of_offset_change();
return time % 60;
}
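/// (Sketch; example zone is an assumption.) The slow path above only matters for zones
/// whose historical UTC offset is not a whole number of minutes, e.g. Africa/Monrovia
/// before 1972 (offset -00:44:30).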
inline unsigned toMinute(Time t) const
@ -479,29 +592,16 @@ public:
}
/// NOTE: Assuming timezone offset is a multiple of 15 minutes.
inline Time toStartOfMinute(Time t) const { return roundDown(t, 60); }
inline Time toStartOfFiveMinute(Time t) const { return roundDown(t, 300); }
inline Time toStartOfFifteenMinutes(Time t) const { return roundDown(t, 900); }
inline Time toStartOfTenMinutes(Time t) const
{
if (t >= 0 && offset_is_whole_number_of_hours_during_epoch)
return t / 600 * 600;
/// More complex logic is for Nepal - it has offset 05:45. Australia/Eucla is also unfortunate.
Time date = find(t).date;
return date + (t - date) / 600 * 600;
}
/// NOTE: Assuming timezone transitions are multiple of hours. Lord Howe Island in Australia is a notable exception.
inline Time toStartOfHour(Time t) const
{
if (t >= 0 && offset_is_whole_number_of_hours_during_epoch)
return t / 3600 * 3600;
Time date = find(t).date;
return date + (t - date) / 3600 * 3600;
}
template <typename DateOrTime>
DateOrTime toStartOfMinute(DateOrTime t) const { return toStartOfMinuteInterval(t, 1); }
template <typename DateOrTime>
DateOrTime toStartOfFiveMinutes(DateOrTime t) const { return toStartOfMinuteInterval(t, 5); }
template <typename DateOrTime>
DateOrTime toStartOfFifteenMinutes(DateOrTime t) const { return toStartOfMinuteInterval(t, 15); }
template <typename DateOrTime>
DateOrTime toStartOfTenMinutes(DateOrTime t) const { return toStartOfMinuteInterval(t, 10); }
template <typename DateOrTime>
DateOrTime toStartOfHour(DateOrTime t) const { return roundDown(t, 3600); }
/** Number of calendar day since the beginning of UNIX epoch (1970-01-01 is zero)
* We use just two bytes for it. It covers the range up to 2105 and slightly more.
@ -515,7 +615,13 @@ public:
inline Time fromDayNum(ExtendedDayNum d) const { return lut[toLUTIndex(d)].date; }
template <typename DateOrTime>
inline Time toDate(DateOrTime v) const { return lut[toLUTIndex(v)].date; }
inline Time toDate(DateOrTime v) const
{
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return lut_saturated[toLUTIndex(v)].date;
else
return lut[toLUTIndex(v)].date;
}
template <typename DateOrTime>
inline unsigned toMonth(DateOrTime v) const { return lut[toLUTIndex(v)].month; }
@ -578,9 +684,12 @@ public:
}
template <typename DateOrTime>
inline ExtendedDayNum toFirstDayNumOfISOYear(DateOrTime v) const
inline auto toFirstDayNumOfISOYear(DateOrTime v) const
{
return toDayNum(toFirstDayNumOfISOYearIndex(v));
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return toDayNum(LUTIndexWithSaturation(toFirstDayNumOfISOYearIndex(v)));
else
return toDayNum(LUTIndex(toFirstDayNumOfISOYearIndex(v)));
}
inline Time toFirstDayOfISOYear(Time t) const
@ -593,7 +702,7 @@ public:
template <typename DateOrTime>
inline unsigned toISOWeek(DateOrTime v) const
{
return 1 + (toFirstDayNumOfWeek(v) - toFirstDayNumOfISOYear(v)) / 7;
return 1 + (toFirstDayNumOfWeek(v) - toDayNum(toFirstDayNumOfISOYearIndex(v))) / 7;
}
/*
@ -659,7 +768,7 @@ public:
{
if (!week_year_mode && ((first_weekday_mode && weekday != 0) || (!first_weekday_mode && weekday >= 4)))
return yw;
week_year_mode = 1;
week_year_mode = true;
(yw.first)--;
first_daynr -= (days = calc_days_in_year(yw.first));
weekday = (weekday + 53 * 7 - days) % 7;
@ -721,7 +830,7 @@ public:
/// Get first day of week with week_mode, return Sunday or Monday
template <typename DateOrTime>
inline ExtendedDayNum toFirstDayNumOfWeek(DateOrTime v, UInt8 week_mode) const
inline auto toFirstDayNumOfWeek(DateOrTime v, UInt8 week_mode) const
{
bool monday_first_mode = week_mode & static_cast<UInt8>(WeekModeFlag::MONDAY_FIRST);
if (monday_first_mode)
@ -730,7 +839,10 @@ public:
}
else
{
return (toDayOfWeek(v) != 7) ? ExtendedDayNum(v - toDayOfWeek(v)) : toDayNum(v);
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return (toDayOfWeek(v) != 7) ? DayNum(saturateMinus(v, toDayOfWeek(v))) : toDayNum(v);
else
return (toDayOfWeek(v) != 7) ? ExtendedDayNum(v - toDayOfWeek(v)) : toDayNum(v);
}
}
@ -806,7 +918,7 @@ public:
}
template <typename DateOrTime>
inline ExtendedDayNum toStartOfYearInterval(DateOrTime v, UInt64 years) const
inline auto toStartOfYearInterval(DateOrTime v, UInt64 years) const
{
if (years == 1)
return toFirstDayNumOfYear(v);
@ -819,42 +931,63 @@ public:
if (unlikely(year < DATE_LUT_MIN_YEAR))
year = DATE_LUT_MIN_YEAR;
return toDayNum(years_lut[year - DATE_LUT_MIN_YEAR]);
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return toDayNum(LUTIndexWithSaturation(years_lut[year - DATE_LUT_MIN_YEAR]));
else
return toDayNum(years_lut[year - DATE_LUT_MIN_YEAR]);
}
inline ExtendedDayNum toStartOfQuarterInterval(ExtendedDayNum d, UInt64 quarters) const
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
inline auto toStartOfQuarterInterval(Date d, UInt64 quarters) const
{
if (quarters == 1)
return toFirstDayNumOfQuarter(d);
return toStartOfMonthInterval(d, quarters * 3);
}
inline ExtendedDayNum toStartOfMonthInterval(ExtendedDayNum d, UInt64 months) const
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
inline auto toStartOfMonthInterval(Date d, UInt64 months) const
{
if (months == 1)
return toFirstDayNumOfMonth(d);
const Values & values = lut[toLUTIndex(d)];
UInt32 month_total_index = (values.year - DATE_LUT_MIN_YEAR) * 12 + values.month - 1;
return toDayNum(years_months_lut[month_total_index / months * months]);
if constexpr (std::is_same_v<Date, DayNum>)
return toDayNum(LUTIndexWithSaturation(years_months_lut[month_total_index / months * months]));
else
return toDayNum(years_months_lut[month_total_index / months * months]);
}
inline ExtendedDayNum toStartOfWeekInterval(ExtendedDayNum d, UInt64 weeks) const
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
inline auto toStartOfWeekInterval(Date d, UInt64 weeks) const
{
if (weeks == 1)
return toFirstDayNumOfWeek(d);
UInt64 days = weeks * 7;
// January 1st 1970 was a Thursday, so we need this 4-day offset to make weeks start on Monday.
return ExtendedDayNum(4 + (d - 4) / days * days);
if constexpr (std::is_same_v<Date, DayNum>)
return DayNum(4 + (d - 4) / days * days);
else
return ExtendedDayNum(4 + (d - 4) / days * days);
}
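/// (Sketch.) E.g. with weeks = 2: d = 10 (1970-01-11, a Sunday) yields
/// 4 + (10 - 4) / 14 * 14 == 4, i.e. Monday 1970-01-05, the start of its two-week bucket.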
inline Time toStartOfDayInterval(ExtendedDayNum d, UInt64 days) const
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
inline Time toStartOfDayInterval(Date d, UInt64 days) const
{
if (days == 1)
return toDate(d);
return lut[toLUTIndex(ExtendedDayNum(d / days * days))].date;
if constexpr (std::is_same_v<Date, DayNum>)
return lut_saturated[toLUTIndex(ExtendedDayNum(d / days * days))].date;
else
return lut[toLUTIndex(ExtendedDayNum(d / days * days))].date;
}
inline Time toStartOfHourInterval(Time t, UInt64 hours) const
template <typename DateOrTime>
DateOrTime toStartOfHourInterval(DateOrTime t, UInt64 hours) const
{
if (hours == 1)
return toStartOfHour(t);
@ -894,47 +1027,70 @@ public:
time = time / seconds * seconds;
}
return values.date + time;
Time res = values.date + time;
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
{
if (unlikely(res < 0))
return 0;
return res;
}
else
return res;
}
inline Time toStartOfMinuteInterval(Time t, UInt64 minutes) const
template <typename DateOrTime>
DateOrTime toStartOfMinuteInterval(DateOrTime t, UInt64 minutes) const
{
if (minutes == 1)
return toStartOfMinute(t);
Int64 divisor = 60 * minutes;
if (likely(offset_is_whole_number_of_minutes_during_epoch))
{
if (likely(t >= 0))
return t / divisor * divisor;
return (t + 1 - divisor) / divisor * divisor;
}
/** In contrast to the "toStartOfHourInterval" function above,
* the minute intervals are not aligned to midnight.
* You will get unexpected results if, for example, you round down to a 60-minute interval
* and there was a time shift of 30 minutes.
*
* But this is not specified in docs and can be changed in the future.
*/
UInt64 seconds = 60 * minutes;
return roundDown(t, seconds);
Time date = find(t).date;
Time res = date + (t - date) / divisor * divisor;
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
{
if (unlikely(res < 0))
return 0;
return res;
}
else
return res;
}
inline Time toStartOfSecondInterval(Time t, UInt64 seconds) const
template <typename DateOrTime>
DateOrTime toStartOfSecondInterval(DateOrTime t, UInt64 seconds) const
{
if (seconds == 1)
return t;
if (seconds % 60 == 0)
return toStartOfMinuteInterval(t, seconds / 60);
return roundDown(t, seconds);
}
inline LUTIndex makeLUTIndex(Int16 year, UInt8 month, UInt8 day_of_month) const
{
if (unlikely(year < DATE_LUT_MIN_YEAR || year > DATE_LUT_MAX_YEAR || month < 1 || month > 12 || day_of_month < 1 || day_of_month > 31))
if (unlikely(year < DATE_LUT_MIN_YEAR || month < 1 || month > 12 || day_of_month < 1 || day_of_month > 31))
return LUTIndex(0);
return LUTIndex{years_months_lut[(year - DATE_LUT_MIN_YEAR) * 12 + month - 1] + day_of_month - 1};
if (unlikely(year > DATE_LUT_MAX_YEAR))
return LUTIndex(DATE_LUT_SIZE - 1);
auto year_lut_index = (year - DATE_LUT_MIN_YEAR) * 12 + month - 1;
UInt32 index = years_months_lut[year_lut_index].toUnderType() + day_of_month - 1;
/// When date is out of range, default value is DATE_LUT_SIZE - 1 (2299-12-31)
return LUTIndex{std::min(index, static_cast<UInt32>(DATE_LUT_SIZE - 1))};
}
/// Create DayNum from year, month, day of month.
inline ExtendedDayNum makeDayNum(Int16 year, UInt8 month, UInt8 day_of_month) const
inline ExtendedDayNum makeDayNum(Int16 year, UInt8 month, UInt8 day_of_month, Int32 default_error_day_num = 0) const
{
if (unlikely(year < DATE_LUT_MIN_YEAR || year > DATE_LUT_MAX_YEAR || month < 1 || month > 12 || day_of_month < 1 || day_of_month > 31))
return ExtendedDayNum(0);
if (unlikely(year < DATE_LUT_MIN_YEAR || month < 1 || month > 12 || day_of_month < 1 || day_of_month > 31))
return ExtendedDayNum(default_error_day_num);
return toDayNum(makeLUTIndex(year, month, day_of_month));
}
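/// (Sketch.) With the saturation above, makeLUTIndex(2400, 1, 1) now clamps to
/// DATE_LUT_SIZE - 1 (2299-12-31) instead of returning LUTIndex(0), while inputs below
/// DATE_LUT_MIN_YEAR or with an invalid month/day still map to the default value.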
@ -1071,6 +1227,16 @@ public:
num % 100);
}
template <typename DateOrTime>
inline auto toLastDayNumOfMonth(DateOrTime v) const
{
const LUTIndex i = toLUTIndex(v);
if constexpr (std::is_unsigned_v<DateOrTime> || std::is_same_v<DateOrTime, DayNum>)
return toDayNum(LUTIndexWithSaturation(i + (lut[i].days_in_month - lut[i].day_of_month)));
else
return toDayNum(LUTIndex(i + (lut[i].days_in_month - lut[i].day_of_month)));
}
/// Adding calendar intervals.
/// Implementation specific behaviour when delta is too big.
@ -1091,9 +1257,9 @@ public:
return lut[new_index].date + time;
}
inline NO_SANITIZE_UNDEFINED Time addWeeks(Time t, Int64 delta) const
inline NO_SANITIZE_UNDEFINED Time addWeeks(Time t, Int32 delta) const
{
return addDays(t, delta * 7);
return addDays(t, static_cast<Int64>(delta) * 7);
}
inline UInt8 saturateDayOfMonth(Int16 year, UInt8 month, UInt8 day_of_month) const
@ -1136,7 +1302,11 @@ public:
/// If the resulting month has fewer days than the source month, then saturation can happen.
/// Example: 31 Aug + 1 month = 30 Sep.
inline Time NO_SANITIZE_UNDEFINED addMonths(Time t, Int64 delta) const
template <
typename DateTime,
typename
= std::enable_if_t<std::is_same_v<DateTime, UInt32> || std::is_same_v<DateTime, Int64> || std::is_same_v<DateTime, time_t>>>
inline Time NO_SANITIZE_UNDEFINED addMonths(DateTime t, Int64 delta) const
{
const auto result_day = addMonthsIndex(t, delta);
@ -1150,22 +1320,30 @@ public:
if (time >= lut[result_day].time_at_offset_change())
time -= lut[result_day].amount_of_offset_change();
return lut[result_day].date + time;
auto res = lut[result_day].date + time;
if constexpr (std::is_same_v<DateTime, UInt32>)
{
/// A typical compiler should generate branchless code for this saturation operation.
return res <= 0 ? 0 : res;
}
else
return res;
}
inline ExtendedDayNum NO_SANITIZE_UNDEFINED addMonths(ExtendedDayNum d, Int64 delta) const
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
inline auto NO_SANITIZE_UNDEFINED addMonths(Date d, Int64 delta) const
{
return toDayNum(addMonthsIndex(d, delta));
if constexpr (std::is_same_v<Date, DayNum>)
return toDayNum(LUTIndexWithSaturation(addMonthsIndex(d, delta)));
else
return toDayNum(addMonthsIndex(d, delta));
}
inline Time NO_SANITIZE_UNDEFINED addQuarters(Time t, Int64 delta) const
template <typename DateOrTime>
inline auto addQuarters(DateOrTime d, Int32 delta) const
{
return addMonths(t, delta * 3);
}
inline ExtendedDayNum addQuarters(ExtendedDayNum d, Int64 delta) const
{
return addMonths(d, delta * 3);
return addMonths(d, static_cast<Int64>(delta) * 3);
}
template <typename DateOrTime>
@ -1185,7 +1363,11 @@ public:
}
/// Saturation can occur if 29 Feb is mapped to non-leap year.
inline Time addYears(Time t, Int64 delta) const
template <
typename DateTime,
typename
= std::enable_if_t<std::is_same_v<DateTime, UInt32> || std::is_same_v<DateTime, Int64> || std::is_same_v<DateTime, time_t>>>
inline Time addYears(DateTime t, Int64 delta) const
{
auto result_day = addYearsIndex(t, delta);
@ -1199,12 +1381,24 @@ public:
if (time >= lut[result_day].time_at_offset_change())
time -= lut[result_day].amount_of_offset_change();
return lut[result_day].date + time;
auto res = lut[result_day].date + time;
if constexpr (std::is_same_v<DateTime, UInt32>)
{
/// A typical compiler should generate branchless code for this saturation operation.
return res <= 0 ? 0 : res;
}
else
return res;
}
inline ExtendedDayNum addYears(ExtendedDayNum d, Int64 delta) const
template <typename Date,
typename = std::enable_if_t<std::is_same_v<Date, DayNum> || std::is_same_v<Date, ExtendedDayNum>>>
inline auto addYears(Date d, Int64 delta) const
{
return toDayNum(addYearsIndex(d, delta));
if constexpr (std::is_same_v<Date, DayNum>)
return toDayNum(LUTIndexWithSaturation(addYearsIndex(d, delta)));
else
return toDayNum(addYearsIndex(d, delta));
}

View File

@ -70,6 +70,14 @@ public:
m_day = values.day_of_month;
}
explicit LocalDate(ExtendedDayNum day_num)
{
const auto & values = DateLUT::instance().getValues(day_num);
m_year = values.year;
m_month = values.month;
m_day = values.day_of_month;
}
LocalDate(unsigned short year_, unsigned char month_, unsigned char day_)
: m_year(year_), m_month(month_), m_day(day_)
{
@ -98,11 +106,22 @@ public:
return DayNum(lut.makeDayNum(m_year, m_month, m_day).toUnderType());
}
ExtendedDayNum getExtendedDayNum() const
{
const auto & lut = DateLUT::instance();
return ExtendedDayNum (lut.makeDayNum(m_year, m_month, m_day).toUnderType());
}
operator DayNum() const
{
return getDayNum();
}
operator time_t() const
{
return DateLUT::instance().makeDate(m_year, m_month, m_day);
}
unsigned short year() const { return m_year; }
unsigned char month() const { return m_month; }
unsigned char day() const { return m_day; }
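/// (Sketch.) The new ExtendedDayNum overloads let dates in the extended (pre-1970) range
/// round-trip, e.g.:
///     LocalDate d(1960, 5, 1);
///     LocalDate same(d.getExtendedDayNum());   /// identical year/month/day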

View File

@ -91,6 +91,14 @@ public:
LocalDateTime(const LocalDateTime &) noexcept = default;
LocalDateTime & operator= (const LocalDateTime &) noexcept = default;
operator time_t() const
{
return m_year == 0
? 0
: DateLUT::instance().makeDateTime(m_year, m_month, m_day, m_hour, m_minute, m_second);
}
unsigned short year() const { return m_year; }
unsigned char month() const { return m_month; }
unsigned char day() const { return m_day; }

View File

@ -46,9 +46,10 @@ struct StringRef
constexpr StringRef() = default;
std::string toString() const { return std::string(data, size); }
constexpr std::string_view toView() const { return std::string_view{data, size}; }
explicit operator std::string() const { return toString(); }
constexpr explicit operator std::string_view() const { return {data, size}; }
constexpr explicit operator std::string_view() const { return std::string_view{data, size}; }
};
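/// (Sketch, assuming the two-argument constexpr constructor defined elsewhere in this struct.)
///     constexpr StringRef ref{"abc", 3};
///     static_assert(ref.toView().size() == 3);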
/// Here constexpr doesn't imply inline, see https://www.viva64.com/en/w/v1043/

View File

@ -117,6 +117,25 @@
# define ALWAYS_INLINE_NO_SANITIZE_UNDEFINED ALWAYS_INLINE
#endif
#if !defined(ABORT_ON_LOGICAL_ERROR)
#if !defined(NDEBUG) || defined(ADDRESS_SANITIZER) || defined(THREAD_SANITIZER) || defined(MEMORY_SANITIZER) || defined(UNDEFINED_BEHAVIOR_SANITIZER)
#define ABORT_ON_LOGICAL_ERROR
#endif
#endif
/// chassert(x) is similar to assert(x), but:
/// - works in builds with sanitizers, not only in debug builds
/// - tries to print failed assertion into server log
/// It can be used for all assertions except heavy ones.
/// Heavy assertions (that run loops or call complex functions) are allowed in debug builds only.
#if !defined(chassert)
#if defined(ABORT_ON_LOGICAL_ERROR)
#define chassert(x) static_cast<bool>(x) ? void(0) : abortOnFailedAssertion(#x)
#else
#define chassert(x) ((void)0)
#endif
#endif
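/// Usage sketch: chassert(pos < size) aborts with a logged message in sanitizer and debug
/// builds (where ABORT_ON_LOGICAL_ERROR is defined) and compiles to a no-op in release
/// builds; note that in release builds the condition is not evaluated at all.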
/// A template function for suppressing warnings about unused variables or function results.
template <typename... Args>
constexpr void UNUSED(Args &&... args [[maybe_unused]])

View File

@ -10,6 +10,8 @@ using Int128 = wide::integer<128, signed>;
using UInt128 = wide::integer<128, unsigned>;
using Int256 = wide::integer<256, signed>;
using UInt256 = wide::integer<256, unsigned>;
using Int512 = wide::integer<512, signed>;
using UInt512 = wide::integer<512, unsigned>;
static_assert(sizeof(Int256) == 32);
static_assert(sizeof(UInt256) == 32);
@ -25,6 +27,7 @@ struct is_signed
template <> struct is_signed<Int128> { static constexpr bool value = true; };
template <> struct is_signed<Int256> { static constexpr bool value = true; };
template <> struct is_signed<Int512> { static constexpr bool value = true; };
template <typename T>
inline constexpr bool is_signed_v = is_signed<T>::value;
@ -37,6 +40,7 @@ struct is_unsigned
template <> struct is_unsigned<UInt128> { static constexpr bool value = true; };
template <> struct is_unsigned<UInt256> { static constexpr bool value = true; };
template <> struct is_unsigned<UInt512> { static constexpr bool value = true; };
template <typename T>
inline constexpr bool is_unsigned_v = is_unsigned<T>::value;
@ -53,6 +57,8 @@ template <> struct is_integer<Int128> { static constexpr bool value = true; };
template <> struct is_integer<UInt128> { static constexpr bool value = true; };
template <> struct is_integer<Int256> { static constexpr bool value = true; };
template <> struct is_integer<UInt256> { static constexpr bool value = true; };
template <> struct is_integer<Int512> { static constexpr bool value = true; };
template <> struct is_integer<UInt512> { static constexpr bool value = true; };
template <typename T>
inline constexpr bool is_integer_v = is_integer<T>::value;
@ -68,6 +74,8 @@ template <> struct is_arithmetic<Int128> { static constexpr bool value = true; }
template <> struct is_arithmetic<UInt128> { static constexpr bool value = true; };
template <> struct is_arithmetic<Int256> { static constexpr bool value = true; };
template <> struct is_arithmetic<UInt256> { static constexpr bool value = true; };
template <> struct is_arithmetic<Int512> { static constexpr bool value = true; };
template <> struct is_arithmetic<UInt512> { static constexpr bool value = true; };
template <typename T>
@ -83,6 +91,8 @@ template <> struct make_unsigned<Int128> { using type = UInt128; };
template <> struct make_unsigned<UInt128> { using type = UInt128; };
template <> struct make_unsigned<Int256> { using type = UInt256; };
template <> struct make_unsigned<UInt256> { using type = UInt256; };
template <> struct make_unsigned<Int512> { using type = UInt512; };
template <> struct make_unsigned<UInt512> { using type = UInt512; };
template <typename T> using make_unsigned_t = typename make_unsigned<T>::type;
@ -96,6 +106,8 @@ template <> struct make_signed<Int128> { using type = Int128; };
template <> struct make_signed<UInt128> { using type = Int128; };
template <> struct make_signed<Int256> { using type = Int256; };
template <> struct make_signed<UInt256> { using type = Int256; };
template <> struct make_signed<Int512> { using type = Int512; };
template <> struct make_signed<UInt512> { using type = Int512; };
template <typename T> using make_signed_t = typename make_signed<T>::type;
@ -109,6 +121,8 @@ template <> struct is_big_int<Int128> { static constexpr bool value = true; };
template <> struct is_big_int<UInt128> { static constexpr bool value = true; };
template <> struct is_big_int<Int256> { static constexpr bool value = true; };
template <> struct is_big_int<UInt256> { static constexpr bool value = true; };
template <> struct is_big_int<Int512> { static constexpr bool value = true; };
template <> struct is_big_int<UInt512> { static constexpr bool value = true; };
template <typename T>
inline constexpr bool is_big_int_v = is_big_int<T>::value;
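/// (Sketch.) The Int512/UInt512 specializations added throughout this header keep the
/// traits consistent, e.g.:
///     static_assert(is_big_int_v<Int512> && is_signed_v<Int512>);
///     static_assert(std::is_same_v<make_unsigned_t<Int512>, UInt512>);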

View File

@ -15,6 +15,22 @@ namespace
return Poco::Net::DNS::hostName();
}
}
std::string getIPOrFQDNOrHostNameImpl()
{
try
{
auto this_host = Poco::Net::DNS::thisHost();
if (this_host.addresses().size() > 0)
return this_host.addresses().front().toString();
else
return this_host.name();
}
catch (...)
{
return Poco::Net::DNS::hostName();
}
}
}
@ -23,3 +39,9 @@ const std::string & getFQDNOrHostName()
static std::string result = getFQDNOrHostNameImpl();
return result;
}
const std::string & getIPOrFQDNOrHostName()
{
static std::string result = getIPOrFQDNOrHostNameImpl();
return result;
}

View File

@ -7,3 +7,4 @@
* If it does not work, return hostname - similar to calling 'hostname' without flags or 'uname -n'.
*/
const std::string & getFQDNOrHostName();
const std::string & getIPOrFQDNOrHostName();

View File

@ -23,12 +23,12 @@ namespace
#define LOG_IMPL(logger, priority, PRIORITY, ...) do \
{ \
const bool is_clients_log = (DB::CurrentThread::getGroup() != nullptr) && \
const bool __is_clients_log = (DB::CurrentThread::getGroup() != nullptr) && \
(DB::CurrentThread::getGroup()->client_logs_level >= (priority)); \
if ((logger)->is((PRIORITY)) || is_clients_log) \
if ((logger)->is((PRIORITY)) || __is_clients_log) \
{ \
std::string formatted_message = numArgs(__VA_ARGS__) > 1 ? fmt::format(__VA_ARGS__) : firstArg(__VA_ARGS__); \
if (auto channel = (logger)->getChannel()) \
if (auto __channel = (logger)->getChannel()) \
{ \
std::string file_function; \
file_function += __FILE__; \
@ -36,7 +36,7 @@ namespace
file_function += __PRETTY_FUNCTION__; \
Poco::Message poco_message((logger)->name(), formatted_message, \
(PRIORITY), file_function.c_str(), __LINE__); \
channel->log(poco_message); \
__channel->log(poco_message); \
} \
} \
} while (false)

44
base/common/singleton.h Normal file
View File

@ -0,0 +1,44 @@
#pragma once
namespace ext
{
/** Example (1):
*
* class Derived : public ext::singleton<Derived>
* {
* friend class ext::singleton<Derived>;
* ...
* protected:
* Derived() {};
* };
*
* Example (2):
*
* class Some
* {
* ...
* };
*
* class SomeSingleton : public Some, public ext::singleton<SomeSingleton> {}
*/
template <typename T> class singleton
{
public:
static T & instance()
{
/// C++11 has thread safe statics. GCC and Clang have thread safe statics by default even before C++11.
static T instance;
return instance;
}
protected:
singleton() {}
private:
singleton(const singleton &);
singleton & operator=(const singleton &);
};
}
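A minimal usage sketch of the helper above (the Registry class is illustrative):
class Registry : public ext::singleton<Registry>
{
    friend class ext::singleton<Registry>;
public:
    void set(int v) { value = v; }
    int get() const { return value; }
protected:
    Registry() = default;
private:
    int value = 0;
};
// Every call returns the same lazily-constructed instance:
//   Registry::instance().set(42);
//   Registry::instance().get(); // == 42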

View File

@ -990,8 +990,15 @@ constexpr integer<Bits, Signed>::integer(std::initializer_list<T> il) noexcept
{
auto it = il.begin();
for (size_t i = 0; i < _impl::item_count; ++i)
{
if (it < il.end())
{
items[i] = *it;
++it;
}
else
items[i] = 0;
}
}
}
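The effect of the change above, sketched in comments (illustrative; the significance order of items[] follows the _impl conventions not shown here): entries beyond the initializer list are now zero-filled instead of left uninitialized.
// Assuming integer<256, unsigned> has four 64-bit items:
//   integer<256, unsigned> x{1, 2};
// Before: items[0] == 1, items[1] == 2, items[2] and items[3] indeterminate.
// After:  items[0] == 1, items[1] == 2, items[2] == 0, items[3] == 0.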

View File

@ -63,6 +63,11 @@
# include <Common/config_version.h>
#endif
#include <Common/config.h>
#if USE_BREAKPAD
# include "client/linux/handler/exception_handler.h"
#endif
#if defined(OS_DARWIN)
# pragma GCC diagnostic ignored "-Wunused-macros"
# define _XOPEN_SOURCE 700 // ucontext is not available without _XOPEN_SOURCE
@ -73,6 +78,33 @@ namespace fs = std::filesystem;
DB::PipeFDs signal_pipe;
#if USE_BREAKPAD
static bool use_minidump = true;
static std::shared_ptr<google_breakpad::MinidumpDescriptor> descriptor;
static std::shared_ptr<google_breakpad::ExceptionHandler> eh;
static bool dumpCallbackInfo(const google_breakpad::MinidumpDescriptor & descriptor, void *, bool succeeded)
{
LOG_INFO(&Poco::Logger::get("Minidump"), "SCM {}, Signal dump path: {}", VERSION_SCM, descriptor.path());
return succeeded;
}
static bool dumpCallbackError(const google_breakpad::MinidumpDescriptor & descriptor, void *, bool succeeded)
{
LOG_ERROR(&Poco::Logger::get("Minidump"), "SCM {}, core dump path: {}", VERSION_SCM, descriptor.path());
return succeeded;
}
static std::string getMinidumpPath(const std::string & log_path)
{
auto path = Poco::Path(log_path).makeParent();
if (path.toString().empty())
return "/tmp";
return path.toString();
}
#endif
/** Reset signal handler to the default and send signal to itself.
* It's called from user signal handler to write core dump.
@ -80,6 +112,11 @@ DB::PipeFDs signal_pipe;
static void call_default_signal_handler(int sig)
{
signal(sig, SIG_DFL);
#if USE_BREAKPAD
if (use_minidump)
eh = std::shared_ptr<google_breakpad::ExceptionHandler>(
new google_breakpad::ExceptionHandler(*descriptor, nullptr, dumpCallbackError, nullptr, true, -1));
#endif
raise(sig);
}
@ -116,6 +153,11 @@ static void closeLogsSignalHandler(int sig, siginfo_t *, void *)
writeSignalIDtoSignalPipe(sig);
}
static void minidumpSignalHandler(int sig, siginfo_t *, void *)
{
writeSignalIDtoSignalPipe(sig);
}
static void terminateRequestedSignalHandler(int sig, siginfo_t *, void *)
{
DENY_ALLOCATIONS_IN_SCOPE;
@ -212,6 +254,17 @@ public:
BaseDaemon::instance().closeLogs(BaseDaemon::instance().logger());
LOG_INFO(log, "Opened new log file after received signal.");
}
else if (sig == SIGUSR2)
{
LOG_DEBUG(log, "Received signal to minidump");
#if USE_BREAKPAD
if (use_minidump)
{
google_breakpad::ExceptionHandler s_eh(*descriptor, NULL, dumpCallbackInfo, NULL, true, -1);
s_eh.WriteMinidump();
}
#endif
}
else if (sig == Signals::StdTerminate)
{
UInt32 thread_num;
@ -222,9 +275,7 @@ public:
onTerminate(message, thread_num);
}
else if (sig == SIGINT ||
sig == SIGQUIT ||
sig == SIGTERM)
else if (sig == SIGINT || sig == SIGQUIT || sig == SIGTERM)
{
daemon.handleSignal(sig);
}
@ -261,8 +312,15 @@ private:
void onTerminate(const std::string & message, UInt32 thread_num) const
{
LOG_FATAL(log, "(version {}{}, {}) (from thread {}) {}",
VERSION_STRING, VERSION_OFFICIAL, daemon.build_id_info, thread_num, message);
LOG_FATAL(
log,
"(version {}{} scm {}, {}) (from thread {}) {}",
VERSION_STRING,
VERSION_OFFICIAL,
VERSION_SCM,
daemon.build_id_info,
thread_num,
message);
}
void onFault(
@ -288,15 +346,30 @@ private:
if (query_id.empty())
{
LOG_FATAL(log, "(version {}{}, {}) (from thread {}) (no query) Received signal {} ({})",
VERSION_STRING, VERSION_OFFICIAL, daemon.build_id_info,
thread_num, strsignal(sig), sig);
LOG_FATAL(
log,
"(version {}{} scm{}, {}) (from thread {}) (no query) Received signal {} ({})",
VERSION_STRING,
VERSION_OFFICIAL,
VERSION_SCM,
daemon.build_id_info,
thread_num,
strsignal(sig),
sig);
}
else
{
LOG_FATAL(log, "(version {}{}, {}) (from thread {}) (query_id: {}) Received signal {} ({})",
VERSION_STRING, VERSION_OFFICIAL, daemon.build_id_info,
thread_num, query_id, strsignal(sig), sig);
LOG_FATAL(
log,
"(version {}{} scm {}, {}) (from thread {}) (query_id: {}) Received signal {} ({})",
VERSION_STRING,
VERSION_OFFICIAL,
VERSION_SCM,
daemon.build_id_info,
thread_num,
query_id,
strsignal(sig),
sig);
}
String error_message;
@ -330,8 +403,11 @@ private:
String calculated_binary_hash = getHashOfLoadedBinaryHex();
if (daemon.stored_binary_hash.empty())
{
LOG_FATAL(log, "Calculated checksum of the binary: {}."
" There is no information about the reference checksum.", calculated_binary_hash);
LOG_FATAL(
log,
"Calculated checksum of the binary: {}."
" There is no information about the reference checksum.",
calculated_binary_hash);
}
else if (calculated_binary_hash == daemon.stored_binary_hash)
{
@ -339,15 +415,18 @@ private:
}
else
{
LOG_FATAL(log, "Calculated checksum of the ClickHouse binary ({0}) does not correspond"
LOG_FATAL(
log,
"Calculated checksum of the ClickHouse binary ({0}) does not correspond"
" to the reference checksum stored in the binary ({1})."
" It may indicate one of the following:"
" - the file was changed just after startup;"
" - the file is damaged on disk due to faulty hardware;"
" - the loaded executable is damaged in memory due to faulty hardware;"
" - the file was intentionally modified;"
" - logical error in code."
, calculated_binary_hash, daemon.stored_binary_hash);
" - logical error in code.",
calculated_binary_hash,
daemon.stored_binary_hash);
}
#endif
@ -587,7 +666,15 @@ void debugIncreaseOOMScore() {}
void BaseDaemon::initialize(Application & self)
{
closeFDs();
/// BaseDaemon used to close inheritable file descriptors from the parent process to avoid
/// security vulnerabilities and file resource reuse issues such as TCP port reuse.
/// But closing inheritable fds here may be too late: global variables initialized
/// before main entry may already have opened fds, which leads to subtle problems such as
/// closing an fd that BaseDaemon already closed before.
/// For example, brpc creates bvars as global variables and opens files under /proc
/// before closeFDs is called.
/// For our situation, simply ignoring inheritable fds is fine.
// closeFDs();
ServerApplication::initialize(self);
@ -687,6 +774,12 @@ void BaseDaemon::initialize(Application & self)
throw Poco::Exception("Cannot change directory to /tmp");
}
#if USE_BREAKPAD
use_minidump = config().getBool("use_minidump", true);
if (use_minidump)
descriptor = std::make_shared<google_breakpad::MinidumpDescriptor>(getMinidumpPath(config().getString("logger.log", "")));
#endif
/// sensitive data masking rules are not used here
buildLoggers(config(), logger(), self.commandName());
@ -796,6 +889,7 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP, SIGTRAP}, signalHandler, &handled_signals);
addSignalHandler({SIGHUP, SIGUSR1}, closeLogsSignalHandler, &handled_signals);
addSignalHandler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler, &handled_signals);
addSignalHandler({SIGUSR2}, minidumpSignalHandler, &handled_signals);
#if defined(SANITIZER)
__sanitizer_set_death_callback(sanitizerDeathCallback);
@ -831,10 +925,9 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
void BaseDaemon::logRevision() const
{
Poco::Logger::root().information("Starting " + std::string{VERSION_FULL}
+ " with revision " + std::to_string(ClickHouseRevision::getVersionRevision())
+ ", " + build_id_info
+ ", PID " + std::to_string(getpid()));
Poco::Logger::root().information(
"Starting " + std::string { VERSION_FULL } + " scm " + std::string{VERSION_SCM} + " with revision "
+ std::to_string(ClickHouseRevision::getVersionRevision()) + ", " + build_id_info + ", PID " + std::to_string(getpid()));
}
void BaseDaemon::defineOptions(Poco::Util::OptionSet & new_options)
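Taken together, the breakpad pieces above let an operator request an on-demand minidump from a live daemon. A minimal sender-side sketch (the pid value is a placeholder for whatever the target daemon reports; plain POSIX kill()):
#include <signal.h>
#include <sys/types.h>
// SIGUSR2 is forwarded through the signal pipe; the listener thread then
// calls WriteMinidump(), so the daemon keeps running after the dump.
int requestMinidump(pid_t daemon_pid)
{
    return ::kill(daemon_pid, SIGUSR2); // 0 on success, -1 with errno set on failure
}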

View File

@ -15,3 +15,16 @@ target_link_libraries (daemon PUBLIC loggers PRIVATE clickhouse_common_io clickh
if (USE_SENTRY)
target_link_libraries (daemon PRIVATE ${SENTRY_LIBRARY})
endif ()
if (USE_BREAKPAD)
target_link_libraries(daemon PRIVATE breakpad_client)
# Ignore warnings while compiling breakpad
target_compile_options(daemon PRIVATE "-w")
add_dependencies(daemon minidump_stackwalk minidump_dump minidump-2-core core2md dump_syms)
install(TARGETS minidump_stackwalk minidump_dump minidump-2-core core2md dump_syms
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/usr/breakpad/bin
COMPONENT breakpad
)
endif (USE_BREAKPAD)

View File

@ -85,7 +85,7 @@ TRAP(getprotoent)
TRAP(getpwent)
TRAP(getpwent_r)
TRAP(getpwnam)
TRAP(getpwuid)
//TRAP(getpwuid)
TRAP(getservbyname)
TRAP(getservbyport)
TRAP(getservent)
@ -201,14 +201,14 @@ TRAP(dlerror) // Used by tsan
TRAP(ftw)
TRAP(getc_unlocked)
//TRAP(getenv) // Ok at program startup
TRAP(inet_ntoa)
// TRAP(inet_ntoa) // Used by udns
TRAP(lgamma)
TRAP(lgammaf)
TRAP(lgammal)
TRAP(nftw)
TRAP(nl_langinfo)
TRAP(putc_unlocked)
TRAP(rand)
//TRAP(rand)
/** In the current POSIX.1 specification (POSIX.1-2008), readdir() is not required to be thread-safe. However, in modern
* implementations (including the glibc implementation), concurrent calls to readdir() that specify different directory streams
* are thread-safe. In cases where multiple threads must read from the same directory stream, using readdir() with external

View File

@ -0,0 +1,524 @@
#include "BigInteger.hh"
#include "BigIntegerUtils.hh"
void BigInteger::operator =(const BigInteger &x) {
// Calls like a = a have no effect
if (this == &x)
return;
// Copy sign
sign = x.sign;
// Copy the rest
mag = x.mag;
}
BigInteger::BigInteger(const Blk *b, Index blen, Sign s) : mag(b, blen) {
switch (s) {
case zero:
if (!mag.isZero())
throw "BigInteger::BigInteger(const Blk *, Index, Sign): Cannot use a sign of zero with a nonzero magnitude";
sign = zero;
break;
case positive:
case negative:
// If the magnitude is zero, force the sign to zero.
sign = mag.isZero() ? zero : s;
break;
}
}
BigInteger::BigInteger(const BigUnsigned &x, Sign s) : mag(x) {
switch (s) {
case zero:
if (!mag.isZero())
throw "BigInteger::BigInteger(const BigUnsigned &, Sign): Cannot use a sign of zero with a nonzero magnitude";
sign = zero;
break;
case positive:
case negative:
// If the magnitude is zero, force the sign to zero.
sign = mag.isZero() ? zero : s;
break;
}
}
/* CONSTRUCTION FROM PRIMITIVE INTEGERS
* Same idea as in BigUnsigned.cc, except that negative input results in a
* negative BigInteger instead of an exception. */
// Done longhand to let us use initialization.
BigInteger::BigInteger(unsigned long x) : mag(x) { sign = mag.isZero() ? zero : positive; }
BigInteger::BigInteger(unsigned int x) : mag(x) { sign = mag.isZero() ? zero : positive; }
BigInteger::BigInteger(unsigned short x) : mag(x) { sign = mag.isZero() ? zero : positive; }
// For signed input, determine the desired magnitude and sign separately.
namespace {
template <class X, class UX>
BigInteger::Blk magOf(X x) {
/* UX(...) cast needed to stop short(-2^15), which negates to
* itself, from sign-extending in the conversion to Blk. */
return BigInteger::Blk(x < 0 ? UX(-x) : x);
}
template <class X>
BigInteger::Sign signOf(X x) {
return (x == 0) ? BigInteger::zero
: (x > 0) ? BigInteger::positive
: BigInteger::negative;
}
}
BigInteger::BigInteger(long x) : sign(signOf(x)), mag(magOf<long , unsigned long >(x)) {}
BigInteger::BigInteger(int x) : sign(signOf(x)), mag(magOf<int , unsigned int >(x)) {}
BigInteger::BigInteger(short x) : sign(signOf(x)), mag(magOf<short, unsigned short>(x)) {}
// CONVERSION TO PRIMITIVE INTEGERS
/* Reuse BigUnsigned's conversion to an unsigned primitive integer.
* The friend is a separate function rather than
* BigInteger::convertToUnsignedPrimitive to avoid requiring BigUnsigned to
* declare BigInteger. */
template <class X>
inline X convertBigUnsignedToPrimitiveAccess(const BigUnsigned &a) {
return a.convertToPrimitive<X>();
}
template <class X>
X BigInteger::convertToUnsignedPrimitive() const {
if (sign == negative)
throw "BigInteger::to<Primitive>: "
"Cannot convert a negative integer to an unsigned type";
else
return convertBigUnsignedToPrimitiveAccess<X>(mag);
}
/* Similar to BigUnsigned::convertToPrimitive, but split into two cases for
* nonnegative and negative numbers. */
template <class X, class UX>
X BigInteger::convertToSignedPrimitive() const {
if (sign == zero)
return 0;
else if (mag.getLength() == 1) {
// The single block might fit in an X. Try the conversion.
Blk b = mag.getBlock(0);
if (sign == positive) {
X x = X(b);
if (x >= 0 && Blk(x) == b)
return x;
} else {
X x = -X(b);
/* UX(...) needed to avoid rejecting conversion of
* -2^15 to a short. */
if (x < 0 && Blk(UX(-x)) == b)
return x;
}
// Otherwise fall through.
}
throw "BigInteger::to<Primitive>: "
"Value is too big to fit in the requested type";
}
unsigned long BigInteger::toUnsignedLong () const { return convertToUnsignedPrimitive<unsigned long > (); }
unsigned int BigInteger::toUnsignedInt () const { return convertToUnsignedPrimitive<unsigned int > (); }
unsigned short BigInteger::toUnsignedShort() const { return convertToUnsignedPrimitive<unsigned short> (); }
long BigInteger::toLong () const { return convertToSignedPrimitive <long , unsigned long> (); }
int BigInteger::toInt () const { return convertToSignedPrimitive <int , unsigned int> (); }
short BigInteger::toShort () const { return convertToSignedPrimitive <short, unsigned short>(); }
// COMPARISON
BigInteger::CmpRes BigInteger::compareTo(const BigInteger &x) const {
// A greater sign implies a greater number
if (sign < x.sign)
return less;
else if (sign > x.sign)
return greater;
else switch (sign) {
// If the signs are the same...
case zero:
return equal; // Two zeros are equal
case positive:
// Compare the magnitudes
return mag.compareTo(x.mag);
case negative:
// Compare the magnitudes, but return the opposite result
return CmpRes(-mag.compareTo(x.mag));
}
}
/* COPY-LESS OPERATIONS
* These do some messing around to determine the sign of the result,
* then call one of BigUnsigned's copy-less operations. */
// See remarks about aliased calls in BigUnsigned.cc .
#define DTRT_ALIASED(cond, op) \
if (cond) { \
BigInteger tmpThis; \
tmpThis.op; \
*this = tmpThis; \
return; \
}
void BigInteger::add(const BigInteger &a, const BigInteger &b) {
DTRT_ALIASED(this == &a || this == &b, add(a, b))
// If one argument is zero, copy the other.
if (a.sign == zero)
operator =(b);
else if (b.sign == zero)
operator =(a);
// If the arguments have the same sign, take the
// common sign and add their magnitudes.
else if (a.sign == b.sign) {
sign = a.sign;
mag.add(a.mag, b.mag);
} else {
// Otherwise, their magnitudes must be compared.
switch (a.mag.compareTo(b.mag)) {
case equal:
// If their magnitudes are the same, copy zero.
mag = 0;
sign = zero;
break;
// Otherwise, take the sign of the greater, and subtract
// the lesser magnitude from the greater magnitude.
case greater:
sign = a.sign;
mag.subtract(a.mag, b.mag);
break;
case less:
sign = b.sign;
mag.subtract(b.mag, a.mag);
break;
}
}
}
void BigInteger::subtract(const BigInteger &a, const BigInteger &b) {
// Notice that this routine is identical to BigInteger::add,
// if one replaces b.sign by its opposite.
DTRT_ALIASED(this == &a || this == &b, subtract(a, b))
// If a is zero, copy b and flip its sign. If b is zero, copy a.
if (a.sign == zero) {
mag = b.mag;
// Take the negative of _b_'s sign, not ours.
// Bug pointed out by Sam Larkin on 2005.03.30.
sign = Sign(-b.sign);
} else if (b.sign == zero)
operator =(a);
// If their signs differ, take a.sign and add the magnitudes.
else if (a.sign != b.sign) {
sign = a.sign;
mag.add(a.mag, b.mag);
} else {
// Otherwise, their magnitudes must be compared.
switch (a.mag.compareTo(b.mag)) {
// If their magnitudes are the same, copy zero.
case equal:
mag = 0;
sign = zero;
break;
// If a's magnitude is greater, take a.sign and
// subtract a from b.
case greater:
sign = a.sign;
mag.subtract(a.mag, b.mag);
break;
// If b's magnitude is greater, take the opposite
// of b.sign and subtract b from a.
case less:
sign = Sign(-b.sign);
mag.subtract(b.mag, a.mag);
break;
}
}
}
void BigInteger::multiply(const BigInteger &a, const BigInteger &b) {
DTRT_ALIASED(this == &a || this == &b, multiply(a, b))
// If one object is zero, copy zero and return.
if (a.sign == zero || b.sign == zero) {
sign = zero;
mag = 0;
return;
}
// If the signs of the arguments are the same, the result
// is positive, otherwise it is negative.
sign = (a.sign == b.sign) ? positive : negative;
// Multiply the magnitudes.
mag.multiply(a.mag, b.mag);
}
/*
* DIVISION WITH REMAINDER
* Please read the comments before the definition of
* `BigUnsigned::divideWithRemainder' in `BigUnsigned.cc' for lots of
* information you should know before reading this function.
*
* Following Knuth, I decree that x / y is to be
* 0 if y==0 and floor(real-number x / y) if y!=0.
* Then x % y shall be x - y*(integer x / y).
*
* Note that x = y * (x / y) + (x % y) always holds.
* In addition, (x % y) is from 0 to y - 1 if y > 0,
* and from -(|y| - 1) to 0 if y < 0. (x % y) = x if y = 0.
*
* Examples: (q = a / b, r = a % b)
* a b q r
* === === === ===
* 4 3 1 1
* -4 3 -2 2
* 4 -3 -2 -2
* -4 -3 1 -1
*/
void BigInteger::divideWithRemainder(const BigInteger &b, BigInteger &q) {
// Defend against aliased calls;
// same idea as in BigUnsigned::divideWithRemainder .
if (this == &q)
throw "BigInteger::divideWithRemainder: Cannot write quotient and remainder into the same variable";
if (this == &b || &q == &b) {
BigInteger tmpB(b);
divideWithRemainder(tmpB, q);
return;
}
// Division by zero gives quotient 0 and remainder *this
if (b.sign == zero) {
q.mag = 0;
q.sign = zero;
return;
}
// 0 / b gives quotient 0 and remainder 0
if (sign == zero) {
q.mag = 0;
q.sign = zero;
return;
}
// Here *this != 0, b != 0.
// Do the operands have the same sign?
if (sign == b.sign) {
// Yes: easy case. Quotient is zero or positive.
q.sign = positive;
} else {
// No: harder case. Quotient is negative.
q.sign = negative;
// Decrease the magnitude of the dividend by one.
mag--;
/*
* We tinker with the dividend before and with the
* quotient and remainder after so that the result
* comes out right. To see why it works, consider the following
* list of examples, where A is the magnitude-decreased
* a, Q and R are the results of BigUnsigned division
* with remainder on A and |b|, and q and r are the
* final results we want:
*
* a A b Q R q r
* -3 -2 3 0 2 -1 0
* -4 -3 3 1 0 -2 2
* -5 -4 3 1 1 -2 1
* -6 -5 3 1 2 -2 0
*
* It appears that we need a total of 3 corrections:
* Decrease the magnitude of a to get A. Increase the
* magnitude of Q to get q (and make it negative).
* Find r = (b - 1) - R and give it the desired sign.
*/
}
// Divide the magnitudes.
mag.divideWithRemainder(b.mag, q.mag);
if (sign != b.sign) {
// More for the harder case (as described):
// Increase the magnitude of the quotient by one.
q.mag++;
// Modify the remainder.
mag.subtract(b.mag, mag);
mag--;
}
// Sign of the remainder is always the sign of the divisor b.
sign = b.sign;
// Set signs to zero as necessary. (Thanks David Allen!)
if (mag.isZero())
sign = zero;
if (q.mag.isZero())
q.sign = zero;
// WHEW!!!
}
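A small check of the example table above (a sketch using the constructors and comparisons declared in BigInteger.hh):
#include "BigInteger.hh"
#include <cassert>
void checkFloorDivision()
{
    BigInteger a(-4), b(3), q;
    BigInteger r = a;            // the remainder is computed in place
    r.divideWithRemainder(b, q);
    assert(q == BigInteger(-2)); // floor(-4 / 3), per the table
    assert(r == BigInteger(2));  // and -4 == 3 * (-2) + 2 holds
}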
// Negation
void BigInteger::negate(const BigInteger &a) {
DTRT_ALIASED(this == &a, negate(a))
// Copy a's magnitude
mag = a.mag;
// Copy the opposite of a.sign
sign = Sign(-a.sign);
}
// INCREMENT/DECREMENT OPERATORS
// Prefix increment
void BigInteger::operator ++() {
if (sign == negative) {
mag--;
if (mag == 0)
sign = zero;
} else {
mag++;
sign = positive; // if not already
}
}
// Postfix increment: same as prefix
void BigInteger::operator ++(int) {
operator ++();
}
// Prefix decrement
void BigInteger::operator --() {
if (sign == positive) {
mag--;
if (mag == 0)
sign = zero;
} else {
mag++;
sign = negative;
}
}
// Postfix decrement: same as prefix
void BigInteger::operator --(int)
{
operator--();
}
void BigInteger::bitXor(const BigInteger &a, const BigInteger &b)
{
DTRT_ALIASED(this == &a || this == &b, bitXor(a, b))
if (a.isZero())
{
sign = b.sign;
mag = b.mag;
return;
}
else if (b.isZero())
{
sign = a.sign;
mag = a.mag;
return;
}
if (a.sign == b.sign)
sign = positive;
else
sign = negative;
bool needDeleteA = false, needDeleteB = false;
BigUnsigned *ma = const_cast<BigUnsigned*>(&(a.mag)), *mb = const_cast<BigUnsigned*>(&(b.mag));
if (a.sign == negative)
{
ma = new BigUnsigned();
*ma = ~a.mag + 1;
needDeleteA = true;
}
if (b.sign == negative)
{
mb = new BigUnsigned();
*mb = ~b.mag + 1;
needDeleteB = true;
}
if (ma->getLength() < mb->getLength())
ma->fillLeading(mb->getLength(), a.sign == positive);
else if (ma->getLength() > mb->getLength())
mb->fillLeading(ma->getLength(), b.sign == positive);
mag.bitXor(*ma, *mb, false);
if (sign == positive)
{
mag.zapLeadingZeros();
if (mag.isZero())
sign = zero;
}
else if (sign == negative)
{
mag = ~mag + 1;
}
if (needDeleteA)
delete ma;
if (needDeleteB)
delete mb;
}
void BigInteger::bitShiftLeft(const BigInteger &a, int b)
{
DTRT_ALIASED(this == &a, bitShiftLeft(a, b))
if (b < 0) {
if (b << 1 == 0)
throw "BigInteger::bitShiftLeft: "
"Pathological shift amount not implemented";
else {
bitShiftRight(a, -b);
return;
}
}
if (a.sign == zero)
{
sign = zero;
mag.setZero();
}
else
{
mag = a.mag << b;
sign = a.sign;
}
}
void BigInteger::bitShiftRight(const BigInteger &a, int b)
{
DTRT_ALIASED(this == &a, bitShiftRight(a, b))
if (b < 0) {
if (b << 1 == 0)
throw "BigInteger::bitShiftRight: "
"Pathological shift amount not implemented";
else {
bitShiftLeft(a, -b);
return;
}
}
if (a.sign == zero)
{
sign = zero;
mag.setZero();
}
else
{
sign = a.sign;
mag = a.mag >> b;
if (mag.isZero())
{
/// Handle the special case that all bits were shifted off the end
if (sign == positive)
sign = zero;
else
mag = 1;
}
else
{
if (sign == negative)
{
/// Find out whether any one-bits were shifted off the end
int nULong = b >> 6;
int nBits = b & 0x3f;
bool onesLost = false;
for (int i = 0; i < nULong && !onesLost; ++i)
onesLost = (a.mag.getBlock(i) != 0);
if (!onesLost && nBits != 0)
onesLost = ((a.mag.getBlock(nULong) << (64 - nBits)) != 0);
if (onesLost)
mag += 1;
}
}
}
}
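The negative-operand branch above gives arithmetic (floor) shift semantics; a sketch of the expected values:
#include "BigInteger.hh"
#include <cassert>
void checkArithmeticShift()
{
    assert((BigInteger(-5) >> 1) == BigInteger(-3));  // a one-bit was shifted off, so round toward -infinity
    assert((BigInteger(-4) >> 1) == BigInteger(-2));  // no one-bits lost: exact division by 2
    assert((BigInteger(-1) >> 10) == BigInteger(-1)); // a negative value never shifts past -1
}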

View File

@ -0,0 +1,256 @@
#ifndef BIGINTEGER_H
#define BIGINTEGER_H
#include "BigUnsigned.hh"
/* A BigInteger object represents a signed integer of size limited only by
* available memory. BigIntegers support most mathematical operators and can
* be converted to and from most primitive integer types.
*
* A BigInteger is just an aggregate of a BigUnsigned and a sign. (It is no
* longer derived from BigUnsigned because that led to harmful implicit
* conversions.) */
class BigInteger {
public:
typedef BigUnsigned::Blk Blk;
typedef BigUnsigned::Index Index;
typedef BigUnsigned::CmpRes CmpRes;
static const CmpRes
less = BigUnsigned::less ,
equal = BigUnsigned::equal ,
greater = BigUnsigned::greater;
// Enumeration for the sign of a BigInteger.
enum Sign { negative = -1, zero = 0, positive = 1 };
protected:
Sign sign;
BigUnsigned mag;
public:
// Constructs zero.
BigInteger() : sign(zero), mag() {}
// Copy constructor
BigInteger(const BigInteger &x) : sign(x.sign), mag(x.mag) {}
// Assignment operator
void operator=(const BigInteger &x);
// Constructor that copies from a given array of blocks with a sign.
BigInteger(const Blk *b, Index blen, Sign s);
// Nonnegative constructor that copies from a given array of blocks.
BigInteger(const Blk *b, Index blen) : mag(b, blen) {
sign = mag.isZero() ? zero : positive;
}
// Constructor from a BigUnsigned and a sign
BigInteger(const BigUnsigned &x, Sign s);
// Nonnegative constructor from a BigUnsigned
BigInteger(const BigUnsigned &x) : mag(x) {
sign = mag.isZero() ? zero : positive;
}
// Constructors from primitive integer types
BigInteger(unsigned long x);
BigInteger( long x);
BigInteger(unsigned int x);
BigInteger( int x);
BigInteger(unsigned short x);
BigInteger( short x);
/* Converters to primitive integer types
* The implicit conversion operators caused trouble, so these are now
* named. */
unsigned long toUnsignedLong () const;
long toLong () const;
unsigned int toUnsignedInt () const;
int toInt () const;
unsigned short toUnsignedShort() const;
short toShort () const;
protected:
// Helper
template <class X> X convertToUnsignedPrimitive() const;
template <class X, class UX> X convertToSignedPrimitive() const;
public:
// ACCESSORS
Sign getSign() const { return sign; }
/* The client can't do any harm by holding a read-only reference to the
* magnitude. */
const BigUnsigned &getMagnitude() const { return mag; }
// Some accessors that go through to the magnitude
Index getLength() const { return mag.getLength(); }
Index getCapacity() const { return mag.getCapacity(); }
Blk getBlock(Index i) const { return mag.getBlock(i); }
bool isZero() const { return sign == zero; } // A bit special
// COMPARISONS
// Compares this to x like Perl's <=>
CmpRes compareTo(const BigInteger &x) const;
// Ordinary comparison operators
bool operator ==(const BigInteger &x) const {
return sign == x.sign && mag == x.mag;
}
bool operator !=(const BigInteger &x) const { return !operator ==(x); }
bool operator < (const BigInteger &x) const { return compareTo(x) == less ; }
bool operator <=(const BigInteger &x) const { return compareTo(x) != greater; }
bool operator >=(const BigInteger &x) const { return compareTo(x) != less ; }
bool operator > (const BigInteger &x) const { return compareTo(x) == greater; }
// OPERATORS -- See the discussion in BigUnsigned.hh.
void add (const BigInteger &a, const BigInteger &b);
void subtract(const BigInteger &a, const BigInteger &b);
void multiply(const BigInteger &a, const BigInteger &b);
/* See the comment on BigUnsigned::divideWithRemainder. Semantics
* differ from those of primitive integers when negatives and/or zeros
* are involved. */
void divideWithRemainder(const BigInteger &b, BigInteger &q);
void negate(const BigInteger &a);
/* Bitwise operators are not provided for BigIntegers. Use
* getMagnitude to get the magnitude and operate on that instead. */
void bitXor(const BigInteger &a, const BigInteger &b);
void bitShiftLeft(const BigInteger &a, int b);
void bitShiftRight(const BigInteger &a, int b);
BigInteger operator +(const BigInteger &x) const;
BigInteger operator -(const BigInteger &x) const;
BigInteger operator *(const BigInteger &x) const;
BigInteger operator /(const BigInteger &x) const;
BigInteger operator %(const BigInteger &x) const;
BigInteger operator -() const;
void operator +=(const BigInteger &x);
void operator -=(const BigInteger &x);
void operator *=(const BigInteger &x);
void operator /=(const BigInteger &x);
void operator %=(const BigInteger &x);
void flipSign();
BigInteger operator <<(int b) const;
BigInteger operator >>(int b) const;
BigInteger operator ^(const BigInteger &x) const;
void operator ^=(const BigInteger &x);
void operator <<=(int b);
void operator >>=(int b);
// INCREMENT/DECREMENT OPERATORS
void operator ++( );
void operator ++(int);
void operator --( );
void operator --(int);
};
// NORMAL OPERATORS
/* These create an object to hold the result and invoke
* the appropriate put-here operation on it, passing
* this and x. The new object is then returned. */
inline BigInteger BigInteger::operator +(const BigInteger &x) const {
BigInteger ans;
ans.add(*this, x);
return ans;
}
inline BigInteger BigInteger::operator -(const BigInteger &x) const {
BigInteger ans;
ans.subtract(*this, x);
return ans;
}
inline BigInteger BigInteger::operator *(const BigInteger &x) const {
BigInteger ans;
ans.multiply(*this, x);
return ans;
}
inline BigInteger BigInteger::operator /(const BigInteger &x) const {
if (x.isZero()) throw "BigInteger::operator /: division by zero";
BigInteger q, r;
r = *this;
r.divideWithRemainder(x, q);
return q;
}
inline BigInteger BigInteger::operator %(const BigInteger &x) const {
if (x.isZero()) throw "BigInteger::operator %: division by zero";
BigInteger q, r;
r = *this;
r.divideWithRemainder(x, q);
return r;
}
inline BigInteger BigInteger::operator -() const {
BigInteger ans;
ans.negate(*this);
return ans;
}
/*
* ASSIGNMENT OPERATORS
*
* Now the responsibility for making a temporary copy if necessary
* belongs to the put-here operations. See Assignment Operators in
* BigUnsigned.hh.
*/
inline void BigInteger::operator +=(const BigInteger &x) {
add(*this, x);
}
inline void BigInteger::operator -=(const BigInteger &x) {
subtract(*this, x);
}
inline void BigInteger::operator *=(const BigInteger &x) {
multiply(*this, x);
}
inline void BigInteger::operator /=(const BigInteger &x) {
if (x.isZero()) throw "BigInteger::operator /=: division by zero";
/* The following technique is slightly faster than copying *this first
* when x is large. */
BigInteger q;
divideWithRemainder(x, q);
// *this contains the remainder, but we overwrite it with the quotient.
*this = q;
}
inline void BigInteger::operator %=(const BigInteger &x) {
if (x.isZero()) throw "BigInteger::operator %=: division by zero";
BigInteger q;
// Mods *this by x. Don't care about quotient left in q.
divideWithRemainder(x, q);
}
// This one is trivial
inline void BigInteger::flipSign() {
sign = Sign(-sign);
}
inline BigInteger BigInteger::operator <<(int b) const {
BigInteger ans;
ans.bitShiftLeft(*this, b);
return ans;
}
inline BigInteger BigInteger::operator >>(int b) const {
BigInteger ans;
ans.bitShiftRight(*this, b);
return ans;
}
inline BigInteger BigInteger::operator ^(const BigInteger &x) const {
BigInteger ans;
ans.bitXor(*this, x);
return ans;
}
inline void BigInteger::operator ^=(const BigInteger &x)
{
bitXor(*this, x);
}
inline void BigInteger::operator <<=(int b) {
bitShiftLeft(*this, b);
}
inline void BigInteger::operator >>=(int b) {
bitShiftRight(*this, b);
}
#endif

View File

@ -0,0 +1,70 @@
#include "BigIntegerAlgorithms.hh"
BigUnsigned gcd(BigUnsigned a, BigUnsigned b) {
BigUnsigned trash;
// Neat in-place alternating technique.
for (;;) {
if (b.isZero())
return a;
a.divideWithRemainder(b, trash);
if (a.isZero())
return b;
b.divideWithRemainder(a, trash);
}
}
void extendedEuclidean(BigInteger m, BigInteger n,
BigInteger &g, BigInteger &r, BigInteger &s) {
if (&g == &r || &g == &s || &r == &s)
throw "BigInteger extendedEuclidean: Outputs are aliased";
BigInteger r1(1), s1(0), r2(0), s2(1), q;
/* Invariants:
* r1*m(orig) + s1*n(orig) == m(current)
* r2*m(orig) + s2*n(orig) == n(current) */
for (;;) {
if (n.isZero()) {
r = r1; s = s1; g = m;
return;
}
// Subtract q times the second invariant from the first invariant.
m.divideWithRemainder(n, q);
r1 -= q*r2; s1 -= q*s2;
if (m.isZero()) {
r = r2; s = s2; g = n;
return;
}
// Subtract q times the first invariant from the second invariant.
n.divideWithRemainder(m, q);
r2 -= q*r1; s2 -= q*s1;
}
}
BigUnsigned modinv(const BigInteger &x, const BigUnsigned &n) {
BigInteger g, r, s;
extendedEuclidean(x, n, g, r, s);
if (g == 1)
// r*x + s*n == 1, so r*x === 1 (mod n), so r is the answer.
return (r % n).getMagnitude(); // (r % n) will be nonnegative
else
throw "BigInteger modinv: x and n have a common factor";
}
BigUnsigned modexp(const BigInteger &base, const BigUnsigned &exponent,
const BigUnsigned &modulus) {
BigUnsigned ans = 1, base2 = (base % modulus).getMagnitude();
BigUnsigned::Index i = exponent.bitLength();
// For each bit of the exponent, most to least significant...
while (i > 0) {
i--;
// Square.
ans *= ans;
ans %= modulus;
// And multiply if the bit is a 1.
if (exponent.getBit(i)) {
ans *= base2;
ans %= modulus;
}
}
return ans;
}
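A quick sketch exercising the three routines above:
#include "BigIntegerAlgorithms.hh"
#include <cassert>
void checkAlgorithms()
{
    assert(gcd(BigUnsigned(12), BigUnsigned(18)) == BigUnsigned(6));
    assert(modinv(BigInteger(3), BigUnsigned(10)) == BigUnsigned(7));                     // 3 * 7 == 21 == 1 (mod 10)
    assert(modexp(BigInteger(4), BigUnsigned(13), BigUnsigned(497)) == BigUnsigned(445)); // 4^13 mod 497
}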

View File

@ -0,0 +1,25 @@
#ifndef BIGINTEGERALGORITHMS_H
#define BIGINTEGERALGORITHMS_H
#include "BigInteger.hh"
/* Some mathematical algorithms for big integers.
* This code is new and, as such, experimental. */
// Returns the greatest common divisor of a and b.
BigUnsigned gcd(BigUnsigned a, BigUnsigned b);
/* Extended Euclidean algorithm.
* Given m and n, finds gcd g and numbers r, s such that r*m + s*n == g. */
void extendedEuclidean(BigInteger m, BigInteger n,
BigInteger &g, BigInteger &r, BigInteger &s);
/* Returns the multiplicative inverse of x modulo n, or throws an exception if
* they have a common factor. */
BigUnsigned modinv(const BigInteger &x, const BigUnsigned &n);
// Returns (base ^ exponent) % modulus.
BigUnsigned modexp(const BigInteger &base, const BigUnsigned &exponent,
const BigUnsigned &modulus);
#endif

View File

@ -0,0 +1,8 @@
// This header file includes all of the library header files.
#include "NumberlikeArray.hh"
#include "BigUnsigned.hh"
#include "BigInteger.hh"
#include "BigIntegerAlgorithms.hh"
#include "BigUnsignedInABase.hh"
#include "BigIntegerUtils.hh"

View File

@ -0,0 +1,93 @@
#include "BigIntegerUtils.hh"
#include "BigUnsignedInABase.hh"
#include <iostream>
std::string bigUnsignedToString(const BigUnsigned &x) {
return std::string(BigUnsignedInABase(x, 10));
}
std::string bigIntegerToString(const BigInteger &x) {
return (x.getSign() == BigInteger::negative)
? (std::string("-") + bigUnsignedToString(x.getMagnitude()))
: (bigUnsignedToString(x.getMagnitude()));
}
BigUnsigned stringToBigUnsigned(const char *s, const size_t & length)
{
return BigUnsigned(BigUnsignedInABase(s, length, 10));
}
BigUnsigned stringToBigUnsigned(const std::string &s) {
return stringToBigUnsigned(s.c_str(), s.length());
}
BigInteger stringToBigInteger(const std::string &s) {
// Recognize a sign followed by a BigUnsigned.
return (s[0] == '-') ? BigInteger(stringToBigUnsigned(s.substr(1, s.length() - 1)), BigInteger::negative)
: (s[0] == '+') ? BigInteger(stringToBigUnsigned(s.substr(1, s.length() - 1)))
: BigInteger(stringToBigUnsigned(s));
}
BigInteger stringToBigInteger(const char *s, const size_t & length) {
// Recognize a sign followed by a BigUnsigned.
const char *data = s;
if (length > 0)
{
if (*data == '-')
return BigInteger(stringToBigUnsigned(++data, length - 1), BigInteger::negative);
else if (*data == '+')
return BigInteger(stringToBigUnsigned(++data, length - 1));
}
return BigInteger(stringToBigUnsigned(data, length));
}
// Returns true if `s' (with an optional leading sign) consists solely of decimal digits.
bool isDigit(const char *s, const size_t & length)
{
if (length == 0)
return false;
const char *data = s;
size_t remaining_length = length;
if (*data == '-' || *data == '+')
{
++data;
--remaining_length;
if (remaining_length == 0)
return false;
}
while (remaining_length > 0)
{
if (*data < '0' || *data > '9')
return false;
++data;
--remaining_length;
}
return true;
}
std::ostream &operator <<(std::ostream &os, const BigUnsigned &x) {
BigUnsignedInABase::Base base;
long osFlags = os.flags();
if (osFlags & os.dec)
base = 10;
else if (osFlags & os.hex) {
base = 16;
if (osFlags & os.showbase)
os << "0x";
} else if (osFlags & os.oct) {
base = 8;
if (osFlags & os.showbase)
os << '0';
} else
throw "std::ostream << BigUnsigned: Could not determine the desired base from output-stream flags";
std::string s = std::string(BigUnsignedInABase(x, base));
os << s;
return os;
}
std::ostream &operator <<(std::ostream &os, const BigInteger &x) {
if (x.getSign() == BigInteger::negative)
os << '-';
os << x.getMagnitude();
return os;
}
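Round-trip and stream-formatting sketch for the helpers above (digit case follows BigUnsignedInABase):
#include "BigIntegerUtils.hh"
#include <cassert>
#include <sstream>
void checkStringConversions()
{
    BigInteger x = stringToBigInteger("-255");
    assert(bigIntegerToString(x) == "-255");
    std::ostringstream os;
    os << std::hex << std::showbase << x; // the sign comes before the base indicator
    assert(os.str() == "-0xFF");
}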

View File

@ -0,0 +1,74 @@
#ifndef BIGINTEGERUTILS_H
#define BIGINTEGERUTILS_H
#include "BigInteger.hh"
#include <string>
/* This file provides:
* - Convenient std::string <-> BigUnsigned/BigInteger conversion routines
* - std::ostream << operators for BigUnsigned/BigInteger */
// std::string conversion routines. Base 10 only.
std::string bigUnsignedToString(const BigUnsigned &x);
std::string bigIntegerToString(const BigInteger &x);
BigUnsigned stringToBigUnsigned(const std::string &s);
BigInteger stringToBigInteger(const std::string &s);
BigUnsigned stringToBigUnsigned(const char *s, const size_t & length);
BigInteger stringToBigInteger(const char *s, const size_t & length);
bool isDigit(const char *s, const size_t & length);
// Creates a BigInteger from data such as `char's; read below for details.
template <class T>
BigInteger dataToBigInteger(const T* data, BigInteger::Index length, BigInteger::Sign sign);
// Outputs x to os, obeying the flags `dec', `hex', `oct', and `showbase'.
std::ostream &operator <<(std::ostream &os, const BigUnsigned &x);
// Outputs x to os, obeying the flags `dec', `hex', `oct', and `showbase'.
// My somewhat arbitrary policy: a negative sign comes before a base indicator (like -0xFF).
std::ostream &operator <<(std::ostream &os, const BigInteger &x);
// BEGIN TEMPLATE DEFINITIONS.
/*
* Converts binary data to a BigInteger.
* Pass an array `data', its length, and the desired sign.
*
* Elements of `data' may be of any type `T' that has the following
* two properties (this includes almost all integral types):
*
* (1) `sizeof(T)' correctly gives the amount of binary data in one
* value of `T' and is a factor of `sizeof(Blk)'.
*
* (2) When a value of `T' is casted to a `Blk', the low bytes of
* the result contain the desired binary data.
*/
template <class T>
BigInteger dataToBigInteger(const T* data, BigInteger::Index length, BigInteger::Sign sign) {
// really ceiling(numBytes / sizeof(BigInteger::Blk))
unsigned int pieceSizeInBits = 8 * sizeof(T);
unsigned int piecesPerBlock = sizeof(BigInteger::Blk) / sizeof(T);
unsigned int numBlocks = (length + piecesPerBlock - 1) / piecesPerBlock;
// Allocate our block array
BigInteger::Blk *blocks = new BigInteger::Blk[numBlocks];
BigInteger::Index blockNum, pieceNum, pieceNumHere;
// Convert
for (blockNum = 0, pieceNum = 0; blockNum < numBlocks; blockNum++) {
BigInteger::Blk curBlock = 0;
for (pieceNumHere = 0; pieceNumHere < piecesPerBlock && pieceNum < length;
pieceNumHere++, pieceNum++)
curBlock |= (BigInteger::Blk(data[pieceNum]) << (pieceSizeInBits * pieceNumHere));
blocks[blockNum] = curBlock;
}
// Create the BigInteger.
BigInteger x(blocks, numBlocks, sign);
delete [] blocks;
return x;
}
#endif
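An illustrative call to dataToBigInteger, assuming 8-bit pieces packed least-significant-first as the comment above describes:
#include "BigIntegerUtils.hh"
#include <cassert>
void checkDataToBigInteger()
{
    unsigned char bytes[] = {0x01, 0x02}; // data[0] is the least significant piece
    BigInteger x = dataToBigInteger(bytes, 2, BigInteger::positive);
    assert(x == BigInteger(0x0201));      // == 513
}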

View File

@ -0,0 +1,710 @@
#include "BigUnsigned.hh"
#include "BigIntegerUtils.hh"
// Memory management definitions have moved to the bottom of NumberlikeArray.hh.
// The templates used by these constructors and converters are at the bottom of
// BigUnsigned.hh.
BigUnsigned::BigUnsigned(unsigned long x) { initFromPrimitive (x); }
BigUnsigned::BigUnsigned(unsigned int x) { initFromPrimitive (x); }
BigUnsigned::BigUnsigned(unsigned short x) { initFromPrimitive (x); }
BigUnsigned::BigUnsigned( long x) { initFromSignedPrimitive(x); }
BigUnsigned::BigUnsigned( int x) { initFromSignedPrimitive(x); }
BigUnsigned::BigUnsigned( short x) { initFromSignedPrimitive(x); }
unsigned long BigUnsigned::toUnsignedLong () const { return convertToPrimitive <unsigned long >(); }
unsigned int BigUnsigned::toUnsignedInt () const { return convertToPrimitive <unsigned int >(); }
unsigned short BigUnsigned::toUnsignedShort() const { return convertToPrimitive <unsigned short>(); }
long BigUnsigned::toLong () const { return convertToSignedPrimitive< long >(); }
int BigUnsigned::toInt () const { return convertToSignedPrimitive< int >(); }
short BigUnsigned::toShort () const { return convertToSignedPrimitive< short>(); }
// BIT/BLOCK ACCESSORS
void BigUnsigned::setBlock(Index i, Blk newBlock) {
if (newBlock == 0) {
if (i < len) {
blk[i] = 0;
zapLeadingZeros();
}
// If i >= len, no effect.
} else {
if (i >= len) {
// The nonzero block extends the number.
allocateAndCopy(i+1);
// Zero any added blocks that we aren't setting.
for (Index j = len; j < i; j++)
blk[j] = 0;
len = i+1;
}
blk[i] = newBlock;
}
}
/* Evidently the compiler wants BigUnsigned:: on the return type because, at
* that point, it hasn't yet parsed the BigUnsigned:: on the name to get the
* proper scope. */
BigUnsigned::Index BigUnsigned::bitLength() const {
if (isZero())
return 0;
else {
Blk leftmostBlock = getBlock(len - 1);
Index leftmostBlockLen = 0;
while (leftmostBlock != 0) {
leftmostBlock >>= 1;
leftmostBlockLen++;
}
return leftmostBlockLen + (len - 1) * N;
}
}
void BigUnsigned::setBit(Index bi, bool newBit) {
Index blockI = bi / N;
Blk block = getBlock(blockI), mask = Blk(1) << (bi % N);
block = newBit ? (block | mask) : (block & ~mask);
setBlock(blockI, block);
}
// COMPARISON
BigUnsigned::CmpRes BigUnsigned::compareTo(const BigUnsigned &x) const {
// A bigger length implies a bigger number.
if (len < x.len)
return less;
else if (len > x.len)
return greater;
else {
// Compare blocks one by one from left to right.
Index i = len;
while (i > 0) {
i--;
if (blk[i] == x.blk[i])
continue;
else if (blk[i] > x.blk[i])
return greater;
else
return less;
}
// If no blocks differed, the numbers are equal.
return equal;
}
}
// COPY-LESS OPERATIONS
/*
* On most calls to copy-less operations, it's safe to read the inputs little by
* little and write the outputs little by little. However, if one of the
* inputs is coming from the same variable into which the output is to be
* stored (an "aliased" call), we risk overwriting the input before we read it.
* In this case, we first compute the result into a temporary BigUnsigned
* variable and then copy it into the requested output variable *this.
* Each put-here operation uses the DTRT_ALIASED macro (Do The Right Thing on
* aliased calls) to generate code for this check.
*
* I adopted this approach on 2007.02.13 (see Assignment Operators in
* BigUnsigned.hh). Before then, put-here operations rejected aliased calls
* with an exception. I think doing the right thing is better.
*
* Some of the put-here operations can probably handle aliased calls safely
* without the extra copy because (for example) they process blocks strictly
* right-to-left. At some point I might determine which ones don't need the
* copy, but my reasoning would need to be verified very carefully. For now
* I'll leave in the copy.
*/
#define DTRT_ALIASED(cond, op) \
if (cond) { \
BigUnsigned tmpThis; \
tmpThis.op; \
*this = tmpThis; \
return; \
}
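What the macro above buys: an aliased call routes through a temporary, so in-place updates stay safe. A sketch:
#include "BigUnsigned.hh"
#include <cassert>
void checkAliasedCall()
{
    BigUnsigned x(2u), y(3u);
    x.add(x, y);                  // output aliases an input: DTRT_ALIASED copies first
    assert(x == BigUnsigned(5u));
}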
void BigUnsigned::add(const BigUnsigned &a, const BigUnsigned &b) {
DTRT_ALIASED(this == &a || this == &b, add(a, b))
// If one argument is zero, copy the other.
if (a.len == 0) {
operator =(b);
return;
} else if (b.len == 0) {
operator =(a);
return;
}
// Some variables...
// Carries in and out of an addition stage
bool carryIn, carryOut;
Blk temp;
Index i;
// a2 points to the longer input, b2 points to the shorter
const BigUnsigned *a2, *b2;
if (a.len >= b.len) {
a2 = &a;
b2 = &b;
} else {
a2 = &b;
b2 = &a;
}
// Set preliminary length and make room in this BigUnsigned
len = a2->len + 1;
allocate(len);
// For each block index that is present in both inputs...
for (i = 0, carryIn = false; i < b2->len; i++) {
// Add input blocks
temp = a2->blk[i] + b2->blk[i];
// If a rollover occurred, the result is less than either input.
// This test is used many times in the BigUnsigned code.
carryOut = (temp < a2->blk[i]);
// If a carry was input, handle it
if (carryIn) {
temp++;
carryOut |= (temp == 0);
}
blk[i] = temp; // Save the addition result
carryIn = carryOut; // Pass the carry along
}
// If there is a carry left over, increase blocks until
// one does not roll over.
for (; i < a2->len && carryIn; i++) {
temp = a2->blk[i] + 1;
carryIn = (temp == 0);
blk[i] = temp;
}
// If the carry was resolved but the larger number
// still has blocks, copy them over.
for (; i < a2->len; i++)
blk[i] = a2->blk[i];
// Set the extra block if there's still a carry, decrease length otherwise
if (carryIn)
blk[i] = 1;
else
len--;
}
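The rollover test used throughout this file ("the result is less than either input"), reduced to a fixed-width sketch:
#include <cstdint>
#include <cassert>
void checkCarryDetection()
{
    uint64_t a = UINT64_MAX, b = 1;
    uint64_t sum = a + b;   // unsigned addition wraps to 0
    bool carry = (sum < a); // true exactly when the addition overflowed
    assert(carry && sum == 0);
}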
void BigUnsigned::subtract(const BigUnsigned &a, const BigUnsigned &b) {
DTRT_ALIASED(this == &a || this == &b, subtract(a, b))
if (b.len == 0) {
// If b is zero, copy a.
operator =(a);
return;
} else if (a.len < b.len)
// If a is shorter than b, the result is negative.
throw "BigUnsigned::subtract: "
"Negative result in unsigned calculation";
// Some variables...
bool borrowIn, borrowOut;
Blk temp;
Index i;
// Set preliminary length and make room
len = a.len;
allocate(len);
// For each block index that is present in both inputs...
for (i = 0, borrowIn = false; i < b.len; i++) {
temp = a.blk[i] - b.blk[i];
// If a reverse rollover occurred,
// the result is greater than the block from a.
borrowOut = (temp > a.blk[i]);
// Handle an incoming borrow
if (borrowIn) {
borrowOut |= (temp == 0);
temp--;
}
blk[i] = temp; // Save the subtraction result
borrowIn = borrowOut; // Pass the borrow along
}
// If there is a borrow left over, decrease blocks until
// one does not reverse rollover.
for (; i < a.len && borrowIn; i++) {
borrowIn = (a.blk[i] == 0);
blk[i] = a.blk[i] - 1;
}
/* If there's still a borrow, the result is negative.
* Throw an exception, but zero out this object so as to leave it in a
* predictable state. */
if (borrowIn) {
len = 0;
throw "BigUnsigned::subtract: Negative result in unsigned calculation";
} else
// Copy over the rest of the blocks
for (; i < a.len; i++)
blk[i] = a.blk[i];
// Zap leading zeros
zapLeadingZeros();
}
/*
* About the multiplication and division algorithms:
*
* I searched unsuccessfully for fast C++ built-in operations like the `b_0'
* and `c_0' Knuth describes in Section 4.3.1 of ``The Art of Computer
* Programming'' (replace `place' by `Blk'):
*
* ``b_0[:] multiplication of a one-place integer by another one-place
* integer, giving a two-place answer;
*
* ``c_0[:] division of a two-place integer by a one-place integer,
* provided that the quotient is a one-place integer, and yielding
* also a one-place remainder.''
*
* I also missed his note that ``[b]y adjusting the word size, if
* necessary, nearly all computers will have these three operations
* available'', so I gave up on trying to use algorithms similar to his.
* A future version of the library might include such algorithms; I
* would welcome contributions from others for this.
*
* I eventually decided to use bit-shifting algorithms. To multiply `a'
* and `b', we zero out the result. Then, for each `1' bit in `a', we
* shift `b' left the appropriate amount and add it to the result.
* Similarly, to divide `a' by `b', we shift `b' left varying amounts,
* repeatedly trying to subtract it from `a'. When we succeed, we note
* the fact by setting a bit in the quotient. While these algorithms
* have the same O(n^2) time complexity as Knuth's, the ``constant factor''
* is likely to be larger.
*
* Because I used these algorithms, which require single-block addition
* and subtraction rather than single-block multiplication and division,
* the innermost loops of all four routines are very similar. Study one
* of them and all will become clear.
*/
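The bit-shifting scheme described above, reduced to fixed-width integers (a sketch; the real routines below work block-by-block via getShiftedBlock):
#include <cstdint>
uint64_t shiftAndAddMultiply(uint64_t a, uint64_t b)
{
    uint64_t result = 0;
    for (unsigned i = 0; i < 64; ++i)
        if (a & (uint64_t(1) << i))
            result += b << i; // for each 1-bit of a, add a shifted copy of b
    return result;
}
// shiftAndAddMultiply(6, 7) == 42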
/*
* This is a little inline function used by both the multiplication
* routine and the division routine.
*
* `getShiftedBlock' returns the `x'th block of `num << y'.
* `y' may be anything from 0 to N - 1, and `x' may be anything from
* 0 to `num.len'.
*
* Two things contribute to this block:
*
* (1) The `N - y' low bits of `num.blk[x]', shifted `y' bits left.
*
* (2) The `y' high bits of `num.blk[x-1]', shifted `N - y' bits right.
*
* But we must be careful if `x == 0' or `x == num.len', in
* which case we should use 0 instead of (2) or (1), respectively.
*
* If `y == 0', then (2) contributes 0, as it should. However,
* in some computer environments, for a reason I cannot understand,
* `a >> b' means `a >> (b % N)'. This means `num.blk[x-1] >> (N - y)'
* will return `num.blk[x-1]' instead of the desired 0 when `y == 0';
* the test `y == 0' handles this case specially.
*/
inline BigUnsigned::Blk getShiftedBlock(const BigUnsigned &num,
BigUnsigned::Index x, unsigned int y) {
BigUnsigned::Blk part1 = (x == 0 || y == 0) ? 0 : (num.blk[x - 1] >> (BigUnsigned::N - y));
BigUnsigned::Blk part2 = (x == num.len) ? 0 : (num.blk[x] << y);
return part1 | part2;
}
void BigUnsigned::multiply(const BigUnsigned &a, const BigUnsigned &b) {
DTRT_ALIASED(this == &a || this == &b, multiply(a, b))
// If either a or b is zero, set to zero.
if (a.len == 0 || b.len == 0) {
len = 0;
return;
}
/*
* Overall method:
*
* Set this = 0.
* For each 1-bit of `a' (say the `i2'th bit of block `i'):
* Add `b << (i blocks and i2 bits)' to *this.
*/
// Variables for the calculation
Index i, j, k;
unsigned int i2;
Blk temp;
bool carryIn, carryOut;
// Set preliminary length and make room
len = a.len + b.len;
allocate(len);
// Zero out this object
for (i = 0; i < len; i++)
blk[i] = 0;
// For each block of the first number...
for (i = 0; i < a.len; i++) {
// For each 1-bit of that block...
for (i2 = 0; i2 < N; i2++) {
if ((a.blk[i] & (Blk(1) << i2)) == 0)
continue;
/*
* Add b to this, shifted left i blocks and i2 bits.
* j is the index in b, and k = i + j is the index in this.
*
* `getShiftedBlock', a short inline function defined above,
* is now used for the bit handling. It replaces the more
* complex `bHigh' code, in which each run of the loop dealt
* immediately with the low bits and saved the high bits to
* be picked up next time. The last run of the loop used to
* leave leftover high bits, which were handled separately.
* Instead, this loop runs an additional time with j == b.len.
* These changes were made on 2005.01.11.
*/
for (j = 0, k = i, carryIn = false; j <= b.len; j++, k++) {
/*
* The body of this loop is very similar to the body of the first loop
* in `add', except that this loop does a `+=' instead of a `+'.
*/
temp = blk[k] + getShiftedBlock(b, j, i2);
carryOut = (temp < blk[k]);
if (carryIn) {
temp++;
carryOut |= (temp == 0);
}
blk[k] = temp;
carryIn = carryOut;
}
// No more extra iteration to deal with `bHigh'.
// Roll-over a carry as necessary.
for (; carryIn; k++) {
blk[k]++;
carryIn = (blk[k] == 0);
}
}
}
// Zap possible leading zero
if (blk[len - 1] == 0)
len--;
}
/*
* DIVISION WITH REMAINDER
* This monstrous function mods *this by the given divisor b while storing the
* quotient in the given object q; at the end, *this contains the remainder.
* The seemingly bizarre pattern of inputs and outputs was chosen so that the
* function copies as little as possible (since it is implemented by repeated
* subtraction of multiples of b from *this).
*
* "modWithQuotient" might be a better name for this function, but I would
* rather not change the name now.
*/
void BigUnsigned::divideWithRemainder(const BigUnsigned &b, BigUnsigned &q) {
/* Defending against aliased calls is more complex than usual because we
* are writing to both *this and q.
*
* It would be silly to try to write quotient and remainder to the
* same variable. Rule that out right away. */
if (this == &q)
throw "BigUnsigned::divideWithRemainder: Cannot write quotient and remainder into the same variable";
/* Now *this and q are separate, so the only concern is that b might be
* aliased to one of them. If so, use a temporary copy of b. */
if (this == &b || &q == &b) {
BigUnsigned tmpB(b);
divideWithRemainder(tmpB, q);
return;
}
/*
* Knuth's definition of mod (which this function uses) is somewhat
* different from the C++ definition of % in case of division by 0.
*
* We let a / 0 == 0 (it doesn't matter much) and a % 0 == a, no
* exceptions thrown. This allows us to preserve both Knuth's demand
* that a mod 0 == a and the useful property that
* (a / b) * b + (a % b) == a.
*/
if (b.len == 0) {
q.len = 0;
return;
}
/*
* If *this.len < b.len, then *this < b, and we can be sure that b doesn't go into
* *this at all. The quotient is 0 and *this is already the remainder (so leave it alone).
*/
if (len < b.len) {
q.len = 0;
return;
}
// At this point we know (*this).len >= b.len > 0. (Whew!)
/*
* Overall method:
*
* For each appropriate i and i2, decreasing:
* Subtract (b << (i blocks and i2 bits)) from *this, storing the
* result in subtractBuf.
* If the subtraction succeeds with a nonnegative result:
* Turn on bit i2 of block i of the quotient q.
* Copy subtractBuf back into *this.
* Otherwise bit i2 of block i remains off, and *this is unchanged.
*
* Eventually q will contain the entire quotient, and *this will
* be left with the remainder.
*
* subtractBuf[x] corresponds to blk[x], not blk[x+i], since 2005.01.11.
* But on a single iteration, we don't touch the i lowest blocks of blk
* (and don't use those of subtractBuf) because these blocks are
* unaffected by the subtraction: we are subtracting
* (b << (i blocks and i2 bits)), which ends in at least `i' zero
* blocks. */
// Variables for the calculation
Index i, j, k;
unsigned int i2;
Blk temp;
bool borrowIn, borrowOut;
/*
* Make sure we have an extra zero block just past the value.
*
* When we attempt a subtraction, we might shift `b' so
* its first block begins a few bits left of the dividend,
* and then we'll try to compare these extra bits with
* a nonexistent block to the left of the dividend. The
* extra zero block ensures sensible behavior; we need
* an extra block in `subtractBuf' for exactly the same reason.
*/
Index origLen = len; // Save real length.
/* To avoid an out-of-bounds access in case of reallocation, allocate
* first and then increment the logical length. */
allocateAndCopy(len + 1);
len++;
blk[origLen] = 0; // Zero the added block.
// subtractBuf holds part of the result of a subtraction; see above.
Blk *subtractBuf = new Blk[len];
// Set preliminary length for quotient and make room
q.len = origLen - b.len + 1;
q.allocate(q.len);
// Zero out the quotient
for (i = 0; i < q.len; i++)
q.blk[i] = 0;
// For each possible left-shift of b in blocks...
i = q.len;
while (i > 0) {
i--;
// For each possible left-shift of b in bits...
// (Remember, N is the number of bits in a Blk.)
q.blk[i] = 0;
i2 = N;
while (i2 > 0) {
i2--;
/*
* Subtract b, shifted left i blocks and i2 bits, from *this,
* and store the answer in subtractBuf. In the for loop, `k == i + j'.
*
* Compare this to the middle section of `multiply'. They
* are in many ways analogous. See especially the discussion
* of `getShiftedBlock'.
*/
for (j = 0, k = i, borrowIn = false; j <= b.len; j++, k++) {
temp = blk[k] - getShiftedBlock(b, j, i2);
borrowOut = (temp > blk[k]);
if (borrowIn) {
borrowOut |= (temp == 0);
temp--;
}
// Since 2005.01.11, indices of `subtractBuf' directly match those of `blk', so use `k'.
subtractBuf[k] = temp;
borrowIn = borrowOut;
}
// No more extra iteration to deal with `bHigh'.
// Roll-over a borrow as necessary.
for (; k < origLen && borrowIn; k++) {
borrowIn = (blk[k] == 0);
subtractBuf[k] = blk[k] - 1;
}
/*
* If the subtraction was performed successfully (!borrowIn),
* set bit i2 in block i of the quotient.
*
* Then, copy the portion of subtractBuf filled by the subtraction
* back to *this. This portion starts with block i and ends--
* where? Not necessarily at block `i + b.len'! Well, we
* increased k every time we saved a block into subtractBuf, so
* the region of subtractBuf we copy is just [i, k).
*/
if (!borrowIn) {
q.blk[i] |= (Blk(1) << i2);
while (k > i) {
k--;
blk[k] = subtractBuf[k];
}
}
}
}
// Zap possible leading zero in quotient
if (q.blk[q.len - 1] == 0)
q.len--;
// Zap any/all leading zeros in remainder
zapLeadingZeros();
// Deallocate subtractBuf.
// (Thanks to Brad Spencer for noticing my accidental omission of this!)
delete [] subtractBuf;
}
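A sketch of the division-by-zero convention stated above (a / 0 == 0 and a % 0 == a), calling divideWithRemainder directly since the wrapper operators throw on zero divisors:
#include "BigUnsigned.hh"
#include <cassert>
void checkKnuthDivision()
{
    BigUnsigned a(42u), zero, q;
    BigUnsigned r = a;
    r.divideWithRemainder(zero, q); // no exception is thrown
    assert(q.isZero());             // quotient is 0
    assert(r == BigUnsigned(42u));  // remainder is the dividend, unchanged
}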
/* BITWISE OPERATORS
* These are straightforward blockwise operations except that they differ in
* the output length and the necessity of zapLeadingZeros. */
void BigUnsigned::bitAnd(const BigUnsigned &a, const BigUnsigned &b, bool zap_leading_zeros) {
DTRT_ALIASED(this == &a || this == &b, bitAnd(a, b))
// The bitwise & can't be longer than either operand.
len = (a.len >= b.len) ? b.len : a.len;
allocate(len);
Index i;
for (i = 0; i < len; i++)
blk[i] = a.blk[i] & b.blk[i];
if (zap_leading_zeros)
zapLeadingZeros();
}
void BigUnsigned::bitOr(const BigUnsigned &a, const BigUnsigned &b) {
DTRT_ALIASED(this == &a || this == &b, bitOr(a, b))
Index i;
const BigUnsigned *a2, *b2;
if (a.len >= b.len) {
a2 = &a;
b2 = &b;
} else {
a2 = &b;
b2 = &a;
}
allocate(a2->len);
for (i = 0; i < b2->len; i++)
blk[i] = a2->blk[i] | b2->blk[i];
for (; i < a2->len; i++)
blk[i] = a2->blk[i];
len = a2->len;
// Doesn't need zapLeadingZeros.
}
void BigUnsigned::bitXor(const BigUnsigned &a, const BigUnsigned &b, bool zap_leading_zeros) {
DTRT_ALIASED(this == &a || this == &b, bitXor(a, b))
Index i;
const BigUnsigned *a2, *b2;
if (a.len >= b.len) {
a2 = &a;
b2 = &b;
} else {
a2 = &b;
b2 = &a;
}
allocate(a2->len);
for (i = 0; i < b2->len; i++)
blk[i] = a2->blk[i] ^ b2->blk[i];
for (; i < a2->len; i++)
blk[i] = a2->blk[i];
len = a2->len;
if (zap_leading_zeros)
zapLeadingZeros();
}
void BigUnsigned::bitNeg(const BigUnsigned &a, bool zap_leading_zeros) {
len = a.len;
allocate(len);
Index i;
for (i = 0; i < a.len; i++)
blk[i] = ~a.blk[i];
if (zap_leading_zeros)
zapLeadingZeros();
}
void BigUnsigned::bitShiftLeft(const BigUnsigned &a, int b) {
DTRT_ALIASED(this == &a, bitShiftLeft(a, b))
if (b < 0) {
if (b << 1 == 0)
throw "BigUnsigned::bitShiftLeft: "
"Pathological shift amount not implemented";
else {
bitShiftRight(a, -b);
return;
}
}
Index shiftBlocks = b / N;
unsigned int shiftBits = b % N;
// + 1: room for high bits nudged left into another block
len = a.len + shiftBlocks + 1;
allocate(len);
Index i, j;
for (i = 0; i < shiftBlocks; i++)
blk[i] = 0;
for (j = 0, i = shiftBlocks; j <= a.len; j++, i++)
blk[i] = getShiftedBlock(a, j, shiftBits);
// Zap possible leading zero
if (blk[len - 1] == 0)
len--;
}
void BigUnsigned::bitShiftRight(const BigUnsigned &a, int b) {
DTRT_ALIASED(this == &a, bitShiftRight(a, b))
if (b < 0) {
// Same pathological-amount check as in bitShiftLeft.
if (b == INT_MIN)
throw "BigUnsigned::bitShiftRight: "
"Pathological shift amount not implemented";
else {
bitShiftLeft(a, -b);
return;
}
}
// This calculation is wacky, but expressing the shift as a left bit shift
// within each block lets us use getShiftedBlock.
Index rightShiftBlocks = (b + N - 1) / N;
unsigned int leftShiftBits = N * rightShiftBlocks - b;
// Now (N * rightShiftBlocks - leftShiftBits) == b
// and 0 <= leftShiftBits < N.
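// Worked example (illustrative): with N == 32 and b == 40, rightShiftBlocks
// == (40 + 31) / 32 == 2 and leftShiftBits == 64 - 40 == 24, so each output
// block combines bits from two adjacent input blocks via getShiftedBlock.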
if (rightShiftBlocks >= a.len + 1) {
// All of a is guaranteed to be shifted off, even considering the left
// bit shift.
len = 0;
return;
}
// Now we're allocating a positive amount.
// + 1: room for high bits nudged left into another block
len = a.len + 1 - rightShiftBlocks;
allocate(len);
Index i, j;
for (j = rightShiftBlocks, i = 0; j <= a.len; j++, i++)
blk[i] = getShiftedBlock(a, j, leftShiftBits);
// Zap possible leading zero
if (blk[len - 1] == 0)
len--;
}
// INCREMENT/DECREMENT OPERATORS
// Prefix increment
void BigUnsigned::operator ++() {
Index i;
bool carry = true;
for (i = 0; i < len && carry; i++) {
blk[i]++;
carry = (blk[i] == 0);
}
if (carry) {
// Allocate and then increase length, as in divideWithRemainder
allocateAndCopy(len + 1);
len++;
blk[i] = 1;
}
}
// Postfix increment: same as prefix
void BigUnsigned::operator ++(int) {
operator ++();
}
// Prefix decrement
void BigUnsigned::operator --() {
if (len == 0)
throw "BigUnsigned::operator --(): Cannot decrement an unsigned zero";
Index i;
bool borrow = true;
for (i = 0; borrow; i++) {
borrow = (blk[i] == 0);
blk[i]--;
}
// Zap possible leading zero (there can only be one)
if (blk[len - 1] == 0)
len--;
}
// Postfix decrement: same as prefix
void BigUnsigned::operator --(int) {
operator --();
}

View File

@ -0,0 +1,454 @@
#ifndef BIGUNSIGNED_H
#define BIGUNSIGNED_H
#include "NumberlikeArray.hh"
#include <climits>
class BigInteger;
/* A BigUnsigned object represents a nonnegative integer of size limited only by
* available memory. BigUnsigneds support most mathematical operators and can
* be converted to and from most primitive integer types.
*
* The number is stored as a NumberlikeArray of unsigned longs as if it were
* written in base 256^sizeof(unsigned long). The least significant block is
* first, and the length is such that the most significant block is nonzero. */
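/* Storage example (illustrative): with 4-byte blocks (N == 32), the value
 * 2^40 + 3 is stored as len == 2, blk[0] == 3, blk[1] == 256, since
 * 2^40 == 256 * 2^32. */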
class BigUnsigned : protected NumberlikeArray<unsigned long> {
public:
// Enumeration for the result of a comparison.
enum CmpRes { less = -1, equal = 0, greater = 1 };
// BigUnsigneds are built with a Blk type of unsigned long.
typedef unsigned long Blk;
typedef NumberlikeArray<Blk>::Index Index;
using NumberlikeArray<Blk>::N;
friend class BigInteger;
protected:
// Creates a BigUnsigned with a capacity; for internal use.
BigUnsigned(int, Index c) : NumberlikeArray<Blk>(nullptr, c) {}
// Decreases len to eliminate any leading zero blocks.
void zapLeadingZeros() {
while (len > 0 && blk[len - 1] == 0)
len--;
}
void fillLeading(Index c, bool positive = true)
{
if (c > len)
{
unsigned long fillValue = positive ? 0 : ULONG_MAX;
allocateAndCopy(c);
for (Index i = len; i < c; ++i)
blk[i] = fillValue;
len = c;
}
}
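/* Illustration: fillLeading(3) on a one-block value appends two zero blocks,
 * while fillLeading(3, false) appends two all-ones (ULONG_MAX) blocks -- the
 * block-level analogue of sign extension, presumably for the two's-complement
 * style bit-operation behavior this fork adds (see the accompanying note
 * about matching java.math.BigInteger). */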
bool isValueZero() const
{
for (Index i = 0; i < len; ++i)
{
if (blk[i] != 0)
return false;
}
return true;
}
public:
// Constructs zero.
BigUnsigned() : NumberlikeArray<Blk>() {}
// Copy constructor
BigUnsigned(const BigUnsigned &x) : NumberlikeArray<Blk>(x) {}
// Assignment operator
void operator=(const BigUnsigned &x) {
NumberlikeArray<Blk>::operator =(x);
}
// Constructor that copies from a given array of blocks.
BigUnsigned(const Blk *b, Index blen) : NumberlikeArray<Blk>(b, blen) {
// Eliminate any leading zeros we may have been passed.
zapLeadingZeros();
}
// Destructor. NumberlikeArray does the delete for us.
~BigUnsigned() {}
// Constructors from primitive integer types
BigUnsigned(unsigned long x);
BigUnsigned( long x);
BigUnsigned(unsigned int x);
BigUnsigned( int x);
BigUnsigned(unsigned short x);
BigUnsigned( short x);
protected:
// Helpers
template <class X> void initFromPrimitive (X x);
template <class X> void initFromSignedPrimitive(X x);
public:
/* Converters to primitive integer types
* The implicit conversion operators caused trouble, so these are now
* named. */
unsigned long toUnsignedLong () const;
long toLong () const;
unsigned int toUnsignedInt () const;
int toInt () const;
unsigned short toUnsignedShort() const;
short toShort () const;
protected:
// Helpers
template <class X> X convertToSignedPrimitive() const;
template <class X> X convertToPrimitive () const;
public:
// BIT/BLOCK ACCESSORS
// Expose these from NumberlikeArray directly.
using NumberlikeArray<Blk>::getCapacity;
using NumberlikeArray<Blk>::getLength;
/* Returns the requested block, or 0 if it is beyond the length (as if
* the number had 0s infinitely to the left). */
Blk getBlock(Index i) const { return i >= len ? 0 : blk[i]; }
/* Sets the requested block. The number grows or shrinks as necessary. */
void setBlock(Index i, Blk newBlock);
// The number is zero if and only if the canonical length is zero.
bool isZero() const { return NumberlikeArray<Blk>::isEmpty(); }
void setZero() { clear(); }
/* Returns the length of the number in bits, i.e., zero if the number
* is zero and otherwise one more than the largest value of bi for
* which getBit(bi) returns true. */
Index bitLength() const;
/* Get the state of bit bi, which has value 2^bi. Bits beyond the
* number's length are considered to be 0. */
bool getBit(Index bi) const {
return (getBlock(bi / N) & (Blk(1) << (bi % N))) != 0;
}
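/* Example: for the value 5 (binary 101), getBit(0) and getBit(2) return
 * true, getBit(1) returns false, and every higher index returns false. */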
/* Sets the state of bit bi to newBit. The number grows or shrinks as
* necessary. */
void setBit(Index bi, bool newBit);
// COMPARISONS
// Compares this to x like Perl's <=>
CmpRes compareTo(const BigUnsigned &x) const;
// Ordinary comparison operators
bool operator ==(const BigUnsigned &x) const {
return NumberlikeArray<Blk>::operator ==(x);
}
bool operator !=(const BigUnsigned &x) const {
return NumberlikeArray<Blk>::operator !=(x);
}
bool operator < (const BigUnsigned &x) const { return compareTo(x) == less ; }
bool operator <=(const BigUnsigned &x) const { return compareTo(x) != greater; }
bool operator >=(const BigUnsigned &x) const { return compareTo(x) != less ; }
bool operator > (const BigUnsigned &x) const { return compareTo(x) == greater; }
/*
* BigUnsigned and BigInteger both provide three kinds of operators.
* Here ``big-integer'' refers to BigInteger or BigUnsigned.
*
* (1) Overloaded ``return-by-value'' operators:
* +, -, *, /, %, unary -, &, |, ^, <<, >>.
* Big-integer code using these operators looks identical to code using
* the primitive integer types. These operators take one or two
* big-integer inputs and return a big-integer result, which can then
* be assigned to a BigInteger variable or used in an expression.
* Example:
* BigInteger a(1), b = 1;
* BigInteger c = a + b;
*
* (2) Overloaded assignment operators:
* +=, -=, *=, /=, %=, flipSign, &=, |=, ^=, <<=, >>=, ++, --.
* Again, these are used on big integers just like on ints. They take
* one writable big integer that both provides an operand and receives a
* result. Most also take a second read-only operand.
* Example:
* BigInteger a(1), b(1);
* a += b;
*
* (3) Copy-less operations: `add', `subtract', etc.
* These named methods take operands as arguments and store the result
* in the receiver (*this), avoiding unnecessary copies and allocations.
* `divideWithRemainder' is special: it both takes the dividend from and
* stores the remainder into the receiver, and it takes a separate
* object in which to store the quotient. NOTE: If you are wondering
* why these don't return a value, you probably mean to use the
* overloaded return-by-value operators instead.
*
* Examples:
* BigInteger a(43), b(7), c, d;
*
* c = a + b; // Now c == 50.
* c.add(a, b); // Same effect but without the two copies.
*
* c.divideWithRemainder(b, d);
* // 50 / 7; now d == 7 (quotient) and c == 1 (remainder).
*
* // ``Aliased'' calls now do the right thing using a temporary
* // copy, but see note on `divideWithRemainder'.
* a.add(a, b);
*/
// COPY-LESS OPERATIONS
// These 8: Arguments are read-only operands, result is saved in *this.
void add(const BigUnsigned &a, const BigUnsigned &b);
void subtract(const BigUnsigned &a, const BigUnsigned &b);
void multiply(const BigUnsigned &a, const BigUnsigned &b);
void bitAnd(const BigUnsigned &a, const BigUnsigned &b, bool zap_leading_zeros = true);
void bitOr(const BigUnsigned &a, const BigUnsigned &b);
void bitXor(const BigUnsigned &a, const BigUnsigned &b, bool zap_leading_zeros = true);
void bitNeg(const BigUnsigned &a, bool zap_leading_zeros = true);
/* Negative shift amounts translate to opposite-direction shifts,
* except for -2^(8*sizeof(int)-1) which is unimplemented. */
void bitShiftLeft(const BigUnsigned &a, int b);
void bitShiftRight(const BigUnsigned &a, int b);
/* `a.divideWithRemainder(b, q)' is like `q = a / b, a %= b'.
* / and % use semantics similar to Knuth's, which differ from the
* primitive integer semantics under division by zero. See the
* implementation in BigUnsigned.cc for details.
* `a.divideWithRemainder(b, a)' throws an exception: it doesn't make
* sense to write quotient and remainder into the same variable. */
void divideWithRemainder(const BigUnsigned &b, BigUnsigned &q);
/* `divide' and `modulo' are no longer offered. Use
* `divideWithRemainder' instead. */
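/* Example: BigUnsigned a(50), b(7), q; a.divideWithRemainder(b, q);
 * afterwards q == 7 (quotient) and a == 1 (a received the remainder). */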
// OVERLOADED RETURN-BY-VALUE OPERATORS
BigUnsigned operator +(const BigUnsigned &x) const;
BigUnsigned operator -(const BigUnsigned &x) const;
BigUnsigned operator *(const BigUnsigned &x) const;
BigUnsigned operator /(const BigUnsigned &x) const;
BigUnsigned operator %(const BigUnsigned &x) const;
/* OK, maybe unary minus could succeed in one case, but it really
* shouldn't be used, so it isn't provided. */
BigUnsigned operator &(const BigUnsigned &x) const;
BigUnsigned operator |(const BigUnsigned &x) const;
BigUnsigned operator ^(const BigUnsigned &x) const;
BigUnsigned operator ~() const;
BigUnsigned operator <<(int b) const;
BigUnsigned operator >>(int b) const;
// OVERLOADED ASSIGNMENT OPERATORS
void operator +=(const BigUnsigned &x);
void operator -=(const BigUnsigned &x);
void operator *=(const BigUnsigned &x);
void operator /=(const BigUnsigned &x);
void operator %=(const BigUnsigned &x);
void operator &=(const BigUnsigned &x);
void operator |=(const BigUnsigned &x);
void operator ^=(const BigUnsigned &x);
void operator <<=(int b);
void operator >>=(int b);
/* INCREMENT/DECREMENT OPERATORS
* To discourage messy coding, these do not return *this, so prefix
* and postfix behave the same. */
void operator ++( );
void operator ++(int);
void operator --( );
void operator --(int);
// Helper function that needs access to BigUnsigned internals
friend Blk getShiftedBlock(const BigUnsigned &num, Index x,
unsigned int y);
// See BigInteger.cc.
template <class X>
friend X convertBigUnsignedToPrimitiveAccess(const BigUnsigned &a);
};
/* Implementing the return-by-value and assignment operators in terms of the
* copy-less operations. The copy-less operations are responsible for making
* any necessary temporary copies to work around aliasing. */
inline BigUnsigned BigUnsigned::operator +(const BigUnsigned &x) const {
BigUnsigned ans;
ans.add(*this, x);
return ans;
}
inline BigUnsigned BigUnsigned::operator -(const BigUnsigned &x) const {
BigUnsigned ans;
ans.subtract(*this, x);
return ans;
}
inline BigUnsigned BigUnsigned::operator *(const BigUnsigned &x) const {
BigUnsigned ans;
ans.multiply(*this, x);
return ans;
}
inline BigUnsigned BigUnsigned::operator /(const BigUnsigned &x) const {
if (x.isZero()) throw "BigUnsigned::operator /: division by zero";
BigUnsigned q, r;
r = *this;
r.divideWithRemainder(x, q);
return q;
}
inline BigUnsigned BigUnsigned::operator %(const BigUnsigned &x) const {
if (x.isZero()) throw "BigUnsigned::operator %: division by zero";
BigUnsigned q, r;
r = *this;
r.divideWithRemainder(x, q);
return r;
}
inline BigUnsigned BigUnsigned::operator &(const BigUnsigned &x) const {
BigUnsigned ans;
ans.bitAnd(*this, x);
return ans;
}
inline BigUnsigned BigUnsigned::operator |(const BigUnsigned &x) const {
BigUnsigned ans;
ans.bitOr(*this, x);
return ans;
}
inline BigUnsigned BigUnsigned::operator ^(const BigUnsigned &x) const {
BigUnsigned ans;
ans.bitXor(*this, x);
return ans;
}
inline BigUnsigned BigUnsigned::operator ~() const {
BigUnsigned ans;
ans.bitNeg(*this);
return ans;
}
inline BigUnsigned BigUnsigned::operator <<(int b) const {
BigUnsigned ans;
ans.bitShiftLeft(*this, b);
return ans;
}
inline BigUnsigned BigUnsigned::operator >>(int b) const {
BigUnsigned ans;
ans.bitShiftRight(*this, b);
return ans;
}
inline void BigUnsigned::operator +=(const BigUnsigned &x) {
add(*this, x);
}
inline void BigUnsigned::operator -=(const BigUnsigned &x) {
subtract(*this, x);
}
inline void BigUnsigned::operator *=(const BigUnsigned &x) {
multiply(*this, x);
}
inline void BigUnsigned::operator /=(const BigUnsigned &x) {
if (x.isZero()) throw "BigUnsigned::operator /=: division by zero";
/* The following technique is slightly faster than copying *this first
* when x is large. */
BigUnsigned q;
divideWithRemainder(x, q);
// *this contains the remainder, but we overwrite it with the quotient.
*this = q;
}
inline void BigUnsigned::operator %=(const BigUnsigned &x) {
if (x.isZero()) throw "BigUnsigned::operator %=: division by zero";
BigUnsigned q;
// Mods *this by x. Don't care about quotient left in q.
divideWithRemainder(x, q);
}
inline void BigUnsigned::operator &=(const BigUnsigned &x) {
bitAnd(*this, x);
}
inline void BigUnsigned::operator |=(const BigUnsigned &x) {
bitOr(*this, x);
}
inline void BigUnsigned::operator ^=(const BigUnsigned &x) {
bitXor(*this, x);
}
inline void BigUnsigned::operator <<=(int b) {
bitShiftLeft(*this, b);
}
inline void BigUnsigned::operator >>=(int b) {
bitShiftRight(*this, b);
}
/* Templates for conversions of BigUnsigned to and from primitive integers.
* BigInteger.cc needs to instantiate convertToPrimitive, and the uses in
* BigUnsigned.cc didn't do the trick; I think g++ inlined convertToPrimitive
* instead of generating linkable instantiations. So for consistency, I put
* all the templates here. */
// CONSTRUCTION FROM PRIMITIVE INTEGERS
/* Initialize this BigUnsigned from the given primitive integer. The same
* pattern works for all primitive integer types, so I put it into a template to
* reduce code duplication. (Don't worry: this is protected and we instantiate
* it only with primitive integer types.) Type X could be signed, but x is
* known to be nonnegative. */
template <class X>
void BigUnsigned::initFromPrimitive(X x) {
if (x == 0)
; // NumberlikeArray already initialized us to zero.
else {
// Create a single block. blk is NULL; no need to delete it.
cap = 1;
blk = new Blk[1];
len = 1;
blk[0] = Blk(x);
}
}
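/* Example: initFromPrimitive(5) yields len == 1, blk[0] == 5, while
 * initFromPrimitive(0) leaves the zero representation with len == 0. */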
/* Ditto, but first check that x is nonnegative. I could have put the check in
* initFromPrimitive and let the compiler optimize it out for unsigned-type
* instantiations, but I wanted to avoid the warning stupidly issued by g++ for
* a condition that is constant in *any* instantiation, even if not in all. */
template <class X>
void BigUnsigned::initFromSignedPrimitive(X x) {
if (x < 0)
throw "BigUnsigned constructor: "
"Cannot construct a BigUnsigned from a negative number";
else
initFromPrimitive(x);
}
// CONVERSION TO PRIMITIVE INTEGERS
/* Template with the same idea as initFromPrimitive. This might be slightly
* slower than the previous version with the masks, but it's much shorter and
* clearer, which is the library's stated goal. */
template <class X>
X BigUnsigned::convertToPrimitive() const {
if (len == 0)
// The number is zero; return zero.
return 0;
else if (len == 1) {
// The single block might fit in an X. Try the conversion.
X x = X(blk[0]);
// Make sure the result accurately represents the block.
if (Blk(x) == blk[0])
// Successful conversion.
return x;
// Otherwise fall through.
}
throw "BigUnsigned::to<Primitive>: "
"Value is too big to fit in the requested type";
}
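/* Example: a two-block BigUnsigned (a value >= 2^N) always throws here,
 * and a one-block value throws when it does not round-trip through X,
 * e.g. converting 100000 to unsigned short on a typical 16-bit short. */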
/* Wrap the above in an x >= 0 test to make sure we got a nonnegative result,
* not a negative one that happened to convert back into the correct nonnegative
* one. (E.g., catch incorrect conversion of 2^31 to the long -2^31.) Again,
* separated to avoid a g++ warning. */
template <class X>
X BigUnsigned::convertToSignedPrimitive() const {
X x = convertToPrimitive<X>();
if (x >= 0)
return x;
else
throw "BigUnsigned::to(Primitive): "
"Value is too big to fit in the requested type";
}
#endif

View File

@ -0,0 +1,131 @@
#include "BigUnsignedInABase.hh"
BigUnsignedInABase::BigUnsignedInABase(const Digit *d, Index l, Base base_)
: NumberlikeArray<Digit>(d, l), base(base_) {
// Check the base
if (base < 2)
throw "BigUnsignedInABase::BigUnsignedInABase(const Digit *, Index, Base): The base must be at least 2";
// Validate the digits.
for (Index i = 0; i < l; i++)
if (blk[i] >= base)
throw "BigUnsignedInABase::BigUnsignedInABase(const Digit *, Index, Base): A digit is too large for the specified base";
// Eliminate any leading zeros we may have been passed.
zapLeadingZeros();
}
namespace {
unsigned int bitLen(unsigned int x) {
unsigned int len = 0;
while (x > 0) {
x >>= 1;
len++;
}
return len;
}
unsigned int ceilingDiv(unsigned int a, unsigned int b) {
return (a + b - 1) / b;
}
}
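/* The bound used below: each digit in base `base' carries at least
 * bitLen(base) - 1 bits, so a number of maxBitLenOfX bits needs at most
 * ceilingDiv(maxBitLenOfX, bitLen(base) - 1) digits. For instance, a 64-bit
 * value in base 10 needs at most ceilingDiv(64, 3) == 22 digits (the true
 * maximum is 20, so the estimate only overshoots slightly). */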
BigUnsignedInABase::BigUnsignedInABase(const BigUnsigned &x, Base base_) {
// Check the base
if (base_ < 2)
throw "BigUnsignedInABase(BigUnsigned, Base): The base must be at least 2";
this->base = base_;
// Get an upper bound on how much space we need
int maxBitLenOfX = x.getLength() * BigUnsigned::N;
int minBitsPerDigit = bitLen(base) - 1;
int maxDigitLenOfX = ceilingDiv(maxBitLenOfX, minBitsPerDigit);
len = maxDigitLenOfX; // Another change to comply with `staying in bounds'.
allocate(len); // Get the space
BigUnsigned x2(x), buBase(base);
Index digitNum = 0;
while (!x2.isZero()) {
// Get last digit. This is like `lastDigit = x2 % buBase, x2 /= buBase'.
BigUnsigned lastDigit(x2);
lastDigit.divideWithRemainder(buBase, x2);
// Save the digit.
blk[digitNum] = lastDigit.toUnsignedShort();
// Move on. We can't run out of room: we figured it out above.
digitNum++;
}
// Save the actual length.
len = digitNum;
}
BigUnsignedInABase::operator BigUnsigned() const {
BigUnsigned ans(0), buBase(base), temp;
Index digitNum = len;
while (digitNum > 0) {
digitNum--;
temp.multiply(ans, buBase);
ans.add(temp, BigUnsigned(blk[digitNum]));
}
return ans;
}
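/* The conversion above is Horner's rule evaluated from the most significant
 * digit down: for digits blk == {1, 2, 3} in base 10 (least significant
 * first), it computes ((3 * 10) + 2) * 10 + 1 == 321. */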
BigUnsignedInABase::BigUnsignedInABase(const char *s, const size_t & length, Base base_)
{
// Check the base.
if (base_ > 36)
throw "BigUnsignedInABase(std::string, Base): The default string conversion routines use the symbol set 0-9, A-Z and therefore support only up to base 36. You tried a conversion with a base over 36; write your own string conversion routine.";
if (length == 0)
throw "BigUnsignedInABase(std::string, Base): The input string is empty.";
// Save the base.
// This pattern is seldom seen in C++, but the analogous ``this.'' is common in Java.
this->base = base_;
// `length' is a `size_t', while `len' is a `NumberlikeArray::Index',
// also known as an `unsigned int'. Some compilers warn without this cast.
len = Index(length);
allocate(len);
Index digitNum;
const char *data = s;
for (digitNum = len; digitNum > 0; digitNum--, data++) {
char theSymbol = *data;
if (theSymbol >= '0' && theSymbol <= '9')
blk[digitNum-1] = theSymbol - '0';
else if (theSymbol >= 'A' && theSymbol <= 'Z')
blk[digitNum-1] = theSymbol - 'A' + 10;
else if (theSymbol >= 'a' && theSymbol <= 'z')
blk[digitNum-1] = theSymbol - 'a' + 10;
else
throw "BigUnsignedInABase(std::string, Base): Bad symbol in input. Only 0-9, A-Z, a-z are accepted.";
if (blk[digitNum-1] >= base)
throw "BigUnsignedInABase::BigUnsignedInABase(const Digit *, Index, Base): A digit is too large for the specified base";
}
zapLeadingZeros();
}
BigUnsignedInABase::BigUnsignedInABase(const std::string &s, Base base_):BigUnsignedInABase(s.c_str(), s.length(), base_) {}
BigUnsignedInABase::operator std::string() const {
if (base > 36)
throw "BigUnsignedInABase ==> std::string: The default string conversion routines use the symbol set 0-9, A-Z and therefore support only up to base 36. You tried a conversion with a base over 36; write your own string conversion routine.";
if (len == 0)
return std::string("0");
// Some compilers don't have push_back, so use a char * buffer instead.
char *s = new char[len + 1];
s[len] = '\0';
Index digitNum, symbolNumInString;
for (symbolNumInString = 0; symbolNumInString < len; symbolNumInString++) {
digitNum = len - 1 - symbolNumInString;
Digit theDigit = blk[digitNum];
if (theDigit < 10)
s[symbolNumInString] = char('0' + theDigit);
else
s[symbolNumInString] = char('A' + theDigit - 10);
}
std::string s2(s);
delete [] s;
return s2;
}

View File

@ -0,0 +1,123 @@
#ifndef BIGUNSIGNEDINABASE_H
#define BIGUNSIGNEDINABASE_H
#include "NumberlikeArray.hh"
#include "BigUnsigned.hh"
#include <string>
/*
* A BigUnsignedInABase object represents a nonnegative integer of size limited
* only by available memory, represented in a user-specified base that can fit
* in an `unsigned short' (most can, and this saves memory).
*
* BigUnsignedInABase is intended as an intermediary class with little
* functionality of its own. BigUnsignedInABase objects can be constructed
* from, and converted to, BigUnsigneds (requiring multiplication, mods, etc.)
* and `std::string's (by switching digit values for appropriate characters).
*
* BigUnsignedInABase is similar to BigUnsigned. Note the following:
*
* (1) They represent the number in exactly the same way, except that
* BigUnsignedInABase uses ``digits'' (or Digit) where BigUnsigned uses
* ``blocks'' (or Blk).
*
* (2) Both use the management features of NumberlikeArray. (In fact, my desire
* to add a BigUnsignedInABase class without duplicating a lot of code led me to
* introduce NumberlikeArray.)
*
* (3) The only arithmetic operation supported by BigUnsignedInABase is an
* equality test. Use BigUnsigned for arithmetic.
*/
class BigUnsignedInABase : protected NumberlikeArray<unsigned short> {
public:
// The digits of a BigUnsignedInABase are unsigned shorts.
typedef unsigned short Digit;
// That's also the type of a base.
typedef Digit Base;
protected:
// The base in which this BigUnsignedInABase is expressed
Base base;
// Creates a BigUnsignedInABase with a capacity; for internal use.
BigUnsignedInABase(int, Index c) : NumberlikeArray<Digit>(nullptr, c) {}
// Decreases len to eliminate any leading zero digits.
void zapLeadingZeros() {
while (len > 0 && blk[len - 1] == 0)
len--;
}
public:
// Constructs zero in base 2.
BigUnsignedInABase() : NumberlikeArray<Digit>(), base(2) {}
// Copy constructor
BigUnsignedInABase(const BigUnsignedInABase &x) : NumberlikeArray<Digit>(x), base(x.base) {}
// Assignment operator
void operator =(const BigUnsignedInABase &x) {
NumberlikeArray<Digit>::operator =(x);
base = x.base;
}
// Constructor that copies from a given array of digits.
BigUnsignedInABase(const Digit *d, Index l, Base base);
// Destructor. NumberlikeArray does the delete for us.
~BigUnsignedInABase() {}
// LINKS TO BIGUNSIGNED
BigUnsignedInABase(const BigUnsigned &x, Base base);
operator BigUnsigned() const;
/* LINKS TO STRINGS
*
* These use the symbols ``0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'' to
* represent digits of 0 through 35. When parsing strings, lowercase is
* also accepted.
*
* All string representations are big-endian (big-place-value digits
* first). (Computer scientists have adopted zero-based counting; why
* can't they tolerate little-endian numbers?)
*
* No string representation has a ``base indicator'' like ``0x''.
*
* An exception is made for zero: it is converted to ``0'' and not the
* empty string.
*
* If you want different conventions, write your own routines to go
* between BigUnsignedInABase and strings. It's not hard.
*/
operator std::string() const;
BigUnsignedInABase(const std::string &s, Base base);
BigUnsignedInABase(const char *s, const size_t & length, Base base);
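/* Example: BigUnsignedInABase(BigUnsigned(255), 16) converts to the
 * std::string "FF", and BigUnsignedInABase("ff", 2, 16) converts back
 * to BigUnsigned(255). */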
public:
// ACCESSORS
Base getBase() const { return base; }
// Expose these from NumberlikeArray directly.
using NumberlikeArray<Digit>::getCapacity;
using NumberlikeArray<Digit>::getLength;
/* Returns the requested digit, or 0 if it is beyond the length (as if
* the number had 0s infinitely to the left). */
Digit getDigit(Index i) const { return i >= len ? 0 : blk[i]; }
// The number is zero if and only if the canonical length is zero.
bool isZero() const { return NumberlikeArray<Digit>::isEmpty(); }
/* Equality test. For the purposes of this test, two BigUnsignedInABase
* values must have the same base to be equal. */
bool operator ==(const BigUnsignedInABase &x) const {
return base == x.base && NumberlikeArray<Digit>::operator ==(x);
}
bool operator !=(const BigUnsignedInABase &x) const { return !operator ==(x); }
};
#endif

View File

@ -0,0 +1,8 @@
add_library(biginteger
BigInteger.cc
BigUnsigned.cc
BigIntegerUtils.cc
BigIntegerAlgorithms.cc
BigUnsignedInABase.cc
)
target_include_directories(biginteger PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

View File

@ -0,0 +1,185 @@
#ifndef NUMBERLIKEARRAY_H
#define NUMBERLIKEARRAY_H
// Make sure we have NULL.
#ifndef NULL
#define NULL 0
#endif
/* A NumberlikeArray<Blk> object holds a heap-allocated array of Blk with a
* length and a capacity and provides basic memory management features.
* BigUnsigned and BigUnsignedInABase both subclass it.
*
* NumberlikeArray provides no information hiding. Subclasses should use
* nonpublic inheritance and manually expose members as desired using
* declarations like this:
*
* public:
 * using NumberlikeArray< the-type-argument >::getLength;
*/
template <class Blk>
class NumberlikeArray {
public:
// Type for the index of a block in the array
typedef unsigned int Index;
// The number of bits in a block, defined below.
static const unsigned int N;
// The current allocated capacity of this NumberlikeArray (in blocks)
Index cap;
// The actual length of the value stored in this NumberlikeArray (in blocks)
Index len;
// Heap-allocated array of the blocks (can be NULL if len == 0)
Blk *blk;
// Constructs a ``zero'' NumberlikeArray with the given capacity.
NumberlikeArray(Index c) : cap(c), len(0) {
blk = (cap > 0) ? (new Blk[cap]) : nullptr;
}
/* Constructs a zero NumberlikeArray without allocating a backing array.
* A subclass that doesn't know the needed capacity at initialization
* time can use this constructor and then overwrite blk without first
* deleting it. */
NumberlikeArray() : cap(0), len(0) {
blk = nullptr;
}
// Destructor. Note that `delete [] nullptr' is a no-op.
~NumberlikeArray() {
delete [] blk;
}
/* Ensures that the array has at least the requested capacity; may
* destroy the contents. */
void allocate(Index c);
/* Ensures that the array has at least the requested capacity; does not
* destroy the contents. */
void allocateAndCopy(Index c);
// Copy constructor
NumberlikeArray(const NumberlikeArray<Blk> &x);
// Assignment operator
void operator=(const NumberlikeArray<Blk> &x);
// Constructor that copies from a given array of blocks
NumberlikeArray(const Blk *b, Index blen);
// ACCESSORS
Index getCapacity() const { return cap; }
Index getLength() const { return len; }
Blk getBlock(Index i) const { return blk[i]; }
bool isEmpty() const { return len == 0; }
void clear()
{
cap = len = 0;
if (blk)
delete [] blk;
blk = nullptr;
}
/* Equality comparison: checks if both objects have the same length and
* equal (==) array elements to that length. Subclasses may wish to
* override. */
bool operator ==(const NumberlikeArray<Blk> &x) const;
bool operator !=(const NumberlikeArray<Blk> &x) const {
return !operator ==(x);
}
};
/* BEGIN TEMPLATE DEFINITIONS. They are present here so that source files that
* include this header file can generate the necessary real definitions. */
template <class Blk>
const unsigned int NumberlikeArray<Blk>::N = 8 * sizeof(Blk);
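// For example, N == 32 when Blk is a 4-byte type such as a typical unsigned long.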
template <class Blk>
void NumberlikeArray<Blk>::allocate(Index c) {
// If the requested capacity is more than the current capacity...
if (c > cap) {
// Delete the old number array
delete [] blk;
// Allocate the new array
cap = c;
blk = new Blk[cap];
}
}
template <class Blk>
void NumberlikeArray<Blk>::allocateAndCopy(Index c) {
// If the requested capacity is more than the current capacity...
if (c > cap) {
Blk *oldBlk = blk;
// Allocate the new number array
cap = c;
blk = new Blk[cap];
// Copy number blocks
Index i;
for (i = 0; i < len; i++)
blk[i] = oldBlk[i];
// Delete the old array
delete [] oldBlk;
}
}
template <class Blk>
NumberlikeArray<Blk>::NumberlikeArray(const NumberlikeArray<Blk> &x)
: len(x.len) {
// Create array
cap = len;
blk = new Blk[cap];
// Copy blocks
Index i;
for (i = 0; i < len; i++)
blk[i] = x.blk[i];
}
template <class Blk>
void NumberlikeArray<Blk>::operator=(const NumberlikeArray<Blk> &x) {
/* Calls like a = a have no effect; catch them before the aliasing
* causes a problem */
if (this == &x)
return;
// Copy length
len = x.len;
// Expand array if necessary
allocate(len);
// Copy number blocks
Index i;
for (i = 0; i < len; i++)
blk[i] = x.blk[i];
}
template <class Blk>
NumberlikeArray<Blk>::NumberlikeArray(const Blk *b, Index blen)
: cap(blen), len(blen) {
// Create array
blk = new Blk[cap];
// Copy blocks
Index i;
for (i = 0; i < len; i++)
blk[i] = b[i];
}
template <class Blk>
bool NumberlikeArray<Blk>::operator ==(const NumberlikeArray<Blk> &x) const {
if (len != x.len)
// Definitely unequal.
return false;
else {
// Compare corresponding blocks one by one.
Index i;
for (i = 0; i < len; i++)
if (blk[i] != x.blk[i])
return false;
// No blocks differed, so the objects are equal.
return true;
}
}
#endif

View File

@ -0,0 +1,3 @@
Modified by wujian based on https://mattmccutchen.net/bigint/
The xor/shiftRight and other bit operations produce the same results as java.math.BigInteger.

View File

@ -100,7 +100,7 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg)
columns[i++]->insert(msg.getSource());
columns[i++]->insert(msg.getText());
logs_queue->emplace(std::move(columns));
[[maybe_unused]] auto push_result = logs_queue->emplace(std::move(columns));
}
/// Also log to system.text_log table, if message is not too noisy

View File

@ -1,180 +0,0 @@
#!/usr/bin/env bash
# script to run query to databases
function usage()
{
cat <<EOF
usage: $0 options
This script runs a benchmark for a database
OPTIONS:
-c config file where some script variables are defined
-n table name
-h Show this message
-t how many times execute each query. default is '3'
-q query file
-e expect file
-s /etc/init.d/service
-p table name pattern to be replaced with the -n name. default is 'hits_10m'
EOF
}
TIMES=3
table_name_pattern=hits_10m
while getopts "c:ht:n:q:e:s:p:r" OPTION
do
case $OPTION in
c)
source $OPTARG
;;
?)
;;
esac
done
OPTIND=1
while getopts "c:ht:n:q:e:s:p:r" OPTION
do
case $OPTION in
h)
usage
exit 0
;;
t)
TIMES=$OPTARG
;;
n)
table_name=$OPTARG
;;
q)
test_file=$OPTARG
;;
e)
expect_file=$OPTARG
;;
s)
etc_init_d_service=$OPTARG
;;
p)
table_name_pattern=$OPTARG
;;
c)
;;
r)
restart_server_each_query=1
;;
?)
usage
exit 0
;;
esac
done
if [[ ! -f $expect_file ]]; then
echo "Not found: expect file"
exit 1
fi
if [[ ! -f $test_file ]]; then
echo "Not found: test file"
exit 1
fi
if [[ ! -f $etc_init_d_service ]]; then
echo "Not found: /etc/init.d/service with path=$etc_init_d_service"
use_service=0
else
use_service=1
fi
if [[ "$table_name_pattern" == "" ]]; then
echo "Empty table_name_pattern"
exit 1
fi
if [[ "$table_name" == "" ]]; then
echo "Empty table_name"
exit 1
fi
function execute()
{
queries=("${@}")
queries_count=${#queries[@]}
if [ -z "$TIMES" ]; then
TIMES=1
fi
index=0
while [ "$index" -lt "$queries_count" ]; do
query=${queries[$index]}
if [[ $query == "" ]]; then
let "index = $index + 1"
continue
fi
comment_re='--.*'
if [[ $query =~ $comment_re ]]; then
echo "$query"
echo
else
sync
sudo sh -c "echo 3 > /proc/sys/vm/drop_caches"
if [[ "$restart_server_each_query" == "1" && "$use_service" == "1" ]]; then
echo "restart server: $etc_init_d_service restart"
sudo $etc_init_d_service restart
fi
for i in $(seq $TIMES)
do
if [[ -f $etc_init_d_service && "$use_service" == "1" ]]; then
sudo $etc_init_d_service status
server_status=$?
expect -f $expect_file ""
if [[ "$?" != "0" || $server_status != "0" ]]; then
echo "restart server: $etc_init_d_service restart"
sudo $etc_init_d_service restart
fi
#wait until can connect to server
restart_timer=0
restart_limit=60
expect -f $expect_file "" &> /dev/null
while [ "$?" != "0" ]; do
echo "waiting"
sleep 1
let "restart_timer = $restart_timer + 1"
if (( $restart_limit < $restart_timer )); then
sudo $etc_init_d_service restart
restart_timer=0
fi
expect -f $expect_file "" &> /dev/null
done
fi
echo
echo "times: $i"
echo "query:" "$query"
expect -f $expect_file "$query"
done
fi
let "index = $index + 1"
done
}
temp_test_file=temp_queries_$table_name
cat $test_file | sed s/$table_name_pattern/$table_name/g > $temp_test_file
mapfile -t test_queries < $temp_test_file
echo "start time: $(date)"
time execute "${test_queries[@]}"
echo "stop time: $(date)"

View File

@ -1,22 +0,0 @@
#!/usr/bin/env bash
QUERIES_FILE="queries.sql"
TABLE=$1
TRIES=3
cat "$QUERIES_FILE" | sed "s|{table}|\"${TABLE}\"|g" | while read query; do
echo -n "["
for i in $(seq 1 $TRIES); do
while true; do
RES=$(command time -f %e -o /dev/stdout curl -sS -G --data-urlencode "query=$query" --data "default_format=Null&max_memory_usage=100000000000&max_memory_usage_for_all_queries=100000000000&max_concurrent_queries_for_user=100&database=*$YT_CLIQUE_ID" --location-trusted -H "Authorization: OAuth $YT_TOKEN" "$YT_PROXY.yt.yandex.net/query" 2>/dev/null);
if [[ $? == 0 ]]; then
[[ $RES =~ (fail|Exception) ]] || break;
fi
done
[[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null"
[[ "$i" != $TRIES ]] && echo -n ", "
done
echo "],"
done

View File

@ -1,18 +0,0 @@
#!/usr/bin/env bash
QUERIES_FILE="queries.sql"
TABLE=$1
TRIES=3
cat "$QUERIES_FILE" | sed "s/{table}/${TABLE}/g" | while read query; do
sync
echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
echo -n "["
for i in $(seq 1 $TRIES); do
RES=$(clickhouse-client --time --format=Null --query="$query" 2>&1)
[[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null"
[[ "$i" != $TRIES ]] && echo -n ", "
done
echo "],"
done

View File

@ -1,19 +0,0 @@
#!/usr/bin/env bash
QUERIES_FILE="queries.sql"
TABLE=$1
TRIES=3
cat "$QUERIES_FILE" | sed "s|{table}|\"${TABLE}\"|g" | while read query; do
echo -n "["
for i in $(seq 1 $TRIES); do
while true; do
RES=$(command time -f %e -o time ./yql --clickhouse --syntax-version 1 -f empty <<< "USE chyt.hume; PRAGMA max_memory_usage = 100000000000; PRAGMA max_memory_usage_for_all_queries = 100000000000; $query" >/dev/null 2>&1 && cat time) && break;
done
[[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null"
[[ "$i" != $TRIES ]] && echo -n ", "
done
echo "],"
done

View File

@ -1,43 +0,0 @@
SELECT count() FROM {table};
SELECT count() FROM {table} WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(), avg(ResolutionWidth) FROM {table} ;
SELECT sum(UserID) FROM {table} ;
SELECT uniq(UserID) FROM {table} ;
SELECT uniq(SearchPhrase) FROM {table} ;
SELECT min(EventDate), max(EventDate) FROM {table} ;
SELECT AdvEngineID, count() FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count() DESC;
SELECT RegionID, uniq(UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, sum(AdvEngineID), count() AS c, avg(ResolutionWidth), uniq(UserID) FROM {table} GROUP BY RegionID ORDER BY c DESC LIMIT 10;
SELECT MobilePhoneModel, uniq(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, uniq(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, count() AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, uniq(UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, count() AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT UserID, count() FROM {table} GROUP BY UserID ORDER BY count() DESC LIMIT 10;
SELECT UserID, SearchPhrase, count() FROM {table} GROUP BY UserID, SearchPhrase ORDER BY count() DESC LIMIT 10;
SELECT UserID, SearchPhrase, count() FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, toMinute(EventTime) AS m, SearchPhrase, count() FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count() DESC LIMIT 10;
SELECT UserID FROM {table} WHERE UserID = 12345678901234567890;
SELECT count() FROM {table} WHERE URL LIKE '%metrika%';
SELECT SearchPhrase, any(URL), count() AS c FROM {table} WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, any(URL), any(Title), count() AS c, uniq(UserID) FROM {table} WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT * FROM {table} WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, avg(length(URL)) AS l, count() AS c FROM {table} WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;
SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count() AS c, any(Referer) FROM {table} WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table};
SELECT SearchEngineID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT URL, count() AS c FROM {table} GROUP BY URL ORDER BY c DESC LIMIT 10;
SELECT 1, URL, count() AS c FROM {table} GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM {table} GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10;
SELECT URL, count() AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND NOT DontCountHits AND NOT Refresh AND notEmpty(URL) GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count() AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND NOT DontCountHits AND NOT Refresh AND notEmpty(Title) GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count() AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, ((SearchEngineID = 0 AND AdvEngineID = 0) ? Referer : '') AS Src, URL AS Dst, count() AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count() AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = halfMD5('http://example.ru/') GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100;
SELECT WindowClientWidth, WindowClientHeight, count() AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND NOT Refresh AND NOT DontCountHits AND URLHash = halfMD5('http://example.ru/') GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT toStartOfMinute(EventTime) AS Minute, count() AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-02' AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

View File

@ -1,3 +0,0 @@
#!/usr/bin/env bash
table=hits_10m; time clickhouse-client --max_bytes_before_external_sort=30000000000 --query="SELECT toInt64(WatchID), JavaEnable, Title, GoodEvent, (EventTime < toDateTime('1971-01-01 00:00:00') ? toDateTime('1971-01-01 00:00:01') : EventTime), (EventDate < toDate('1971-01-01') ? toDate('1971-01-01') : EventDate), CounterID, ClientIP, RegionID, toInt64(UserID), CounterClass, OS, UserAgent, URL, Referer, Refresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, UserAgentMinor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, (ClientEventTime < toDateTime('1971-01-01 00:00:01') ? toDateTime('1971-01-01 00:00:01') : ClientEventTime), SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce, toInt64(FUniqID), OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, (LocalEventTime < toDateTime('1971-01-01 00:00:01') ? toDateTime('1971-01-01 00:00:01') : LocalEventTime), Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, BrowserLanguage, BrowserCountry, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, ParamCurrency, ParamCurrencyID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID, toInt64(RefererHash), toInt64(URLHash), CLID FROM $table ORDER BY rand()" | corrector_utf8 > /opt/dumps/${table}_corrected.tsv

View File

@ -1,43 +0,0 @@
Folder structure
______________
dump_dataset_from_ch.sh - bash script that dumps a dataset from Clickhouse
schema.sql - schema for a Greenplum cluster to load dumped dataset in
load_data_set.sql - the script that loads up a dumped dataset
queries.sql - SQL statements used in the benchmark
benchmark.sh - this piece of bash conducts a benchmark
result_parser.py - script to parse benchmark.sh's output and produce python code to build a graph to compare up to 4 benchmark results.
Requirements
____________
Greenplum uses a separate server as a point of entry, so you need at least 2 servers to run a cluster: a master host and segment hosts. 2 segment hosts with 56 segments (28 per host) were used while conducting the test.
You have to put the segment hostnames in benchmark.sh.
Greenplum quick installation instructions
_________________________________________
Obtain a stable Greenplum version here(4.3.9.1 was used while conducting the benchmark):
https://network.pivotal.io/products/pivotal-gpdb
and install it using this detailed guide:
http://gpdb.docs.pivotal.io/4340/install_guide/install_guide.html
You should change gp_interconnect_type to 'tcp' if cluster members are connected via a 1 Gbit link or slower.
There are some variables that have to be changed prior to the first benchmark run: gp_vmem_protect_limit and max_statement_mem, to allow each segment to use more virtual memory. Here are the commands to change these GUCs; they have to be executed as gpadmin on the master host:
gpconfig -c gp_interconnect_type -v tcp
gpconfig -c gp_vmem_protect_limit -v 3000
gpconfig -c max_statement_mem -v '4000MB'
How to prepare data
-------------------
One can prepare datasets to run the benchmark on using the dump_dataset_from_ch.sh script from this repo. The script has to be run on a ClickHouse host. It takes a long time to get the dumps.
Upload the datasets to the Greenplum master. Then run schema.sql to prepare the schema and load_data_set.sql to load the data. This operation also takes a long time.
How to conduct the benchmark
__________________________
There is a benchmark.sh that takes some arguments. Here is the syntax:
./benchmark.sh sql_statements_file tablename dbname orca_switch
If you don't know about the last one, just use the default value.
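For example (hypothetical table and database names):
./benchmark.sh queries.sql hits_all_100m benchmark on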

View File

@ -1,30 +0,0 @@
#!/usr/bin/env bash
filename=${1-queries.sql}
table=$2
dbname=$3
orca=${4-on}
host1=somehost
host2=somehost
mem='15GB'
cat $filename | sed "s/{table}/$table/g" | while read query ;
do
ssh -n $host1 'echo 3 | tee /proc/sys/vm/drop_caches; sync' > /dev/null
ssh -n $host2 'echo 3 | tee /proc/sys/vm/drop_caches; sync' > /dev/null
sleep 5
echo $query | egrep "SELECT UserID, date_trunc\('minute', EventTime\) AS m|SELECT Referer AS key, avg\(length\(Referer\)\) AS l|SELECT URL, count(1) AS c FROM.*GROUP BY URL|SELECT 1, URL, count\(1\) AS c FROM.*GROUP BY 1" && mem='10GB'
echo $query | egrep 'SELECT DISTINCT|GROUP BY UserID, SearchPhrase LIMIT 10|count\(DISTINCT UserID\) AS u' && mem='5GB'
echo "####################"
echo "$query"
echo "Timestamp_begin:$(date)"
echo "\\timing off \\\\set optimizer=$orca; set effective_cache_size='256MB'; set statement_mem='$mem';\\timing on \\\\ $query;" | psql -p 5432 -h 'localhost' -o /dev/null -U gpadmin ${dbname}
echo "Timestamp_end:$(date)"
echo "Timestamp_begin:$(date)"
echo "\\timing off \\\\set optimizer=$orca; set effective_cache_size='50GB'; set statement_mem='$mem';\\timing on \\\\ $query;" | psql -p 5432 -h 'localhost' -o /dev/null -U gpadmin ${dbname}
echo "Timestamp_end:$(date)"
echo "Timestamp_begin:$(date)"
echo "\\timing off \\\\set optimizer=$orca; set effective_cache_size='50GB'; set statement_mem='$mem';\\timing on \\\\ $query;" | psql -p 5432 -h 'localhost' -o /dev/null -U gpadmin ${dbname}
echo "Timestamp_end:$(date)"
echo "$query"
echo '####################'
done

View File

@ -1,5 +0,0 @@
#!/usr/bin/env bash
for table in hits_10m_single hits_100m_single hits_1000m_single; do
clickhouse-client -q "SELECT (round(WatchID/2), JavaEnable, Title, GoodEvent, EventTime, EventDate, CounterID, ClientIP, RegionID,round(UserID/2), CounterClass, OS, UserAgent, URL, Referer, Refresh, RefererCategoryID, RefererRegionID, URLCategoryID, URLRegionID, ResolutionWidth, ResolutionHeight, ResolutionDepth, FlashMajor, FlashMinor, FlashMinor2, NetMajor, NetMinor, UserAgentMajor, CookieEnable, JavascriptEnable, IsMobile, MobilePhone, MobilePhoneModel, Params, IPNetworkID, TraficSourceID, SearchEngineID, SearchPhrase, AdvEngineID, IsArtifical, WindowClientWidth, WindowClientHeight, ClientTimeZone, ClientEventTime, SilverlightVersion1, SilverlightVersion2, SilverlightVersion3, SilverlightVersion4, PageCharset, CodeVersion, IsLink, IsDownload, IsNotBounce,round(FUniqID/2), OriginalURL, HID, IsOldCounter, IsEvent, IsParameter, DontCountHits, WithHash, HitColor, LocalEventTime, Age, Sex, Income, Interests, Robotness, RemoteIP, WindowName, OpenerName, HistoryLength, SocialNetwork, SocialAction, HTTPError, SendTiming, DNSTiming, ConnectTiming, ResponseStartTiming, ResponseEndTiming, FetchTiming, SocialSourceNetworkID, SocialSourcePage, ParamPrice, ParamOrderID, OpenstatServiceName, OpenstatCampaignID, OpenstatAdID, OpenstatSourceID, UTMSource, UTMMedium, UTMCampaign, UTMContent, UTMTerm, FromTag, HasGCLID,round(RefererHash/2),round(URLHash/2), CLID) FROM $table FORMAT CSV" > $table
done

View File

@ -1,12 +0,0 @@
COPY hits_all_10m FROM '/data/hits_10m_single.dump' CSV SEGMENT REJECT LIMIT 30 PERCENT;
CREATE INDEX pk_counterid_eventdate_userid_10m ON hits_all_10m USING btree (counterid, eventdate, userid);
CREATE INDEX idx_10m_counterid on hits_all_10m using btree (counterid); CREATE INDEX idx_10m_userid on hits_all_10m using btree (userid);
ANALYZE hits_all_10m;
COPY hits_all_100m from '/data/hits_100m_single.dump' CSV SEGMENT REJECT LIMIT 30 PERCENT;
CREATE INDEX pk_counterid_eventdate_userid_100m ON hits_all_100m USING btree (counterid, eventdate, userid);
CREATE INDEX idx_100m_counterid on hits_all_100m using btree (counterid); CREATE INDEX idx_100m_userid on hits_all_100m using btree (userid);
ANALYZE hits_all_100m;
COPY hits_all_1000m from '/data/hits_1000m_single.dump' CSV SEGMENT REJECT LIMIT 30 PERCENT;
CREATE INDEX pk_counterid_eventdate_userid_1000m ON hits_all_1000m USING btree (counterid, eventdate, userid);
CREATE INDEX idx_1000m_counterid on hits_all_1000m using btree (counterid); CREATE INDEX idx_1000m_userid on hits_all_1000m using btree (userid);
ANALYZE hits_all_1000m;

View File

@ -1,43 +0,0 @@
SELECT count(1) FROM {table}
SELECT count(1) FROM {table} WHERE AdvEngineID != 0
SELECT sum(AdvEngineID), count(1), avg(ResolutionWidth) FROM {table}
SELECT sum(UserID) FROM {table}
SELECT count(UserID) FROM ( SELECT DISTINCT UserID FROM {table} ) AS d
SELECT count(SearchPhrase) FROM ( SELECT DISTINCT SearchPhrase FROM {table} ) AS d
SELECT min(EventDate), max(EventDate) FROM {table}
SELECT AdvEngineID, count(1) FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY 2 DESC
SELECT RegionID, count(DISTINCT UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10
SELECT RegionID, sum(AdvEngineID), count(1) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM {table} GROUP BY RegionID ORDER BY c DESC LIMIT 10
SELECT MobilePhoneModel, count(DISTINCT UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10
SELECT MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10
SELECT SearchPhrase, count(1) AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10
SELECT SearchPhrase, count(DISTINCT UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10
SELECT SearchEngineID, SearchPhrase, count(1) AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10
SELECT UserID, count(1) FROM {table} GROUP BY UserID ORDER BY 2 DESC LIMIT 10
SELECT UserID, SearchPhrase, count(1) FROM {table} GROUP BY UserID, SearchPhrase ORDER BY 3 DESC LIMIT 10
SELECT UserID, SearchPhrase, count(1) FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10
SELECT UserID, date_trunc('minute', EventTime) AS m, SearchPhrase, count(1) FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count(1) DESC LIMIT 10
SELECT UserID FROM {table} WHERE UserID = 12345678901234567890
SELECT count(1) FROM {table} WHERE URL LIKE '%metrika%'
SELECT SearchPhrase, max(URL) as URL, count(1) AS c FROM {table} h WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10
SELECT SearchPhrase, max(URL) as URL, min(Title) as Title, count(1) AS c, count(DISTINCT UserID) FROM {table} WHERE Title LIKE '%\xd0\xaf\xd0\xbd\xd0\xb4\xd0\xb5\xd0\xba\xd1\x81%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT count(1) FROM {table}
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10
SELECT CounterID, avg(length(URL)) AS l, count(1) AS c FROM {table} WHERE URL != '' GROUP BY CounterID HAVING count(1) > 100000 ORDER BY l DESC LIMIT 25
SELECT Referer AS key, avg(length(Referer)) AS l, count(1) AS c, Referer FROM {table} WHERE Referer != '' GROUP BY key HAVING count(1) > 100000 ORDER BY l DESC LIMIT 25
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table}
SELECT SearchEngineID, ClientIP, count(1) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10
SELECT WatchID, ClientIP, count(1) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10
SELECT WatchID, ClientIP, count(1) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10
SELECT URL, count(1) AS c FROM {table} GROUP BY URL ORDER BY c DESC LIMIT 10
SELECT 1, URL, count(1) AS c FROM {table} GROUP BY 1, URL ORDER BY c DESC LIMIT 10
SELECT ClientIP AS x, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(1) AS c FROM {table} GROUP BY x, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10
SELECT URL, count(1) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate between '2013-07-01'::timestamp AND '2013-07-31'::timestamp AND DontCountHits =0 AND Refresh = 0 AND URL <>'' GROUP BY URL ORDER BY PageViews DESC LIMIT 10
SELECT Title, count(1) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate BETWEEN '2013-07-01'::timestamp AND '2013-07-31'::timestamp AND DontCountHits=0 AND Refresh=0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10
SELECT URL, count(1) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate between '2013-07-01'::timestamp AND '2013-07-31'::timestamp AND Refresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, case when (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END Src, URL AS Dst, count(1) AS PageViews FROM {table} WHERE CounterID = 62 AND eventDate between '2013-07-01'::timestamp AND '2013-07-31'::timestamp AND Refresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count(1) AS PageViews FROM {table} WHERE CounterID = 62 AND eventDate between '2013-07-01'::timestamp AND '2013-07-31'::timestamp AND Refresh =0 AND TraficSourceID IN (-1, 6) AND RefererHash = 7135345792483900000 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100
SELECT WindowClientWidth, WindowClientHeight, count(1) AS PageViews FROM {table} WHERE CounterID = 62 AND eventDate between '2013-07-01'::timestamp AND '2013-07-31'::timestamp AND Refresh =0 AND DontCountHits =0 AND URLHash = 7135345792483900000 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT date_trunc('minute', EventTime) AS Minute, count(1) AS PageViews FROM {table} WHERE CounterID = 62 AND eventDate between '2013-07-01'::timestamp AND '2013-07-31'::timestamp AND Refresh =0 AND DontCountHits =0 GROUP BY Minute ORDER BY Minute;
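These queries use a {table} placeholder that a driver substitutes before execution. A minimal sketch of that step, assuming the file is saved as queries.sql, the target table is hits_all_100m, and the database is named benchmark (all three names are assumptions):

# Hypothetical driver: substitute the placeholder, then run each query through psql.
TABLE=hits_all_100m
sed "s/{table}/${TABLE}/g" queries.sql | while read -r query; do
    psql -d benchmark -c "$query"
done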

View File

@ -1,122 +0,0 @@
#!/usr/bin/env python3
import sys


def parse_block(block, options):
    """Extract the query (optionally) and its timings from one result block."""
    result = []
    query = block[0].strip()
    # Blocks with extra echo lines interleave the timings at odd offsets.
    if len(block) > 4:
        timing1 = block[1].strip().split()[1]
        timing2 = block[3].strip().split()[1]
        timing3 = block[5].strip().split()[1]
    else:
        timing1 = block[1].strip().split()[1]
        timing2 = block[2].strip().split()[1]
        timing3 = block[3].strip().split()[1]
    if options.show_queries:
        result.append(query)
    if not options.show_first_timings:
        result += [timing1, timing2, timing3]
    else:
        result.append(timing1)
    return result


def read_stats_file(options, fname):
    """Split a result file into per-query blocks and parse each one."""
    result = []
    block = []
    with open(fname) as f:
        for line in f:
            if 'SELECT' in line:
                if len(block) > 1:
                    result.append(parse_block(block, options))
                block = [line]
            elif 'Time:' in line:
                block.append(line)
    # Don't drop the last block at end of file.
    if len(block) > 1:
        result.append(parse_block(block, options))
    return result


def compare_stats_files(options, arguments):
    """Build (x_values, y_values, line_style) triples for pyplot, one per input file."""
    result = []
    file_output = []
    pyplot_colors = ['y', 'b', 'g', 'r']
    for fname in arguments[1:]:
        file_output.append(read_stats_file(options, fname))
    for idx, data_set in enumerate(file_output):
        int_result = []
        for timing in data_set:
            int_result.append(float(timing[0]))  # y values
        result.append([list(range(len(int_result))), int_result,
                       pyplot_colors[idx] + '^'])
    return result


def parse_args():
    from optparse import OptionParser
    parser = OptionParser(usage='usage: %prog [options] [result_file_path]..')
    parser.add_option("-q", "--show-queries", help="Show statements along with timings", action="store_true", dest="show_queries")
    parser.add_option("-f", "--show-first-timings", help="Show only first tries timings", action="store_true", dest="show_first_timings")
    parser.add_option("-c", "--compare-mode", help="Prepare output for pyplot comparing result files.", action="store", dest="compare_mode")
    (options, arguments) = parser.parse_args(sys.argv)
    if len(arguments) < 2:
        parser.print_usage()
        sys.exit(1)
    return (options, arguments)


def gen_pyplot_code(options, arguments):
    """Print a standalone matplotlib script that plots the parsed timings."""
    result = ''
    data_sets = compare_stats_files(options, arguments)
    for idx, data_set in enumerate(data_sets):
        x_values, y_values, line_style = data_set
        result += '\nplt.plot('
        result += '%s, %s, \'%s\'' % (x_values, y_values, line_style)
        result += ', label=\'%s try\')' % idx
    print('import matplotlib.pyplot as plt')
    print(result)
    print('plt.xlabel(\'Try number\')')
    print('plt.ylabel(\'Timing\')')
    print('plt.title(\'Benchmark query timings\')')
    print('plt.legend()')
    print('plt.show()')


def gen_html_json(options, arguments):
    """Print the JSON stub used on the benchmark results page."""
    tuples = read_stats_file(options, arguments[1])
    print('{')
    print('"system": "GreenPlum(x2)",')
    print('"version": "%s",' % '4.3.9.1')
    print('"data_size": 10000000,')
    print('"time": "",')
    print('"comments": "",')
    print('"result":')
    print('[')
    for s in tuples:
        print(s)
    print(']')
    print('}')


def main():
    (options, arguments) = parse_args()
    if len(arguments) > 2:
        gen_pyplot_code(options, arguments)
    else:
        gen_html_json(options, arguments)


if __name__ == '__main__':
    main()
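A usage sketch for the parser above, assuming it is saved as benchmark_parser.py and the result file names are placeholders too: with two or more result files it prints a matplotlib script, with a single file it prints the JSON stub.

# Compare first-try timings across two runs, then render the generated plot.
./benchmark_parser.py -f run1.log run2.log > plot.py
python3 plot.py

# Single file: emit the JSON result stub instead.
./benchmark_parser.py run1.log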

View File

@ -1,3 +0,0 @@
CREATE TABLE hits_all_10m ( WatchID bigint, JavaEnable int, Title text, GoodEvent int, EventTime timestamp, EventDate timestamp, CounterID bigint, ClientIP bigint, RegionID bigint, UserID bigint, CounterClass int, OS int, UserAgent int, URL text, Referer text, Refresh int, RefererCategoryID int, RefererRegionID bigint, URLCategoryID int, URLRegionID bigint, ResolutionWidth int, ResolutionHeight int, ResolutionDepth int, FlashMajor int, FlashMinor int, FlashMinor2 text, NetMajor int, NetMinor int, UserAgentMajor int, CookieEnable int, JavascriptEnable int, IsMobile int, MobilePhone int, MobilePhoneModel text, Params text, IPNetworkID bigint, TraficSourceID int, SearchEngineID int, SearchPhrase text, AdvEngineID int, IsArtifical int, WindowClientWidth int, WindowClientHeight int, ClientTimeZone int, ClientEventTime timestamp, SilverlightVersion1 int, SilverlightVersion2 int, SilverlightVersion3 bigint, SilverlightVersion4 int, PageCharset text, CodeVersion bigint, IsLink int, IsDownload int, IsNotBounce int, FUniqID bigint, OriginalURL text, HID bigint, IsOldCounter int, IsEvent int, IsParameter int, DontCountHits int, WithHash int, HitColor varchar(3), LocalEventTime timestamp, Age int, Sex int, Income int, Interests int, Robotness int, RemoteIP bigint, WindowName int, OpenerName int, HistoryLength int, SocialNetwork text, SocialAction text, HTTPError int, SendTiming bigint, DNSTiming bigint, ConnectTiming bigint, ResponseStartTiming bigint, ResponseEndTiming bigint, FetchTiming bigint, SocialSourceNetworkID int, SocialSourcePage text, ParamPrice int, ParamOrderID text, OpenstatServiceName text, OpenstatCampaignID text, OpenstatAdID text, OpenstatSourceID text, UTMSource text, UTMMedium text, UTMCampaign text, UTMContent text, UTMTerm text, FromTag text, HasGCLID int, RefererHash bigint, URLHash bigint, CLID bigint) WITH (appendonly=true, orientation=column, compresstype=quicklz) DISTRIBUTED BY (userid) ;
CREATE TABLE hits_all_100m ( WatchID bigint, JavaEnable int, Title text, GoodEvent int, EventTime timestamp, EventDate timestamp, CounterID bigint, ClientIP bigint, RegionID bigint, UserID bigint, CounterClass int, OS int, UserAgent int, URL text, Referer text, Refresh int, RefererCategoryID int, RefererRegionID bigint, URLCategoryID int, URLRegionID bigint, ResolutionWidth int, ResolutionHeight int, ResolutionDepth int, FlashMajor int, FlashMinor int, FlashMinor2 text, NetMajor int, NetMinor int, UserAgentMajor int, CookieEnable int, JavascriptEnable int, IsMobile int, MobilePhone int, MobilePhoneModel text, Params text, IPNetworkID bigint, TraficSourceID int, SearchEngineID int, SearchPhrase text, AdvEngineID int, IsArtifical int, WindowClientWidth int, WindowClientHeight int, ClientTimeZone int, ClientEventTime timestamp, SilverlightVersion1 int, SilverlightVersion2 int, SilverlightVersion3 bigint, SilverlightVersion4 int, PageCharset text, CodeVersion bigint, IsLink int, IsDownload int, IsNotBounce int, FUniqID bigint, OriginalURL text, HID bigint, IsOldCounter int, IsEvent int, IsParameter int, DontCountHits int, WithHash int, HitColor varchar(3), LocalEventTime timestamp, Age int, Sex int, Income int, Interests int, Robotness int, RemoteIP bigint, WindowName int, OpenerName int, HistoryLength int, SocialNetwork text, SocialAction text, HTTPError int, SendTiming bigint, DNSTiming bigint, ConnectTiming bigint, ResponseStartTiming bigint, ResponseEndTiming bigint, FetchTiming bigint, SocialSourceNetworkID int, SocialSourcePage text, ParamPrice int, ParamOrderID text, OpenstatServiceName text, OpenstatCampaignID text, OpenstatAdID text, OpenstatSourceID text, UTMSource text, UTMMedium text, UTMCampaign text, UTMContent text, UTMTerm text, FromTag text, HasGCLID int, RefererHash bigint, URLHash bigint, CLID bigint) WITH (appendonly=true, orientation=column, compresstype=quicklz) DISTRIBUTED BY (userid) ;
CREATE TABLE hits_all_1000m ( WatchID bigint, JavaEnable int, Title text, GoodEvent int, EventTime timestamp, EventDate timestamp, CounterID bigint, ClientIP bigint, RegionID bigint, UserID bigint, CounterClass int, OS int, UserAgent int, URL text, Referer text, Refresh int, RefererCategoryID int, RefererRegionID bigint, URLCategoryID int, URLRegionID bigint, ResolutionWidth int, ResolutionHeight int, ResolutionDepth int, FlashMajor int, FlashMinor int, FlashMinor2 text, NetMajor int, NetMinor int, UserAgentMajor int, CookieEnable int, JavascriptEnable int, IsMobile int, MobilePhone int, MobilePhoneModel text, Params text, IPNetworkID bigint, TraficSourceID int, SearchEngineID int, SearchPhrase text, AdvEngineID int, IsArtifical int, WindowClientWidth int, WindowClientHeight int, ClientTimeZone int, ClientEventTime timestamp, SilverlightVersion1 int, SilverlightVersion2 int, SilverlightVersion3 bigint, SilverlightVersion4 int, PageCharset text, CodeVersion bigint, IsLink int, IsDownload int, IsNotBounce int, FUniqID bigint, OriginalURL text, HID bigint, IsOldCounter int, IsEvent int, IsParameter int, DontCountHits int, WithHash int, HitColor varchar(3), LocalEventTime timestamp, Age int, Sex int, Income int, Interests int, Robotness int, RemoteIP bigint, WindowName int, OpenerName int, HistoryLength int, SocialNetwork text, SocialAction text, HTTPError int, SendTiming bigint, DNSTiming bigint, ConnectTiming bigint, ResponseStartTiming bigint, ResponseEndTiming bigint, FetchTiming bigint, SocialSourceNetworkID int, SocialSourcePage text, ParamPrice int, ParamOrderID text, OpenstatServiceName text, OpenstatCampaignID text, OpenstatAdID text, OpenstatSourceID text, UTMSource text, UTMMedium text, UTMCampaign text, UTMContent text, UTMTerm text, FromTag text, HasGCLID int, RefererHash bigint, URLHash bigint, CLID bigint) WITH (appendonly=true, orientation=column,compresstype=quicklz) DISTRIBUTED BY (userid) ;
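A sketch of how these tables might be created and loaded, assuming the DDL is saved as create.sql, the database is named benchmark, and a tab-separated dump hits_10m.tsv exists locally (all assumptions; COPY's default text format already expects tab delimiters):

# Create the tables, then bulk-load the TSV dump client-side via \copy.
psql -d benchmark -f create.sql
psql -d benchmark -c "\copy hits_all_10m FROM 'hits_10m.tsv'"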

View File

@ -1,127 +0,0 @@
#!/bin/bash -e

if [[ -n $1 ]]; then
    SCALE=$1
else
    SCALE=100
fi

TABLE="hits_${SCALE}m_obfuscated"
DATASET="${TABLE}_v1.tar.xz"
QUERIES_FILE="queries.sql"
TRIES=3

AMD64_BIN_URL="https://clickhouse-builds.s3.yandex.net/0/e29c4c3cc47ab2a6c4516486c1b77d57e7d42643/clickhouse_build_check/gcc-10_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"
AARCH64_BIN_URL="https://clickhouse-builds.s3.yandex.net/0/e29c4c3cc47ab2a6c4516486c1b77d57e7d42643/clickhouse_special_build_check/clang-10-aarch64_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"

# Note: on older Ubuntu versions, 'axel' does not support IPv6. If you are using IPv6-only servers on very old Ubuntu, just don't install 'axel'.
FASTER_DOWNLOAD=wget
if command -v axel >/dev/null; then
    FASTER_DOWNLOAD=axel
else
    echo "It's recommended to install 'axel' for faster downloads."
fi

if command -v pixz >/dev/null; then
    TAR_PARAMS='-Ipixz'
else
    echo "It's recommended to install 'pixz' for faster decompression of the dataset."
fi

mkdir -p clickhouse-benchmark-$SCALE
pushd clickhouse-benchmark-$SCALE

# Download a prebuilt binary matching the host architecture.
if [[ ! -f clickhouse ]]; then
    CPU=$(uname -m)
    if [[ ($CPU == x86_64) || ($CPU == amd64) ]]; then
        $FASTER_DOWNLOAD "$AMD64_BIN_URL"
    elif [[ $CPU == aarch64 ]]; then
        $FASTER_DOWNLOAD "$AARCH64_BIN_URL"
    else
        echo "Unsupported CPU type: $CPU"
        exit 1
    fi
fi

chmod a+x clickhouse

if [[ ! -f $QUERIES_FILE ]]; then
    wget "https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/$QUERIES_FILE"
fi

if [[ ! -d data ]]; then
    if [[ ! -f $DATASET ]]; then
        $FASTER_DOWNLOAD "https://clickhouse-datasets.s3.yandex.net/hits/partitions/$DATASET"
    fi
    tar $TAR_PARAMS --strip-components=1 --directory=. -x -v -f $DATASET
fi

uptime

echo "Starting clickhouse-server"

./clickhouse server > server.log 2>&1 &
PID=$!

function finish {
    kill $PID
    wait
}
trap finish EXIT

echo "Waiting for clickhouse-server to start"

for i in {1..30}; do
    sleep 1
    ./clickhouse client --query "SELECT 'The dataset size is: ', count() FROM $TABLE" 2>/dev/null && break || echo '.'
    if [[ $i == 30 ]]; then exit 1; fi
done

echo
echo "Will perform benchmark. Results:"
echo

cat "$QUERIES_FILE" | sed "s/{table}/${TABLE}/g" | while read -r query; do
    sync
    echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null

    echo -n "["
    for i in $(seq 1 $TRIES); do
        # Capture the timing; print "null" if the query itself fails.
        if RES=$(./clickhouse client --max_memory_usage 100000000000 --time --format=Null --query="$query" 2>&1); then
            echo -n "${RES}"
        else
            echo -n "null"
        fi
        [[ "$i" != $TRIES ]] && echo -n ", "
    done
    echo "],"
done

echo
echo "Benchmark complete. System info:"
echo

echo '----Version, build id-----------'
./clickhouse local --query "SELECT format('Version: {}, build id: {}', version(), buildId())"
./clickhouse local --query "SELECT format('The number of threads is: {}', value) FROM system.settings WHERE name = 'max_threads'" --output-format TSVRaw
./clickhouse local --query "SELECT format('Current time: {}', toString(now(), 'UTC'))"
echo '----CPU-------------------------'
grep -i -F 'model name' /proc/cpuinfo | uniq
lscpu
echo '----Block Devices---------------'
lsblk
echo '----Disk Free and Total--------'
df -h .
echo '----Memory Free and Total-------'
free -h
echo '----Physical Memory Amount------'
grep MemTotal /proc/meminfo
echo '----RAID Info-------------------'
cat /proc/mdstat
#echo '----PCI-------------------------'
#lspci
#echo '----All Hardware Info-----------'
#lshw
echo '--------------------------------'
echo
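A usage sketch, assuming the script above is saved as benchmark.sh: the single optional argument selects the dataset scale, so the run below targets hits_1000m_obfuscated; with no argument the scale defaults to 100.

chmod +x benchmark.sh
./benchmark.sh 1000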

View File

@ -1,4 +0,0 @@
CONF_DIR=/home/kartavyy/benchmark/hive
expect_file=$CONF_DIR/expect.tcl
test_file=$CONF_DIR/queries.sql
etc_init_d_service=

View File

@ -1,9 +0,0 @@
create table hits_10m_raw ( WatchID BIGINT, JavaEnable SMALLINT, Title STRING, GoodEvent SMALLINT, EventTime TIMESTAMP, EventDate TIMESTAMP, CounterID BIGINT, ClientIP BIGINT, RegionID BIGINT, UserID BIGINT, CounterClass TINYINT, OS SMALLINT, UserAgent SMALLINT, URL STRING, Referer STRING, Refresh TINYINT, RefererCategoryID INT, RefererRegionID BIGINT, URLCategoryID INT, URLRegionID BIGINT, ResolutionWidth INT, ResolutionHeight INT, ResolutionDepth SMALLINT, FlashMajor SMALLINT, FlashMinor SMALLINT, FlashMinor2 STRING, NetMajor SMALLINT, NetMinor SMALLINT, UserAgentMajor INT, UserAgentMinor STRING, CookieEnable SMALLINT, JavascriptEnable SMALLINT, IsMobile SMALLINT, MobilePhone SMALLINT, MobilePhoneModel STRING, Params STRING, IPNetworkID BIGINT, TraficSourceID SMALLINT, SearchEngineID INT, SearchPhrase STRING, AdvEngineID SMALLINT, IsArtifical SMALLINT, WindowClientWidth INT, WindowClientHeight INT, ClientTimeZone INT, ClientEventTime TIMESTAMP, SilverlightVersion1 SMALLINT, SilverlightVersion2 SMALLINT, SilverlightVersion3 BIGINT, SilverlightVersion4 INT, PageCharset STRING, CodeVersion BIGINT, IsLink SMALLINT, IsDownload SMALLINT, IsNotBounce SMALLINT, FUniqID BIGINT, OriginalURL STRING, HID BIGINT, IsOldCounter SMALLINT, IsEvent SMALLINT, IsParameter SMALLINT, DontCountHits SMALLINT, WithHash SMALLINT, HitColor STRING, LocalEventTime TIMESTAMP, Age SMALLINT, Sex SMALLINT, Income SMALLINT, Interests INT, Robotness SMALLINT, RemoteIP BIGINT, WindowName INT, OpenerName INT, HistoryLength SMALLINT, BrowserLanguage STRING, BrowserCountry STRING, SocialNetwork STRING, SocialAction STRING, HTTPError INT, SendTiming BIGINT, DNSTiming BIGINT, ConnectTiming BIGINT, ResponseStartTiming BIGINT, ResponseEndTiming BIGINT, FetchTiming BIGINT, SocialSourceNetworkID SMALLINT, SocialSourcePage STRING, ParamPrice BIGINT, ParamOrderID STRING, ParamCurrency STRING, ParamCurrencyID INT, OpenstatServiceName STRING, OpenstatCampaignID STRING, OpenstatAdID STRING, OpenstatSourceID STRING, UTMSource STRING, UTMMedium STRING, UTMCampaign STRING, UTMContent STRING, UTMTerm STRING, FromTag STRING, HasGCLID SMALLINT, RefererHash BIGINT, URLHash BIGINT, CLID BIGINT, UserIDHash BIGINT ) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS TEXTFILE;
load data local inpath '/opt/dump/dump_0.3/dump_hits_10m_meshed_utf8.tsv' overwrite into table hits_10m_raw;
create table hits_10m ( WatchID BIGINT, JavaEnable SMALLINT, Title STRING, GoodEvent SMALLINT, EventTime TIMESTAMP, EventDate TIMESTAMP, CounterID BIGINT, ClientIP BIGINT, RegionID BIGINT, UserID BIGINT, CounterClass TINYINT, OS SMALLINT, UserAgent SMALLINT, URL STRING, Referer STRING, Refresh TINYINT, RefererCategoryID INT, RefererRegionID BIGINT, URLCategoryID INT, URLRegionID BIGINT, ResolutionWidth INT, ResolutionHeight INT, ResolutionDepth SMALLINT, FlashMajor SMALLINT, FlashMinor SMALLINT, FlashMinor2 STRING, NetMajor SMALLINT, NetMinor SMALLINT, UserAgentMajor INT, UserAgentMinor STRING, CookieEnable SMALLINT, JavascriptEnable SMALLINT, IsMobile SMALLINT, MobilePhone SMALLINT, MobilePhoneModel STRING, Params STRING, IPNetworkID BIGINT, TraficSourceID SMALLINT, SearchEngineID INT, SearchPhrase STRING, AdvEngineID SMALLINT, IsArtifical SMALLINT, WindowClientWidth INT, WindowClientHeight INT, ClientTimeZone INT, ClientEventTime TIMESTAMP, SilverlightVersion1 SMALLINT, SilverlightVersion2 SMALLINT, SilverlightVersion3 BIGINT, SilverlightVersion4 INT, PageCharset STRING, CodeVersion BIGINT, IsLink SMALLINT, IsDownload SMALLINT, IsNotBounce SMALLINT, FUniqID BIGINT, OriginalURL STRING, HID BIGINT, IsOldCounter SMALLINT, IsEvent SMALLINT, IsParameter SMALLINT, DontCountHits SMALLINT, WithHash SMALLINT, HitColor STRING, LocalEventTime TIMESTAMP, Age SMALLINT, Sex SMALLINT, Income SMALLINT, Interests INT, Robotness SMALLINT, RemoteIP BIGINT, WindowName INT, OpenerName INT, HistoryLength SMALLINT, BrowserLanguage STRING, BrowserCountry STRING, SocialNetwork STRING, SocialAction STRING, HTTPError INT, SendTiming BIGINT, DNSTiming BIGINT, ConnectTiming BIGINT, ResponseStartTiming BIGINT, ResponseEndTiming BIGINT, FetchTiming BIGINT, SocialSourceNetworkID SMALLINT, SocialSourcePage STRING, ParamPrice BIGINT, ParamOrderID STRING, ParamCurrency STRING, ParamCurrencyID INT, OpenstatServiceName STRING, OpenstatCampaignID STRING, OpenstatAdID STRING, OpenstatSourceID STRING, UTMSource STRING, UTMMedium STRING, UTMCampaign STRING, UTMContent STRING, UTMTerm STRING, FromTag STRING, HasGCLID SMALLINT, RefererHash BIGINT, URLHash BIGINT, CLID BIGINT, UserIDHash BIGINT ) CLUSTERED BY (EventDate) SORTED BY(CounterID, EventDate, UserIDHash, EventTime) INTO 10 BUCKETS STORED AS ORC tblproperties("orc.compress"="ZLIB");
insert overwrite table hits_10m select * from hits_10m_raw;
--drop table hits_10m_raw;
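One way to apply the DDL and load above non-interactively, assuming it is saved as create.sql and the dump path referenced in the LOAD DATA statement exists on the local machine:

hive -f create.sql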

View File

@ -1,18 +0,0 @@
#!/bin/expect
# Set timeout
set timeout 600
# Get arguments
set query [lindex $argv 0]
spawn hive
expect "hive>"
send "$query;\r"
expect "hive>"
send "quit;\r"
expect eof
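A usage sketch, assuming the wrapper above is saved as expect.tcl and marked executable; the query is passed as a single argument without a trailing semicolon, since the script appends one itself:

chmod +x expect.tcl
./expect.tcl "SELECT count(*) FROM hits_10m"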

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,624 +0,0 @@
start time: Tue Sep 10 18:46:00 MSK 2013
status
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_15579@mturlrep13_201309101846_67163557.txt
hive> ;
hive> quit;
times: 1
query: SELECT count(*) FROM hits_10m;
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_16038@mturlrep13_201309101846_623079473.txt
hive> SELECT count(*) FROM hits_10m;;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0036
Hadoop job information for Stage-1: number of mappers: 4; number of reducers: 1
2013-09-10 18:46:20,061 Stage-1 map = 0%, reduce = 0%
2013-09-10 18:46:27,089 Stage-1 map = 7%, reduce = 0%
2013-09-10 18:46:33,113 Stage-1 map = 14%, reduce = 0%
2013-09-10 18:46:36,127 Stage-1 map = 22%, reduce = 0%
2013-09-10 18:46:39,143 Stage-1 map = 29%, reduce = 0%, Cumulative CPU 46.41 sec
2013-09-10 18:46:40,149 Stage-1 map = 29%, reduce = 0%, Cumulative CPU 46.41 sec
2013-09-10 18:46:41,156 Stage-1 map = 29%, reduce = 0%, Cumulative CPU 46.41 sec
2013-09-10 18:46:42,162 Stage-1 map = 29%, reduce = 0%, Cumulative CPU 46.41 sec
2013-09-10 18:46:43,168 Stage-1 map = 29%, reduce = 0%, Cumulative CPU 46.41 sec
2013-09-10 18:46:44,174 Stage-1 map = 29%, reduce = 0%, Cumulative CPU 46.41 sec
2013-09-10 18:46:45,179 Stage-1 map = 36%, reduce = 0%, Cumulative CPU 46.41 sec
2013-09-10 18:46:46,185 Stage-1 map = 36%, reduce = 0%, Cumulative CPU 46.41 sec
2013-09-10 18:46:47,191 Stage-1 map = 36%, reduce = 0%, Cumulative CPU 46.41 sec
2013-09-10 18:46:48,197 Stage-1 map = 43%, reduce = 0%, Cumulative CPU 46.41 sec
2013-09-10 18:46:49,205 Stage-1 map = 47%, reduce = 0%, Cumulative CPU 62.51 sec
2013-09-10 18:46:50,211 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 83.95 sec
2013-09-10 18:46:51,217 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 83.95 sec
2013-09-10 18:46:52,222 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 83.95 sec
2013-09-10 18:46:53,227 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 83.95 sec
2013-09-10 18:46:54,233 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 83.95 sec
2013-09-10 18:46:55,238 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 83.95 sec
2013-09-10 18:46:56,244 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 83.95 sec
2013-09-10 18:46:57,250 Stage-1 map = 54%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:46:58,255 Stage-1 map = 57%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:46:59,261 Stage-1 map = 57%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:00,266 Stage-1 map = 57%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:01,272 Stage-1 map = 61%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:02,277 Stage-1 map = 61%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:03,282 Stage-1 map = 65%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:04,287 Stage-1 map = 65%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:05,305 Stage-1 map = 65%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:06,310 Stage-1 map = 69%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:07,316 Stage-1 map = 73%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:08,321 Stage-1 map = 73%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:09,326 Stage-1 map = 76%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:10,331 Stage-1 map = 80%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:11,336 Stage-1 map = 80%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:12,341 Stage-1 map = 84%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:13,346 Stage-1 map = 88%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:14,351 Stage-1 map = 88%, reduce = 17%, Cumulative CPU 83.95 sec
2013-09-10 18:47:15,356 Stage-1 map = 93%, reduce = 17%, Cumulative CPU 118.21 sec
2013-09-10 18:47:16,372 Stage-1 map = 93%, reduce = 17%, Cumulative CPU 118.21 sec
2013-09-10 18:47:17,379 Stage-1 map = 93%, reduce = 17%, Cumulative CPU 118.21 sec
2013-09-10 18:47:18,384 Stage-1 map = 97%, reduce = 17%, Cumulative CPU 118.21 sec
2013-09-10 18:47:19,388 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 162.76 sec
2013-09-10 18:47:20,393 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 162.76 sec
2013-09-10 18:47:21,397 Stage-1 map = 100%, reduce = 25%, Cumulative CPU 162.76 sec
2013-09-10 18:47:22,404 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 165.27 sec
2013-09-10 18:47:23,410 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 165.27 sec
2013-09-10 18:47:24,415 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 165.27 sec
MapReduce Total cumulative CPU time: 2 minutes 45 seconds 270 msec
Ended Job = job_201309101627_0036
MapReduce Jobs Launched:
Job 0: Map: 4 Reduce: 1 Cumulative CPU: 165.27 sec HDFS Read: 1082943442 HDFS Write: 9 SUCCESS
Total MapReduce CPU Time Spent: 2 minutes 45 seconds 270 msec
OK
10000000
Time taken: 74.228 seconds, Fetched: 1 row(s)
hive> quit;
status
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_17475@mturlrep13_201309101847_1783698271.txt
hive> ;
hive> quit;
times: 1
query: SELECT count(*) FROM hits_10m WHERE AdvEngineID != 0;
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_17882@mturlrep13_201309101847_1295809350.txt
hive> SELECT count(*) FROM hits_10m WHERE AdvEngineID != 0;;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0037
Hadoop job information for Stage-1: number of mappers: 4; number of reducers: 1
2013-09-10 18:47:44,058 Stage-1 map = 0%, reduce = 0%
2013-09-10 18:47:49,086 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 12.21 sec
2013-09-10 18:47:50,093 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 12.21 sec
2013-09-10 18:47:51,101 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 12.21 sec
2013-09-10 18:47:52,107 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 12.21 sec
2013-09-10 18:47:53,113 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 12.21 sec
2013-09-10 18:47:54,119 Stage-1 map = 75%, reduce = 0%, Cumulative CPU 18.18 sec
2013-09-10 18:47:55,125 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 23.81 sec
2013-09-10 18:47:56,130 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 23.81 sec
2013-09-10 18:47:57,138 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 25.64 sec
2013-09-10 18:47:58,144 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 25.64 sec
2013-09-10 18:47:59,150 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 25.64 sec
MapReduce Total cumulative CPU time: 25 seconds 640 msec
Ended Job = job_201309101627_0037
MapReduce Jobs Launched:
Job 0: Map: 4 Reduce: 1 Cumulative CPU: 25.64 sec HDFS Read: 907716 HDFS Write: 7 SUCCESS
Total MapReduce CPU Time Spent: 25 seconds 640 msec
OK
171127
Time taken: 25.153 seconds, Fetched: 1 row(s)
hive> quit;
status
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_19147@mturlrep13_201309101848_1891179156.txt
hive> ;
hive> quit;
times: 1
query: SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits_10m;
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_19567@mturlrep13_201309101848_690102300.txt
hive> SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits_10m;;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0038
Hadoop job information for Stage-1: number of mappers: 4; number of reducers: 1
2013-09-10 18:48:18,837 Stage-1 map = 0%, reduce = 0%
2013-09-10 18:48:25,865 Stage-1 map = 39%, reduce = 0%
2013-09-10 18:48:26,875 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 18.45 sec
2013-09-10 18:48:27,882 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 18.45 sec
2013-09-10 18:48:28,889 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 18.45 sec
2013-09-10 18:48:29,895 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 18.45 sec
2013-09-10 18:48:30,901 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 18.45 sec
2013-09-10 18:48:31,907 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 18.45 sec
2013-09-10 18:48:32,914 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 18.45 sec
2013-09-10 18:48:33,920 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 34.59 sec
2013-09-10 18:48:34,925 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 34.59 sec
2013-09-10 18:48:35,930 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 34.59 sec
2013-09-10 18:48:36,935 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 34.59 sec
2013-09-10 18:48:37,940 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 34.59 sec
2013-09-10 18:48:38,945 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 35.24 sec
2013-09-10 18:48:39,952 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 36.63 sec
2013-09-10 18:48:40,958 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 36.63 sec
MapReduce Total cumulative CPU time: 36 seconds 630 msec
Ended Job = job_201309101627_0038
MapReduce Jobs Launched:
Job 0: Map: 4 Reduce: 1 Cumulative CPU: 36.63 sec HDFS Read: 8109219 HDFS Write: 30 SUCCESS
Total MapReduce CPU Time Spent: 36 seconds 630 msec
OK
Time taken: 31.961 seconds, Fetched: 1 row(s)
hive> quit;
status
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_20898@mturlrep13_201309101848_327652001.txt
hive> ;
hive> quit;
times: 1
query: SELECT sum(UserID) FROM hits_10m;
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_21336@mturlrep13_201309101848_1975614127.txt
hive> SELECT sum(UserID) FROM hits_10m;;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0039
Hadoop job information for Stage-1: number of mappers: 4; number of reducers: 1
2013-09-10 18:49:00,561 Stage-1 map = 0%, reduce = 0%
2013-09-10 18:49:07,617 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 15.12 sec
2013-09-10 18:49:08,626 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 15.12 sec
2013-09-10 18:49:09,634 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 15.12 sec
2013-09-10 18:49:10,639 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 15.12 sec
2013-09-10 18:49:11,646 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 15.12 sec
2013-09-10 18:49:12,652 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 15.12 sec
2013-09-10 18:49:13,658 Stage-1 map = 75%, reduce = 0%, Cumulative CPU 21.86 sec
2013-09-10 18:49:14,664 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 30.08 sec
2013-09-10 18:49:15,670 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 30.08 sec
2013-09-10 18:49:16,675 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 30.08 sec
2013-09-10 18:49:17,680 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 30.08 sec
2013-09-10 18:49:18,685 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 30.08 sec
2013-09-10 18:49:19,690 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 30.08 sec
2013-09-10 18:49:20,697 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 32.07 sec
2013-09-10 18:49:21,703 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 32.07 sec
MapReduce Total cumulative CPU time: 32 seconds 70 msec
Ended Job = job_201309101627_0039
MapReduce Jobs Launched:
Job 0: Map: 4 Reduce: 1 Cumulative CPU: 32.07 sec HDFS Read: 57312623 HDFS Write: 21 SUCCESS
Total MapReduce CPU Time Spent: 32 seconds 70 msec
OK
-4662894107982093709
Time taken: 30.94 seconds, Fetched: 1 row(s)
hive> quit;
status
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_22560@mturlrep13_201309101849_2023198520.txt
hive> ;
hive> quit;
times: 1
query: SELECT count(DISTINCT UserID) FROM hits_10m;
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_22993@mturlrep13_201309101849_961728603.txt
hive> SELECT count(DISTINCT UserID) FROM hits_10m;;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0040
Hadoop job information for Stage-1: number of mappers: 4; number of reducers: 1
2013-09-10 18:49:41,232 Stage-1 map = 0%, reduce = 0%
2013-09-10 18:49:48,264 Stage-1 map = 43%, reduce = 0%
2013-09-10 18:49:51,283 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 27.01 sec
2013-09-10 18:49:52,291 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 27.01 sec
2013-09-10 18:49:53,298 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 27.01 sec
2013-09-10 18:49:54,304 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 27.01 sec
2013-09-10 18:49:55,310 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 27.01 sec
2013-09-10 18:49:56,317 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 27.01 sec
2013-09-10 18:49:57,332 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 27.01 sec
2013-09-10 18:49:58,337 Stage-1 map = 96%, reduce = 17%, Cumulative CPU 27.01 sec
2013-09-10 18:49:59,342 Stage-1 map = 96%, reduce = 17%, Cumulative CPU 27.01 sec
2013-09-10 18:50:00,348 Stage-1 map = 96%, reduce = 17%, Cumulative CPU 27.01 sec
2013-09-10 18:50:01,353 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 55.01 sec
2013-09-10 18:50:02,360 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 55.01 sec
2013-09-10 18:50:03,365 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 55.01 sec
2013-09-10 18:50:04,369 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 55.01 sec
2013-09-10 18:50:05,375 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 55.01 sec
2013-09-10 18:50:06,379 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 55.01 sec
2013-09-10 18:50:07,385 Stage-1 map = 100%, reduce = 88%, Cumulative CPU 55.01 sec
2013-09-10 18:50:08,391 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 62.95 sec
2013-09-10 18:50:09,397 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 62.95 sec
2013-09-10 18:50:10,402 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 62.95 sec
MapReduce Total cumulative CPU time: 1 minutes 2 seconds 950 msec
Ended Job = job_201309101627_0040
MapReduce Jobs Launched:
Job 0: Map: 4 Reduce: 1 Cumulative CPU: 62.95 sec HDFS Read: 57312623 HDFS Write: 8 SUCCESS
Total MapReduce CPU Time Spent: 1 minutes 2 seconds 950 msec
OK
2037258
Time taken: 38.84 seconds, Fetched: 1 row(s)
hive> quit;
status
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_24634@mturlrep13_201309101850_840502487.txt
hive> ;
hive> quit;
times: 1
query: SELECT count(DISTINCT SearchPhrase) FROM hits_10m;
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_25401@mturlrep13_201309101850_84750246.txt
hive> SELECT count(DISTINCT SearchPhrase) FROM hits_10m;;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0041
Hadoop job information for Stage-1: number of mappers: 4; number of reducers: 1
2013-09-10 18:50:31,472 Stage-1 map = 0%, reduce = 0%
2013-09-10 18:50:38,501 Stage-1 map = 43%, reduce = 0%
2013-09-10 18:50:40,517 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 21.42 sec
2013-09-10 18:50:41,523 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 21.42 sec
2013-09-10 18:50:42,531 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 21.42 sec
2013-09-10 18:50:43,536 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 21.42 sec
2013-09-10 18:50:44,542 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 21.42 sec
2013-09-10 18:50:45,548 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 21.42 sec
2013-09-10 18:50:46,555 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 21.42 sec
2013-09-10 18:50:47,561 Stage-1 map = 96%, reduce = 17%, Cumulative CPU 21.42 sec
2013-09-10 18:50:48,566 Stage-1 map = 97%, reduce = 17%, Cumulative CPU 31.8 sec
2013-09-10 18:50:49,571 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 42.95 sec
2013-09-10 18:50:50,576 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 42.95 sec
2013-09-10 18:50:51,581 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 42.95 sec
2013-09-10 18:50:52,587 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 42.95 sec
2013-09-10 18:50:53,592 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 42.95 sec
2013-09-10 18:50:54,597 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 42.95 sec
2013-09-10 18:50:55,602 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 42.95 sec
2013-09-10 18:50:56,607 Stage-1 map = 100%, reduce = 92%, Cumulative CPU 42.95 sec
2013-09-10 18:50:57,615 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 50.6 sec
2013-09-10 18:50:58,642 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 50.6 sec
MapReduce Total cumulative CPU time: 50 seconds 600 msec
Ended Job = job_201309101627_0041
MapReduce Jobs Launched:
Job 0: Map: 4 Reduce: 1 Cumulative CPU: 50.6 sec HDFS Read: 27820105 HDFS Write: 8 SUCCESS
Total MapReduce CPU Time Spent: 50 seconds 600 msec
OK
1110413
Time taken: 37.04 seconds, Fetched: 1 row(s)
hive> quit;
status
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_26718@mturlrep13_201309101851_285967686.txt
hive> ;
hive> quit;
times: 1
query: SELECT min(EventDate), max(EventDate) FROM hits_10m;
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_27149@mturlrep13_201309101851_2135309314.txt
hive> SELECT min(EventDate), max(EventDate) FROM hits_10m;;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0042
Hadoop job information for Stage-1: number of mappers: 4; number of reducers: 1
2013-09-10 18:51:19,077 Stage-1 map = 0%, reduce = 0%
2013-09-10 18:51:25,106 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 13.92 sec
2013-09-10 18:51:26,114 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 13.92 sec
2013-09-10 18:51:27,123 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 13.92 sec
2013-09-10 18:51:28,129 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 13.92 sec
2013-09-10 18:51:29,135 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 13.92 sec
2013-09-10 18:51:30,141 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 13.92 sec
2013-09-10 18:51:31,147 Stage-1 map = 75%, reduce = 0%, Cumulative CPU 20.4 sec
2013-09-10 18:51:32,152 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 27.44 sec
2013-09-10 18:51:33,158 Stage-1 map = 100%, reduce = 25%, Cumulative CPU 27.44 sec
2013-09-10 18:51:34,163 Stage-1 map = 100%, reduce = 25%, Cumulative CPU 27.44 sec
2013-09-10 18:51:35,168 Stage-1 map = 100%, reduce = 25%, Cumulative CPU 27.44 sec
2013-09-10 18:51:36,173 Stage-1 map = 100%, reduce = 25%, Cumulative CPU 27.44 sec
2013-09-10 18:51:37,179 Stage-1 map = 100%, reduce = 25%, Cumulative CPU 27.44 sec
2013-09-10 18:51:38,184 Stage-1 map = 100%, reduce = 25%, Cumulative CPU 27.44 sec
2013-09-10 18:51:39,192 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 29.39 sec
2013-09-10 18:51:40,198 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 29.39 sec
MapReduce Total cumulative CPU time: 29 seconds 390 msec
Ended Job = job_201309101627_0042
MapReduce Jobs Launched:
Job 0: Map: 4 Reduce: 1 Cumulative CPU: 29.39 sec HDFS Read: 597016 HDFS Write: 6 SUCCESS
Total MapReduce CPU Time Spent: 29 seconds 390 msec
OK
Time taken: 30.908 seconds, Fetched: 1 row(s)
hive> quit;
status
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_28401@mturlrep13_201309101851_891001725.txt
hive> ;
hive> quit;
times: 1
query: SELECT AdvEngineID, count(*) AS c FROM hits_10m WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY c DESC;
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_28836@mturlrep13_201309101851_1054092389.txt
hive> SELECT AdvEngineID, count(*) AS c FROM hits_10m WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY c DESC;;
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks not specified. Estimated from input data size: 2
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0043
Hadoop job information for Stage-1: number of mappers: 4; number of reducers: 2
2013-09-10 18:51:59,809 Stage-1 map = 0%, reduce = 0%
2013-09-10 18:52:04,838 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 12.48 sec
2013-09-10 18:52:05,847 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 12.48 sec
2013-09-10 18:52:06,855 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 12.48 sec
2013-09-10 18:52:07,861 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 12.48 sec
2013-09-10 18:52:08,868 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 12.48 sec
2013-09-10 18:52:09,875 Stage-1 map = 75%, reduce = 0%, Cumulative CPU 18.07 sec
2013-09-10 18:52:10,881 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 23.92 sec
2013-09-10 18:52:11,887 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 23.92 sec
2013-09-10 18:52:12,894 Stage-1 map = 100%, reduce = 67%, Cumulative CPU 25.68 sec
2013-09-10 18:52:13,901 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 27.53 sec
2013-09-10 18:52:14,908 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 27.53 sec
MapReduce Total cumulative CPU time: 27 seconds 530 msec
Ended Job = job_201309101627_0043
Launching Job 2 out of 2
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0044
Hadoop job information for Stage-2: number of mappers: 1; number of reducers: 1
2013-09-10 18:52:17,388 Stage-2 map = 0%, reduce = 0%
2013-09-10 18:52:19,396 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 0.75 sec
2013-09-10 18:52:20,401 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 0.75 sec
2013-09-10 18:52:21,406 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 0.75 sec
2013-09-10 18:52:22,411 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 0.75 sec
2013-09-10 18:52:23,415 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 0.75 sec
2013-09-10 18:52:24,420 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 0.75 sec
2013-09-10 18:52:25,425 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 0.75 sec
2013-09-10 18:52:26,430 Stage-2 map = 100%, reduce = 33%, Cumulative CPU 0.75 sec
2013-09-10 18:52:27,436 Stage-2 map = 100%, reduce = 100%, Cumulative CPU 2.14 sec
2013-09-10 18:52:28,442 Stage-2 map = 100%, reduce = 100%, Cumulative CPU 2.14 sec
2013-09-10 18:52:29,448 Stage-2 map = 100%, reduce = 100%, Cumulative CPU 2.14 sec
MapReduce Total cumulative CPU time: 2 seconds 140 msec
Ended Job = job_201309101627_0044
MapReduce Jobs Launched:
Job 0: Map: 4 Reduce: 2 Cumulative CPU: 27.53 sec HDFS Read: 907716 HDFS Write: 384 SUCCESS
Job 1: Map: 1 Reduce: 1 Cumulative CPU: 2.14 sec HDFS Read: 1153 HDFS Write: 60 SUCCESS
Total MapReduce CPU Time Spent: 29 seconds 670 msec
OK
Time taken: 39.506 seconds, Fetched: 9 row(s)
hive> quit;
-- heavy filtering. After filtering, almost nothing is left, but we still do an aggregation.;
status
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_30667@mturlrep13_201309101852_966681525.txt
hive> ;
hive> quit;
times: 1
query: SELECT RegionID, count(DISTINCT UserID) AS u FROM hits_10m GROUP BY RegionID ORDER BY u DESC LIMIT 10;
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_31123@mturlrep13_201309101852_1252745596.txt
hive> SELECT RegionID, count(DISTINCT UserID) AS u FROM hits_10m GROUP BY RegionID ORDER BY u DESC LIMIT 10;;
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks not specified. Estimated from input data size: 2
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0045
Hadoop job information for Stage-1: number of mappers: 4; number of reducers: 2
2013-09-10 18:52:49,457 Stage-1 map = 0%, reduce = 0%
2013-09-10 18:52:56,485 Stage-1 map = 43%, reduce = 0%
2013-09-10 18:52:59,503 Stage-1 map = 46%, reduce = 0%, Cumulative CPU 14.56 sec
2013-09-10 18:53:00,511 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 29.73 sec
2013-09-10 18:53:01,519 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 29.73 sec
2013-09-10 18:53:02,526 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 29.73 sec
2013-09-10 18:53:03,533 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 29.73 sec
2013-09-10 18:53:04,539 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 29.73 sec
2013-09-10 18:53:05,545 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 29.73 sec
2013-09-10 18:53:06,550 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 29.73 sec
2013-09-10 18:53:07,557 Stage-1 map = 92%, reduce = 17%, Cumulative CPU 29.73 sec
2013-09-10 18:53:08,563 Stage-1 map = 92%, reduce = 17%, Cumulative CPU 29.73 sec
2013-09-10 18:53:09,569 Stage-1 map = 92%, reduce = 17%, Cumulative CPU 29.73 sec
2013-09-10 18:53:10,575 Stage-1 map = 97%, reduce = 17%, Cumulative CPU 44.01 sec
2013-09-10 18:53:11,598 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 58.47 sec
2013-09-10 18:53:12,604 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 58.47 sec
2013-09-10 18:53:13,609 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 58.47 sec
2013-09-10 18:53:14,615 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 58.47 sec
2013-09-10 18:53:15,620 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 58.47 sec
2013-09-10 18:53:16,627 Stage-1 map = 100%, reduce = 63%, Cumulative CPU 65.64 sec
2013-09-10 18:53:17,634 Stage-1 map = 100%, reduce = 63%, Cumulative CPU 65.64 sec
2013-09-10 18:53:18,640 Stage-1 map = 100%, reduce = 63%, Cumulative CPU 65.64 sec
2013-09-10 18:53:19,646 Stage-1 map = 100%, reduce = 63%, Cumulative CPU 65.64 sec
2013-09-10 18:53:20,653 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 71.27 sec
2013-09-10 18:53:21,659 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 71.27 sec
MapReduce Total cumulative CPU time: 1 minutes 11 seconds 270 msec
Ended Job = job_201309101627_0045
Launching Job 2 out of 2
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0046
Hadoop job information for Stage-2: number of mappers: 1; number of reducers: 1
2013-09-10 18:53:25,187 Stage-2 map = 0%, reduce = 0%
2013-09-10 18:53:27,196 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 1.42 sec
2013-09-10 18:53:28,202 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 1.42 sec
2013-09-10 18:53:29,207 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 1.42 sec
2013-09-10 18:53:30,211 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 1.42 sec
2013-09-10 18:53:31,216 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 1.42 sec
2013-09-10 18:53:32,220 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 1.42 sec
2013-09-10 18:53:33,226 Stage-2 map = 100%, reduce = 0%, Cumulative CPU 1.42 sec
2013-09-10 18:53:34,231 Stage-2 map = 100%, reduce = 33%, Cumulative CPU 1.42 sec
2013-09-10 18:53:35,237 Stage-2 map = 100%, reduce = 100%, Cumulative CPU 3.16 sec
2013-09-10 18:53:36,243 Stage-2 map = 100%, reduce = 100%, Cumulative CPU 3.16 sec
MapReduce Total cumulative CPU time: 3 seconds 160 msec
Ended Job = job_201309101627_0046
MapReduce Jobs Launched:
Job 0: Map: 4 Reduce: 2 Cumulative CPU: 71.27 sec HDFS Read: 67340015 HDFS Write: 100142 SUCCESS
Job 1: Map: 1 Reduce: 1 Cumulative CPU: 3.16 sec HDFS Read: 100911 HDFS Write: 96 SUCCESS
Total MapReduce CPU Time Spent: 1 minutes 14 seconds 430 msec
OK
Time taken: 56.439 seconds, Fetched: 10 row(s)
hive> quit;
-- aggregation, average number of keys.;
status
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_609@mturlrep13_201309101853_355533849.txt
hive> ;
hive> quit;
times: 1
query: SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM hits_10m GROUP BY RegionID ORDER BY c DESC LIMIT 10;
spawn hive
Logging initialized using configuration in file:/opt/hive/conf/hive-log4j.properties
Hive history file=/tmp/kartavyy/hive_job_log_kartavyy_1183@mturlrep13_201309101853_289725544.txt
hive> SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM hits_10m GROUP BY RegionID ORDER BY c DESC LIMIT 10;;
Total MapReduce jobs = 2
Launching Job 1 out of 2
Number of reduce tasks not specified. Estimated from input data size: 2
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0047
Hadoop job information for Stage-1: number of mappers: 4; number of reducers: 2
2013-09-10 18:53:55,838 Stage-1 map = 0%, reduce = 0%
2013-09-10 18:54:02,865 Stage-1 map = 29%, reduce = 0%
2013-09-10 18:54:05,876 Stage-1 map = 43%, reduce = 0%
2013-09-10 18:54:08,894 Stage-1 map = 46%, reduce = 0%, Cumulative CPU 16.8 sec
2013-09-10 18:54:09,901 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 34.85 sec
2013-09-10 18:54:10,909 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 34.85 sec
2013-09-10 18:54:11,915 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 34.85 sec
2013-09-10 18:54:12,921 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 34.85 sec
2013-09-10 18:54:13,927 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 34.85 sec
2013-09-10 18:54:14,932 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 34.85 sec
2013-09-10 18:54:15,938 Stage-1 map = 50%, reduce = 0%, Cumulative CPU 34.85 sec
2013-09-10 18:54:16,943 Stage-1 map = 80%, reduce = 17%, Cumulative CPU 34.85 sec
2013-09-10 18:54:17,949 Stage-1 map = 80%, reduce = 17%, Cumulative CPU 34.85 sec
2013-09-10 18:54:18,954 Stage-1 map = 80%, reduce = 17%, Cumulative CPU 34.85 sec
2013-09-10 18:54:19,959 Stage-1 map = 96%, reduce = 17%, Cumulative CPU 34.85 sec
2013-09-10 18:54:20,964 Stage-1 map = 96%, reduce = 17%, Cumulative CPU 34.85 sec
2013-09-10 18:54:21,970 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 67.35 sec
2013-09-10 18:54:22,975 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 67.35 sec
2013-09-10 18:54:23,980 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 67.35 sec
2013-09-10 18:54:24,986 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 67.35 sec
2013-09-10 18:54:25,991 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 67.35 sec
2013-09-10 18:54:26,997 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 67.35 sec
2013-09-10 18:54:28,002 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 67.35 sec
2013-09-10 18:54:29,008 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 67.35 sec
2013-09-10 18:54:30,014 Stage-1 map = 100%, reduce = 17%, Cumulative CPU 67.35 sec
2013-09-10 18:54:31,021 Stage-1 map = 100%, reduce = 58%, Cumulative CPU 74.39 sec
2013-09-10 18:54:32,027 Stage-1 map = 100%, reduce = 96%, Cumulative CPU 74.39 sec
2013-09-10 18:54:33,033 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 84.05 sec
2013-09-10 18:54:34,038 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 84.05 sec
2013-09-10 18:54:35,044 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 84.05 sec
MapReduce Total cumulative CPU time: 1 minutes 24 seconds 50 msec
Ended Job = job_201309101627_0047
Launching Job 2 out of 2
Number of reduce tasks determined at compile time: 1
In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapred.reduce.tasks=<number>
Kill Command = /usr/libexec/../bin/hadoop job -kill job_201309101627_0048

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,109 +0,0 @@
SELECT count(*) FROM hits_10m;
SELECT count(*) FROM hits_10m WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits_10m;
SELECT sum(UserID) FROM hits_10m;
SELECT count(DISTINCT UserID) FROM hits_10m;
SELECT count(DISTINCT SearchPhrase) FROM hits_10m;
SELECT min(EventDate), max(EventDate) FROM hits_10m;
SELECT AdvEngineID, count(*) AS c FROM hits_10m WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY c DESC;
-- heavy filtering. Almost nothing is left after the filter, but we still do an aggregation.;
SELECT RegionID, count(DISTINCT UserID) AS u FROM hits_10m GROUP BY RegionID ORDER BY u DESC LIMIT 10;
-- aggregation, average number of keys.;
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM hits_10m GROUP BY RegionID ORDER BY c DESC LIMIT 10;
-- aggregation, average number of keys, several aggregate functions.;
SELECT MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
-- heavy filtering on strings, then aggregation by strings.;
SELECT MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
-- heavy filtering on strings, then aggregation by a pair of a number and a string.;
SELECT SearchPhrase, count(*) AS c FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
-- moderate filtering on strings, then aggregation by strings, a large number of keys.;
SELECT SearchPhrase, count(DISTINCT UserID) AS u FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
-- slightly more complex aggregation.;
SELECT SearchEngineID, SearchPhrase, count(*) AS c FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
-- aggregation by a number and a string, a large number of keys.;
SELECT UserID, count(*) AS c FROM hits_10m GROUP BY UserID ORDER BY c DESC LIMIT 10;
-- aggregation over a very large number of keys, may run out of RAM.;
SELECT UserID, SearchPhrase, count(*) AS c FROM hits_10m GROUP BY UserID, SearchPhrase ORDER BY c DESC LIMIT 10;
-- even more complex aggregation.;
SELECT UserID, SearchPhrase, count(*) AS c FROM hits_10m GROUP BY UserID, SearchPhrase LIMIT 10;
-- the same, but without sorting.;
SELECT UserID, minute(EventTime), SearchPhrase, count(*) AS c FROM hits_10m GROUP BY UserID, minute(EventTime), SearchPhrase ORDER BY c DESC LIMIT 10;
-- even more complex aggregation, not worth running on large tables.;
SELECT UserID FROM hits_10m WHERE UserID = 12345678901234567890;
-- heavy filtering on a UInt64 column.;
SELECT count(*) AS c FROM hits_10m WHERE URL LIKE '%metrika%';
-- filtering by substring search within a string.;
SELECT SearchPhrase, MAX(URL), count(*) AS c FROM hits_10m WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
-- fetching large columns, filtering by a string.;
SELECT SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, count(DISTINCT UserID) FROM hits_10m WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
-- slightly larger columns.;
SELECT * FROM hits_10m WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
-- a bad query: fetching all columns.;
SELECT SearchPhrase, EventTime FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
-- a big sort.;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
-- a big sort by strings.;
SELECT SearchPhrase, EventTime FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
-- a big sort by a tuple.;
SELECT CounterID, avg(length(URL)) AS l, count(*) AS c FROM hits_10m WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
-- computing the average URL length for large counters.;
SELECT SUBSTRING(SUBSTRING(Referer, instr(Referer, '//') + 2), 1, if(0 < instr(SUBSTRING(Referer, instr(Referer, '//') + 2), '/') - 1, instr(SUBSTRING(Referer, instr(Referer, '//') + 2), '/' ) - 1, 0)), avg(length(Referer)) AS l, count(*) AS c, MAX(Referer) FROM hits_100m WHERE Referer != '' GROUP BY SUBSTRING(SUBSTRING(Referer, instr(Referer, '//') + 2), 1, if(0 < instr(SUBSTRING(Referer, instr(Referer, '//') + 2), '/') - 1, instr(SUBSTRING(Referer, instr(Referer, '//') + 2), '/' ) - 1, 0)) HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
-- the same, but broken down by domain.;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits_10m;
-- lots of dumb aggregate functions.;
SELECT SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
-- complex aggregation, may run out of RAM on large tables.;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
-- aggregation by two fields that doesn't actually aggregate anything. Won't complete on large tables.;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
-- the same, but also without filtering.;
SELECT URL, count(*) AS c FROM hits_10m GROUP BY URL ORDER BY c DESC LIMIT 10;
-- aggregation by URL.;
SELECT 1, URL, count(*) AS c FROM hits_10m GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
-- aggregation by URL and a number.;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) AS c FROM hits_10m GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= TIMESTAMP('2013-07-01') AND EventDate <= TIMESTAMP('2013-07-31') AND NOT DontCountHits != 0 AND NOT Refresh != 0 AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= TIMESTAMP('2013-07-01') AND EventDate <= TIMESTAMP('2013-07-31') AND NOT DontCountHits != 0 AND NOT Refresh != 0 AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= TIMESTAMP('2013-07-01') AND EventDate <= TIMESTAMP('2013-07-31') AND NOT Refresh != 0 AND IsLink != 0 AND NOT IsDownload != 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, URL, count(*) as c, if(SearchEngineID = 0 AND AdvEngineID = 0 , Referer, '') as src FROM hits_100m WHERE CounterID = 62 AND EventDate >= TIMESTAMP('2013-07-01') AND EventDate <= TIMESTAMP('2013-07-31') AND NOT Refresh != 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, URL, if(SearchEngineID = 0 AND AdvEngineID = 0 , Referer, '') ORDER BY c DESC LIMIT 1000;
SELECT URLHash, EventDate, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= TIMESTAMP('2013-07-01') AND EventDate <= TIMESTAMP('2013-07-31') AND NOT Refresh != 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 6202628419148573758 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000;
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= TIMESTAMP('2013-07-01') AND EventDate <= TIMESTAMP('2013-07-31') AND NOT Refresh != 0 AND NOT DontCountHits != 0 AND URLHash = 6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT unix_timestamp(EventTime) - SECOND(EventTime) AS m, count(*) FROM hits_10m WHERE CounterID = 62 AND EventDate >= TIMESTAMP('2013-07-01') AND EventDate <= TIMESTAMP('2013-07-02') AND NOT Refresh != 0 AND NOT DontCountHits != 0 GROUP BY unix_timestamp(EventTime) - SECOND(EventTime) ORDER BY m;

View File

@ -1,2 +0,0 @@
cd /home/kartavyy/benchmark
./benchmark.sh -c hive/conf.sh -n $1 > hive/log/log_$1

View File

@ -1,4 +0,0 @@
CONF_DIR=/home/kartavyy/benchmark/mysql
expect_file=$CONF_DIR/expect.tcl
test_file=$CONF_DIR/queries.sql
etc_init_d_service=/etc/init.d/mysql

View File

@ -1,7 +0,0 @@
create table hits_10m( WatchID BIGINT, JavaEnable TINYINT UNSIGNED, Title VARCHAR(1024), GoodEvent SMALLINT, EventTime DATETIME, EventDate DATE, CounterID INTEGER UNSIGNED, ClientIP INTEGER UNSIGNED, RegionID INTEGER UNSIGNED, UserID BIGINT, CounterClass TINYINT, OS SMALLINT, UserAgent SMALLINT, URL VARCHAR(6072), Referer VARCHAR(2048), Refresh TINYINT, RefererCategoryID SMALLINT UNSIGNED, RefererRegionID INTEGER UNSIGNED, URLCategoryID SMALLINT UNSIGNED, URLRegionID INTEGER UNSIGNED, ResolutionWidth SMALLINT UNSIGNED, ResolutionHeight SMALLINT UNSIGNED, ResolutionDepth TINYINT UNSIGNED, FlashMajor TINYINT UNSIGNED, FlashMinor TINYINT UNSIGNED, FlashMinor2 VARCHAR(256), NetMajor TINYINT UNSIGNED, NetMinor TINYINT UNSIGNED, UserAgentMajor SMALLINT UNSIGNED, UserAgentMinor CHAR(2), CookieEnable TINYINT UNSIGNED, JavascriptEnable TINYINT UNSIGNED, IsMobile TINYINT UNSIGNED, MobilePhone TINYINT UNSIGNED, MobilePhoneModel VARCHAR(80), Params VARCHAR(2048), IPNetworkID INT UNSIGNED, TraficSourceID SMALLINT, SearchEngineID SMALLINT UNSIGNED, SearchPhrase VARCHAR(1024), AdvEngineID TINYINT UNSIGNED, IsArtifical TINYINT UNSIGNED, WindowClientWidth SMALLINT UNSIGNED, WindowClientHeight SMALLINT UNSIGNED, ClientTimeZone INTEGER, ClientEventTime DATETIME, SilverlightVersion1 TINYINT UNSIGNED, SilverlightVersion2 TINYINT UNSIGNED, SilverlightVersion3 INT UNSIGNED, SilverlightVersion4 SMALLINT UNSIGNED, PageCharset VARCHAR(80), CodeVersion INT UNSIGNED, IsLink TINYINT UNSIGNED, IsDownload TINYINT UNSIGNED, IsNotBounce TINYINT UNSIGNED, FUniqID BIGINT, OriginalURL VARCHAR(6072), HID INT UNSIGNED, IsOldCounter TINYINT UNSIGNED, IsEvent TINYINT UNSIGNED, IsParameter TINYINT UNSIGNED, DontCountHits TINYINT UNSIGNED, WithHash TINYINT UNSIGNED, HitColor CHAR(1), LocalEventTime DATETIME, Age TINYINT UNSIGNED, Sex TINYINT UNSIGNED, Income TINYINT UNSIGNED, Interests SMALLINT UNSIGNED, Robotness TINYINT UNSIGNED, RemoteIP INT UNSIGNED, WindowName INT, OpenerName INT, HistoryLength SMALLINT, BrowserLanguage CHAR(2), BrowserCountry CHAR(2), SocialNetwork VARCHAR(128), SocialAction VARCHAR(128), HTTPError SMALLINT UNSIGNED, SendTiming INT UNSIGNED, DNSTiming INT UNSIGNED, ConnectTiming INTEGER UNSIGNED, ResponseStartTiming INTEGER UNSIGNED, ResponseEndTiming INTEGER UNSIGNED, FetchTiming INTEGER UNSIGNED, SocialSourceNetworkID TINYINT UNSIGNED, SocialSourcePage VARCHAR(128), ParamPrice BIGINT, ParamOrderID VARCHAR(80), ParamCurrency CHAR(3), ParamCurrencyID SMALLINT UNSIGNED, OpenstatServiceName VARCHAR(80), OpenstatCampaignID VARCHAR(80), OpenstatAdID VARCHAR(80), OpenstatSourceID VARCHAR(80), UTMSource VARCHAR(256), UTMMedium VARCHAR(256), UTMCampaign VARCHAR(256), UTMContent VARCHAR(256), UTMTerm VARCHAR(256), FromTag VARCHAR(256), HasGCLID TINYINT UNSIGNED, RefererHash BIGINT, URLHash BIGINT, CLID INTEGER UNSIGNED, UserIDHash BIGINT UNSIGNED) ENGINE=MYISAM;
CREATE INDEX hits_10m_ind on hits_10m (CounterID, EventDate, UserIDHash, EventTime) using BTREE;
load data infile '/opt/dump/dump_0.3/dump_hits_10m_meshed_utf8.tsv' into table hits_10m FIELDS TERMINATED BY '\t' ESCAPED BY '\\' ;
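Since hits_10m is MyISAM and the index is built before the load, index maintenance happens row by row during LOAD DATA. A hedged sketch of an optional variant (not in the original script) that defers it:

```
-- skip per-row maintenance of nonunique indexes during the bulk load (MyISAM)
ALTER TABLE hits_10m DISABLE KEYS;
load data infile '/opt/dump/dump_0.3/dump_hits_10m_meshed_utf8.tsv' into table hits_10m FIELDS TERMINATED BY '\t' ESCAPED BY '\\';
-- rebuild the indexes in a single pass
ALTER TABLE hits_10m ENABLE KEYS;
```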

View File

@ -1,23 +0,0 @@
#!/usr/bin/env expect
# Set timeout
set timeout 600
# Get arguments
set query [lindex $argv 0]
spawn mysql -u root
expect "mysql>"
send "use hits\r"
expect "mysql>"
send "$query\r"
expect "mysql>"
send "quit\r"
expect eof

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,111 +0,0 @@
SELECT SQL_NO_CACHE count(*) FROM hits_10m;
SELECT SQL_NO_CACHE count(*) FROM hits_10m WHERE AdvEngineID != 0;
SELECT SQL_NO_CACHE sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits_10m;
SELECT SQL_NO_CACHE sum(UserID) FROM hits_10m;
SELECT SQL_NO_CACHE count(DISTINCT UserID) FROM hits_10m;
SELECT SQL_NO_CACHE count(DISTINCT SearchPhrase) FROM hits_10m;
SELECT SQL_NO_CACHE min(EventDate), max(EventDate) FROM hits_10m;
SELECT SQL_NO_CACHE AdvEngineID, count(*) FROM hits_10m WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
-- heavy filtering. Almost nothing is left after the filter, but we still do an aggregation.;
SELECT SQL_NO_CACHE RegionID, count(DISTINCT UserID) AS u FROM hits_10m GROUP BY RegionID ORDER BY u DESC LIMIT 10;
-- aggregation, average number of keys.;
SELECT SQL_NO_CACHE RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM hits_10m GROUP BY RegionID ORDER BY count(*) DESC LIMIT 10;
-- aggregation, average number of keys, several aggregate functions.;
SELECT SQL_NO_CACHE MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
-- heavy filtering on strings, then aggregation by strings.;
SELECT SQL_NO_CACHE MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
-- heavy filtering on strings, then aggregation by a pair of a number and a string.;
SELECT SQL_NO_CACHE SearchPhrase, count(*) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- moderate filtering on strings, then aggregation by strings, a large number of keys.;
SELECT SQL_NO_CACHE SearchPhrase, count(DISTINCT UserID) AS u FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
-- slightly more complex aggregation.;
SELECT SQL_NO_CACHE SearchEngineID, SearchPhrase, count(*) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- aggregation by a number and a string, a large number of keys.;
SELECT SQL_NO_CACHE UserID, count(*) FROM hits_10m GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
-- aggregation over a very large number of keys, may run out of RAM.;
SELECT SQL_NO_CACHE UserID, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- even more complex aggregation.;
SELECT SQL_NO_CACHE UserID, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, SearchPhrase LIMIT 10;
-- the same, but without sorting.;
SELECT SQL_NO_CACHE UserID, Minute(EventTime) AS m, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- even more complex aggregation, not worth running on large tables.;
SELECT SQL_NO_CACHE UserID FROM hits_10m WHERE UserID = 12345678901234567890;
-- heavy filtering on a UInt64 column.;
SELECT SQL_NO_CACHE count(*) FROM hits_10m WHERE URL LIKE '%metrika%';
-- filtering by substring search within a string.;
SELECT SQL_NO_CACHE SearchPhrase, MAX(URL), count(*) FROM hits_10m WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- fetching large columns, filtering by a string.;
SELECT SQL_NO_CACHE SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, count(DISTINCT UserID) FROM hits_10m WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- slightly larger columns.;
SELECT SQL_NO_CACHE * FROM hits_10m WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
-- a bad query: fetching all columns.;
SELECT SQL_NO_CACHE SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
-- a big sort.;
SELECT SQL_NO_CACHE SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
-- a big sort by strings.;
SELECT SQL_NO_CACHE SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
-- a big sort by a tuple.;
SELECT SQL_NO_CACHE CounterID, avg(length(URL)) AS l, count(*) FROM hits_10m WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
-- computing the average URL length for large counters.;
SELECT SQL_NO_CACHE SUBSTRING(SUBSTRING(Referer, POSITION('//' IN Referer) + 2), 1, GREATEST(0, POSITION('/' IN SUBSTRING(Referer, POSITION('//' IN Referer) + 2)) - 1)) AS k, avg(length(Referer)) AS l, count(*) AS c, MAX(Referer) FROM hits_10m WHERE Referer != '' GROUP BY k HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
-- the same, but broken down by domain.;
SELECT SQL_NO_CACHE sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits_10m;
-- lots of dumb aggregate functions.;
SELECT SQL_NO_CACHE SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-- complex aggregation, may run out of RAM on large tables.;
SELECT SQL_NO_CACHE WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-- aggregation by two fields that doesn't actually aggregate anything. Won't complete on large tables.;
SELECT SQL_NO_CACHE WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-- the same, but also without filtering.;
SELECT SQL_NO_CACHE URL, count(*) FROM hits_10m GROUP BY URL ORDER BY count(*) DESC LIMIT 10;
-- aggregation by URL.;
SELECT SQL_NO_CACHE 1, URL, count(*) FROM hits_10m GROUP BY 1, URL ORDER BY count(*) DESC LIMIT 10;
-- aggregation by URL and a number.;
SELECT SQL_NO_CACHE ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) FROM hits_10m GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY count(*) DESC LIMIT 10;
SELECT SQL_NO_CACHE URL, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT SQL_NO_CACHE Title, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT SQL_NO_CACHE URL, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT SQL_NO_CACHE TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN SearchEngineID = 0 AND AdvEngineID = 0 THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT SQL_NO_CACHE URLHash, EventDate, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = 6202628419148573758 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000;
SELECT SQL_NO_CACHE WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash = 6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT SQL_NO_CACHE EventTime - INTERVAL SECOND(EventTime) SECOND AS Minute, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

View File

@ -1,5 +0,0 @@
CONF_DIR=/home/kartavyy/benchmark/infobright
expect_file=$CONF_DIR/expect.tcl
test_file=$CONF_DIR/queries.sql
etc_init_d_service=/etc/init.d/mysqld-ib

View File

@ -1,111 +0,0 @@
create table hits_10m
(
WatchID BIGINT,
JavaEnable SMALLINT,
Title VARCHAR(1400),
GoodEvent SMALLINT,
EventTime TIMESTAMP,
EventDate DATE,
CounterID BIGINT,
ClientIP BIGINT,
RegionID BIGINT,
UserID BIGINT,
CounterClass TINYINT,
OS SMALLINT,
UserAgent SMALLINT,
URL VARCHAR(7800),
Referer VARCHAR(3125),
Refresh TINYINT,
RefererCategoryID INT,
RefererRegionID BIGINT,
URLCategoryID INT,
URLRegionID BIGINT,
ResolutionWidth INT,
ResolutionHeight INT,
ResolutionDepth SMALLINT,
FlashMajor SMALLINT,
FlashMinor SMALLINT,
FlashMinor2 VARCHAR(256),
NetMajor SMALLINT,
NetMinor SMALLINT,
UserAgentMajor INT,
UserAgentMinor CHAR(2),
CookieEnable SMALLINT,
JavascriptEnable SMALLINT,
IsMobile SMALLINT,
MobilePhone SMALLINT,
MobilePhoneModel VARCHAR(80),
Params VARCHAR(2925),
IPNetworkID BIGINT,
TraficSourceID SMALLINT,
SearchEngineID INT,
SearchPhrase VARCHAR(2008),
AdvEngineID SMALLINT,
IsArtifical SMALLINT,
WindowClientWidth INT,
WindowClientHeight INT,
ClientTimeZone INTEGER,
ClientEventTime TIMESTAMP,
SilverlightVersion1 SMALLINT,
SilverlightVersion2 SMALLINT,
SilverlightVersion3 BIGINT,
SilverlightVersion4 INT,
PageCharset VARCHAR(80),
CodeVersion BIGINT,
IsLink SMALLINT,
IsDownload SMALLINT,
IsNotBounce SMALLINT,
FUniqID BIGINT,
OriginalURL VARCHAR(8181),
HID BIGINT,
IsOldCounter SMALLINT,
IsEvent SMALLINT,
IsParameter SMALLINT,
DontCountHits SMALLINT,
WithHash SMALLINT,
HitColor CHAR(1),
LocalEventTime TIMESTAMP,
Age SMALLINT,
Sex SMALLINT,
Income SMALLINT,
Interests INT,
Robotness SMALLINT,
RemoteIP BIGINT,
WindowName INT,
OpenerName INT,
HistoryLength SMALLINT,
BrowserLanguage CHAR(2),
BrowserCountry CHAR(2),
SocialNetwork VARCHAR(128),
SocialAction VARCHAR(128),
HTTPError INT,
SendTiming BIGINT,
DNSTiming BIGINT,
ConnectTiming BIGINT,
ResponseStartTiming BIGINT,
ResponseEndTiming BIGINT,
FetchTiming BIGINT,
SocialSourceNetworkID SMALLINT,
SocialSourcePage VARCHAR(256),
ParamPrice BIGINT,
ParamOrderID VARCHAR(80),
ParamCurrency CHAR(3),
ParamCurrencyID INT,
OpenstatServiceName VARCHAR(80),
OpenstatCampaignID VARCHAR(512),
OpenstatAdID VARCHAR(80),
OpenstatSourceID VARCHAR(256),
UTMSource VARCHAR(256),
UTMMedium VARCHAR(256),
UTMCampaign VARCHAR(407),
UTMContent VARCHAR(256),
UTMTerm VARCHAR(437),
FromTag VARCHAR(428),
HasGCLID SMALLINT,
RefererHash BIGINT,
URLHash BIGINT,
CLID BIGINT,
UserIDHash BIGINT
);
LOAD DATA INFILE '/opt/dump/dump_0.3/dump_hits_10m_meshed_utf8.tsv' INTO TABLE hits_10m FIELDS TERMINATED BY '\t' ESCAPED BY '\\' ENCLOSED BY "NULL";

View File

@ -1,18 +0,0 @@
#!/usr/bin/env expect
# Set timeout
set timeout 600
# Get arguments
set query [lindex $argv 0]
spawn mysql-ib -u root -D hits
expect "mysql>"
send "$query\r"
expect "mysql>"
send "quit\r"
expect eof

File diff suppressed because it is too large

View File

@ -1,113 +0,0 @@
-- set GLOBAL max_length_for_sort_data = 8388608;
SELECT count(*) FROM hits_10m;
SELECT count(*) FROM hits_10m WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits_10m;
SELECT sum(UserID) FROM hits_10m;
SELECT count(DISTINCT UserID) FROM hits_10m;
SELECT count(DISTINCT SearchPhrase) FROM hits_10m;
SELECT min(EventDate), max(EventDate) FROM hits_10m;
SELECT AdvEngineID, count(*) FROM hits_10m WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
-- heavy filtering. Almost nothing is left after the filter, but we still do an aggregation.;
SELECT RegionID, count(DISTINCT UserID) AS u FROM hits_10m GROUP BY RegionID ORDER BY u DESC LIMIT 10;
-- aggregation, average number of keys.;
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM hits_10m GROUP BY RegionID ORDER BY count(*) DESC LIMIT 10;
-- aggregation, average number of keys, several aggregate functions.;
SELECT MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
-- heavy filtering on strings, then aggregation by strings.;
SELECT MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
-- heavy filtering on strings, then aggregation by a pair of a number and a string.;
SELECT SearchPhrase, count(*) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- moderate filtering on strings, then aggregation by strings, a large number of keys.;
SELECT SearchPhrase, count(DISTINCT UserID) AS u FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
-- slightly more complex aggregation.;
SELECT SearchEngineID, SearchPhrase, count(*) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- aggregation by a number and a string, a large number of keys.;
SELECT UserID, count(*) FROM hits_10m GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
-- aggregation over a very large number of keys, may run out of RAM.;
SELECT UserID, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- even more complex aggregation.;
SELECT UserID, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, SearchPhrase LIMIT 10;
-- the same, but without sorting.;
SELECT UserID, Minute(EventTime) AS m, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- even more complex aggregation, not worth running on large tables.;
SELECT UserID FROM hits_10m WHERE UserID = 123456789;
-- heavy filtering on a UInt64 column.;
SELECT count(*) FROM hits_10m WHERE URL LIKE '%metrika%';
-- filtering by substring search within a string.;
SELECT SearchPhrase, MAX(URL), count(*) FROM hits_10m WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- fetching large columns, filtering by a string.;
SELECT SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, count(DISTINCT UserID) FROM hits_10m WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
-- slightly larger columns.;
SELECT * FROM hits_10m WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
-- a bad query: fetching all columns.;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
-- a big sort.;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
-- a big sort by strings.;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
-- a big sort by a tuple.;
SELECT CounterID, avg(length(URL)) AS l, count(*) FROM hits_10m WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
-- computing the average URL length for large counters.;
SELECT SUBSTRING(SUBSTRING(Referer, POSITION('//' IN Referer) + 2), 1, GREATEST(0, POSITION('/' IN SUBSTRING(Referer, POSITION('//' IN Referer) + 2)) - 1)) AS k, avg(length(Referer)) AS l, count(*) AS c, MAX(Referer) FROM hits_10m WHERE Referer != '' GROUP BY k HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
-- the same, but broken down by domain.;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits_10m;
-- lots of dumb aggregate functions.;
SELECT SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-- complex aggregation, may run out of RAM on large tables.;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-- aggregation by two fields that doesn't actually aggregate anything. Won't complete on large tables.;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
-- the same, but also without filtering.;
SELECT URL, count(*) FROM hits_10m GROUP BY URL ORDER BY count(*) DESC LIMIT 10;
-- aggregation by URL.;
SELECT 1, URL, count(*) FROM hits_10m GROUP BY 1, URL ORDER BY count(*) DESC LIMIT 10;
-- aggregation by URL and a number.;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) FROM hits_10m GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY count(*) DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN SearchEngineID = 0 AND AdvEngineID = 0 THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = 6202628419148573758 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000;
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash = 6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT EventTime - INTERVAL SECOND(EventTime) SECOND AS Minute, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;
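The `EventTime - INTERVAL SECOND(EventTime) SECOND` idiom in the last query truncates a timestamp to the start of its minute; a quick illustration with a made-up literal:

```
SELECT '2013-07-01 12:34:56' - INTERVAL SECOND('2013-07-01 12:34:56') SECOND;
-- returns 2013-07-01 12:34:00
```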

View File

@ -1,20 +0,0 @@
#!/usr/bin/env bash
QUERIES_FILE="queries.sql"
TABLE=$1
TRIES=3
cat "$QUERIES_FILE" | sed "s/{table}/${TABLE}/g" | while read query; do
sync
echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
echo -n "["
for i in $(seq 1 $TRIES); do
RES=$(mysql -u root -h 127.0.0.1 -P 3306 --database=test -t -vvv -e "$query" 2>&1 | grep ' set ' | grep -oP '\d+\.\d+')
[[ "$?" == "0" ]] && echo -n "$RES" || echo -n "null"
[[ "$i" != $TRIES ]] && echo -n ", "
done
echo "],"
done

View File

@ -1,141 +0,0 @@
Note: the column store in MemSQL was introduced in February 2014.
http://www.memsql.com/download/
http://docs.memsql.com/docs/latest/setup/setup_onprem.html
wget http://download.memsql.com/8d9f4c4d99a547baa40ba097b171bd15/memsql-3.2.x86_64.deb
scp memsql-3.2.x86_64.deb example05e:~
ssh example05e
sudo dpkg -i memsql-3.2.x86_64.deb
sudo mkdir /opt/memsql-data/
sudo cp -r /var/lib/memsql/data/* /opt/memsql-data/
sudo rm -rf /var/lib/memsql/data
sudo ln -s /opt/memsql-data /var/lib/memsql/data
sudo chown -R memsql /opt/memsql-data
sudo chown -R memsql /var/lib/memsql/data
sudo service memsql start
mysql -u root -h 127.0.0.1 -P 3306 --prompt="memsql> "
CREATE DATABASE test;
USE test;
CREATE TABLE hits_10m
(
WatchID BIGINT,
JavaEnable SMALLINT,
Title VARCHAR(1400),
GoodEvent SMALLINT,
EventTime TIMESTAMP,
EventDate DATE,
CounterID BIGINT,
ClientIP BIGINT,
RegionID BIGINT,
UserID BIGINT,
CounterClass TINYINT,
OS SMALLINT,
UserAgent SMALLINT,
URL VARCHAR(7800),
Referer VARCHAR(3125),
Refresh TINYINT,
RefererCategoryID INT,
RefererRegionID BIGINT,
URLCategoryID INT,
URLRegionID BIGINT,
ResolutionWidth INT,
ResolutionHeight INT,
ResolutionDepth SMALLINT,
FlashMajor SMALLINT,
FlashMinor SMALLINT,
FlashMinor2 VARCHAR(256),
NetMajor SMALLINT,
NetMinor SMALLINT,
UserAgentMajor INT,
UserAgentMinor CHAR(2),
CookieEnable SMALLINT,
JavascriptEnable SMALLINT,
IsMobile SMALLINT,
MobilePhone SMALLINT,
MobilePhoneModel VARCHAR(80),
Params VARCHAR(2925),
IPNetworkID BIGINT,
TraficSourceID SMALLINT,
SearchEngineID INT,
SearchPhrase VARCHAR(2008),
AdvEngineID SMALLINT,
IsArtifical SMALLINT,
WindowClientWidth INT,
WindowClientHeight INT,
ClientTimeZone INTEGER,
ClientEventTime TIMESTAMP,
SilverlightVersion1 SMALLINT,
SilverlightVersion2 SMALLINT,
SilverlightVersion3 BIGINT,
SilverlightVersion4 INT,
PageCharset VARCHAR(80),
CodeVersion BIGINT,
IsLink SMALLINT,
IsDownload SMALLINT,
IsNotBounce SMALLINT,
FUniqID BIGINT,
OriginalURL VARCHAR(8181),
HID BIGINT,
IsOldCounter SMALLINT,
IsEvent SMALLINT,
IsParameter SMALLINT,
DontCountHits SMALLINT,
WithHash SMALLINT,
HitColor CHAR(1),
LocalEventTime TIMESTAMP,
Age SMALLINT,
Sex SMALLINT,
Income SMALLINT,
Interests INT,
Robotness SMALLINT,
RemoteIP BIGINT,
WindowName INT,
OpenerName INT,
HistoryLength SMALLINT,
BrowserLanguage CHAR(2),
BrowserCountry CHAR(2),
SocialNetwork VARCHAR(128),
SocialAction VARCHAR(128),
HTTPError INT,
SendTiming BIGINT,
DNSTiming BIGINT,
ConnectTiming BIGINT,
ResponseStartTiming BIGINT,
ResponseEndTiming BIGINT,
FetchTiming BIGINT,
SocialSourceNetworkID SMALLINT,
SocialSourcePage VARCHAR(256),
ParamPrice BIGINT,
ParamOrderID VARCHAR(80),
ParamCurrency CHAR(3),
ParamCurrencyID INT,
OpenstatServiceName VARCHAR(80),
OpenstatCampaignID VARCHAR(512),
OpenstatAdID VARCHAR(80),
OpenstatSourceID VARCHAR(256),
UTMSource VARCHAR(256),
UTMMedium VARCHAR(256),
UTMCampaign VARCHAR(407),
UTMContent VARCHAR(256),
UTMTerm VARCHAR(437),
FromTag VARCHAR(428),
HasGCLID SMALLINT,
RefererHash BIGINT,
URLHash BIGINT,
CLID BIGINT,
INDEX ColumnStoreIndex USING CLUSTERED COLUMNSTORE (CounterID, EventDate, UserID, EventTime)
);
Table creation takes about 15 seconds.
LOAD DATA INFILE '/opt/dumps/hits_10m_corrected.tsv' INTO TABLE hits_10m;
12 min 24.51 sec
13422 rows/sec.
data size: 1 613 773 528 bytes.
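A quick sanity check after the load (not from the original notes; the expected count assumes the full hits_10m dump loaded cleanly):

```
SELECT count(*) FROM hits_10m;
-- should return 10000000 if the dump loaded completely
```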

View File

@ -1,43 +0,0 @@
SELECT count(*) FROM hits_10m;
SELECT count(*) FROM hits_10m WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits_10m;
SELECT sum(UserID) FROM hits_10m;
SELECT count(DISTINCT UserID) FROM hits_10m;
SELECT count(DISTINCT SearchPhrase) FROM hits_10m;
SELECT min(EventDate), max(EventDate) FROM hits_10m;
SELECT AdvEngineID, count(*) FROM hits_10m WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
SELECT RegionID, count(DISTINCT UserID) AS u FROM hits_10m GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), count(DISTINCT UserID) FROM hits_10m GROUP BY RegionID ORDER BY count(*) DESC LIMIT 10;
SELECT MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, count(DISTINCT UserID) AS u FROM hits_10m WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, count(*) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT SearchPhrase, count(DISTINCT UserID) AS u FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, count(*) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, count(*) FROM hits_10m GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, Minute(EventTime) AS m, SearchPhrase, count(*) FROM hits_10m GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID FROM hits_10m WHERE UserID = 123456789;
SELECT count(*) FROM hits_10m WHERE URL LIKE '%metrika%';
SELECT SearchPhrase, MAX(URL), count(*) FROM hits_10m WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, count(DISTINCT UserID) FROM hits_10m WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT * FROM hits_10m WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM hits_10m WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, avg(length(URL)) AS l, count(*) FROM hits_10m WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT SUBSTRING(SUBSTRING(Referer, POSITION('//' IN Referer) + 2), 1, GREATEST(0, POSITION('/' IN SUBSTRING(Referer, POSITION('//' IN Referer) + 2)) - 1)) AS k, avg(length(Referer)) AS l, count(*) AS c, MAX(Referer) FROM hits_10m WHERE Referer != '' GROUP BY k HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits_10m;
SELECT SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT URL, count(*) FROM hits_10m GROUP BY URL ORDER BY count(*) DESC LIMIT 10;
SELECT 1, URL, count(*) FROM hits_10m GROUP BY 1, URL ORDER BY count(*) DESC LIMIT 10;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) FROM hits_10m GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY count(*) DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN SearchEngineID = 0 AND AdvEngineID = 0 THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = 6202628419148573758 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000;
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash = 6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT EventTime - INTERVAL SECOND(EventTime) SECOND AS Minute, count(*) AS PageViews FROM hits_10m WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

File diff suppressed because it is too large

View File

@ -1,10 +0,0 @@
#!/bin/bash
grep -v -P '^#' queries.sql | sed -e 's/{table}/hits/' | while read query; do
echo 3 | sudo tee /proc/sys/vm/drop_caches
echo "$query";
for i in {1..3}; do
./send-query "$query" 2>&1 | grep -P '\d+ tuple|clk: |unknown|overflow|error';
done;
done;

View File

@ -1,356 +0,0 @@
Go to https://www.monetdb.org/
Download now.
Latest binary releases.
Ubuntu & Debian.
https://www.monetdb.org/downloads/deb/
Go to the server where you want to install MonetDB.
```
$ sudo mcedit /etc/apt/sources.list.d/monetdb.list
```
Write:
```
deb https://dev.monetdb.org/downloads/deb/ bionic monetdb
```
```
$ wget --output-document=- https://www.monetdb.org/downloads/MonetDB-GPG-KEY | sudo apt-key add -
$ sudo apt update
$ sudo apt install monetdb5-sql monetdb-client
$ sudo systemctl enable monetdbd
$ sudo systemctl start monetdbd
$ sudo usermod -a -G monetdb $USER
```
Log out and log back in to your server.
Tutorial:
https://www.monetdb.org/Documentation/UserGuide/Tutorial
Creating the database:
```
$ sudo mkdir /opt/monetdb
$ sudo chmod 777 /opt/monetdb
$ monetdbd create /opt/monetdb
$ monetdbd start /opt/monetdb
cannot remove socket files
```
Now you have to stop MonetDB, copy the contents of `/var/monetdb5` to `/opt/monetdb`, and replace `/var/monetdb5` with a symlink to `/opt/monetdb`. This is necessary because I don't have free space in `/var`, and creating the database directly in `/opt` did not succeed.
Start MonetDB again.
```
$ sudo systemctl start monetdbd
```
```
$ monetdb create test
created database in maintenance mode: test
$ monetdb release test
taken database out of maintenance mode: test
```
Run client:
```
$ mclient -u monetdb -d test
```
Type password: monetdb
```
CREATE TABLE hits
(
"WatchID" BIGINT,
"JavaEnable" TINYINT,
"Title" TEXT,
"GoodEvent" SMALLINT,
"EventTime" TIMESTAMP,
"EventDate" Date,
"CounterID" INTEGER,
"ClientIP" INTEGER,
"RegionID" INTEGER,
"UserID" BIGINT,
"CounterClass" TINYINT,
"OS" TINYINT,
"UserAgent" TINYINT,
"URL" TEXT,
"Referer" TEXT,
"Refresh" TINYINT,
"RefererCategoryID" SMALLINT,
"RefererRegionID" INTEGER,
"URLCategoryID" SMALLINT,
"URLRegionID" INTEGER,
"ResolutionWidth" SMALLINT,
"ResolutionHeight" SMALLINT,
"ResolutionDepth" TINYINT,
"FlashMajor" TINYINT,
"FlashMinor" TINYINT,
"FlashMinor2" TEXT,
"NetMajor" TINYINT,
"NetMinor" TINYINT,
"UserAgentMajor" SMALLINT,
"UserAgentMinor" TEXT,
"CookieEnable" TINYINT,
"JavascriptEnable" TINYINT,
"IsMobile" TINYINT,
"MobilePhone" TINYINT,
"MobilePhoneModel" TEXT,
"Params" TEXT,
"IPNetworkID" INTEGER,
"TraficSourceID" TINYINT,
"SearchEngineID" SMALLINT,
"SearchPhrase" TEXT,
"AdvEngineID" TINYINT,
"IsArtifical" TINYINT,
"WindowClientWidth" SMALLINT,
"WindowClientHeight" SMALLINT,
"ClientTimeZone" SMALLINT,
"ClientEventTime" TIMESTAMP,
"SilverlightVersion1" TINYINT,
"SilverlightVersion2" TINYINT,
"SilverlightVersion3" INTEGER,
"SilverlightVersion4" SMALLINT,
"PageCharset" TEXT,
"CodeVersion" INTEGER,
"IsLink" TINYINT,
"IsDownload" TINYINT,
"IsNotBounce" TINYINT,
"FUniqID" BIGINT,
"OriginalURL" TEXT,
"HID" INTEGER,
"IsOldCounter" TINYINT,
"IsEvent" TINYINT,
"IsParameter" TINYINT,
"DontCountHits" TINYINT,
"WithHash" TINYINT,
"HitColor" TEXT,
"LocalEventTime" TIMESTAMP,
"Age" TINYINT,
"Sex" TINYINT,
"Income" TINYINT,
"Interests" SMALLINT,
"Robotness" TINYINT,
"RemoteIP" INTEGER,
"WindowName" INTEGER,
"OpenerName" INTEGER,
"HistoryLength" SMALLINT,
"BrowserLanguage" TEXT,
"BrowserCountry" TEXT,
"SocialNetwork" TEXT,
"SocialAction" TEXT,
"HTTPError" SMALLINT,
"SendTiming" INTEGER,
"DNSTiming" INTEGER,
"ConnectTiming" INTEGER,
"ResponseStartTiming" INTEGER,
"ResponseEndTiming" INTEGER,
"FetchTiming" INTEGER,
"SocialSourceNetworkID" TINYINT,
"SocialSourcePage" TEXT,
"ParamPrice" BIGINT,
"ParamOrderID" TEXT,
"ParamCurrency" TEXT,
"ParamCurrencyID" SMALLINT,
"OpenstatServiceName" TEXT,
"OpenstatCampaignID" TEXT,
"OpenstatAdID" TEXT,
"OpenstatSourceID" TEXT,
"UTMSource" TEXT,
"UTMMedium" TEXT,
"UTMCampaign" TEXT,
"UTMContent" TEXT,
"UTMTerm" TEXT,
"FromTag" TEXT,
"HasGCLID" TINYINT,
"RefererHash" BIGINT,
"URLHash" BIGINT,
"CLID" INTEGER
);
```
# How to prepare data
Download the 100-million-row dataset from here and insert it into ClickHouse:
https://clickhouse.tech/docs/en/getting-started/example-datasets/metrica/
Create the dataset from ClickHouse (the minimum value of each signed integer type is bumped to minimum + 1 because MonetDB reserves the type minimum as NULL):
```
SELECT
toInt64(WatchID) = -9223372036854775808 ? -9223372036854775807 : toInt64(WatchID),
toInt8(JavaEnable) = -128 ? -127 : toInt8(JavaEnable),
toValidUTF8(toString(Title)),
toInt16(GoodEvent) = -32768 ? -32767 : toInt16(GoodEvent),
EventTime,
EventDate,
toInt32(CounterID) = -2147483648 ? -2147483647 : toInt32(CounterID),
toInt32(ClientIP) = -2147483648 ? -2147483647 : toInt32(ClientIP),
toInt32(RegionID) = -2147483648 ? -2147483647 : toInt32(RegionID),
toInt64(UserID) = -9223372036854775808 ? -9223372036854775807 : toInt64(UserID),
toInt8(CounterClass) = -128 ? -127 : toInt8(CounterClass),
toInt8(OS) = -128 ? -127 : toInt8(OS),
toInt8(UserAgent) = -128 ? -127 : toInt8(UserAgent),
toValidUTF8(toString(URL)),
toValidUTF8(toString(Referer)),
toInt8(Refresh) = -128 ? -127 : toInt8(Refresh),
toInt16(RefererCategoryID) = -32768 ? -32767 : toInt16(RefererCategoryID),
toInt32(RefererRegionID) = -2147483648 ? -2147483647 : toInt32(RefererRegionID),
toInt16(URLCategoryID) = -32768 ? -32767 : toInt16(URLCategoryID),
toInt32(URLRegionID) = -2147483648 ? -2147483647 : toInt32(URLRegionID),
toInt16(ResolutionWidth) = -32768 ? -32767 : toInt16(ResolutionWidth),
toInt16(ResolutionHeight) = -32768 ? -32767 : toInt16(ResolutionHeight),
toInt8(ResolutionDepth) = -128 ? -127 : toInt8(ResolutionDepth),
toInt8(FlashMajor) = -128 ? -127 : toInt8(FlashMajor),
toInt8(FlashMinor) = -128 ? -127 : toInt8(FlashMinor),
toValidUTF8(toString(FlashMinor2)),
toInt8(NetMajor) = -128 ? -127 : toInt8(NetMajor),
toInt8(NetMinor) = -128 ? -127 : toInt8(NetMinor),
toInt16(UserAgentMajor) = -32768 ? -32767 : toInt16(UserAgentMajor),
toValidUTF8(toString(UserAgentMinor)),
toInt8(CookieEnable) = -128 ? -127 : toInt8(CookieEnable),
toInt8(JavascriptEnable) = -128 ? -127 : toInt8(JavascriptEnable),
toInt8(IsMobile) = -128 ? -127 : toInt8(IsMobile),
toInt8(MobilePhone) = -128 ? -127 : toInt8(MobilePhone),
toValidUTF8(toString(MobilePhoneModel)),
toValidUTF8(toString(Params)),
toInt32(IPNetworkID) = -2147483648 ? -2147483647 : toInt32(IPNetworkID),
toInt8(TraficSourceID) = -128 ? -127 : toInt8(TraficSourceID),
toInt16(SearchEngineID) = -32768 ? -32767 : toInt16(SearchEngineID),
toValidUTF8(toString(SearchPhrase)),
toInt8(AdvEngineID) = -128 ? -127 : toInt8(AdvEngineID),
toInt8(IsArtifical) = -128 ? -127 : toInt8(IsArtifical),
toInt16(WindowClientWidth) = -32768 ? -32767 : toInt16(WindowClientWidth),
toInt16(WindowClientHeight) = -32768 ? -32767 : toInt16(WindowClientHeight),
toInt16(ClientTimeZone) = -32768 ? -32767 : toInt16(ClientTimeZone),
ClientEventTime,
toInt8(SilverlightVersion1) = -128 ? -127 : toInt8(SilverlightVersion1),
toInt8(SilverlightVersion2) = -128 ? -127 : toInt8(SilverlightVersion2),
toInt32(SilverlightVersion3) = -2147483648 ? -2147483647 : toInt32(SilverlightVersion3),
toInt16(SilverlightVersion4) = -32768 ? -32767 : toInt16(SilverlightVersion4),
toValidUTF8(toString(PageCharset)),
toInt32(CodeVersion) = -2147483648 ? -2147483647 : toInt32(CodeVersion),
toInt8(IsLink) = -128 ? -127 : toInt8(IsLink),
toInt8(IsDownload) = -128 ? -127 : toInt8(IsDownload),
toInt8(IsNotBounce) = -128 ? -127 : toInt8(IsNotBounce),
toInt64(FUniqID) = -9223372036854775808 ? -9223372036854775807 : toInt64(FUniqID),
toValidUTF8(toString(OriginalURL)),
toInt32(HID) = -2147483648 ? -2147483647 : toInt32(HID),
toInt8(IsOldCounter) = -128 ? -127 : toInt8(IsOldCounter),
toInt8(IsEvent) = -128 ? -127 : toInt8(IsEvent),
toInt8(IsParameter) = -128 ? -127 : toInt8(IsParameter),
toInt8(DontCountHits) = -128 ? -127 : toInt8(DontCountHits),
toInt8(WithHash) = -128 ? -127 : toInt8(WithHash),
toValidUTF8(toString(HitColor)),
LocalEventTime,
toInt8(Age) = -128 ? -127 : toInt8(Age),
toInt8(Sex) = -128 ? -127 : toInt8(Sex),
toInt8(Income) = -128 ? -127 : toInt8(Income),
toInt16(Interests) = -32768 ? -32767 : toInt16(Interests),
toInt8(Robotness) = -128 ? -127 : toInt8(Robotness),
toInt32(RemoteIP) = -2147483648 ? -2147483647 : toInt32(RemoteIP),
toInt32(WindowName) = -2147483648 ? -2147483647 : toInt32(WindowName),
toInt32(OpenerName) = -2147483648 ? -2147483647 : toInt32(OpenerName),
toInt16(HistoryLength) = -32768 ? -32767 : toInt16(HistoryLength),
toValidUTF8(toString(BrowserLanguage)),
toValidUTF8(toString(BrowserCountry)),
toValidUTF8(toString(SocialNetwork)),
toValidUTF8(toString(SocialAction)),
toInt16(HTTPError) = -32768 ? -32767 : toInt16(HTTPError),
toInt32(SendTiming) = -2147483648 ? -2147483647 : toInt32(SendTiming),
toInt32(DNSTiming) = -2147483648 ? -2147483647 : toInt32(DNSTiming),
toInt32(ConnectTiming) = -2147483648 ? -2147483647 : toInt32(ConnectTiming),
toInt32(ResponseStartTiming) = -2147483648 ? -2147483647 : toInt32(ResponseStartTiming),
toInt32(ResponseEndTiming) = -2147483648 ? -2147483647 : toInt32(ResponseEndTiming),
toInt32(FetchTiming) = -2147483648 ? -2147483647 : toInt32(FetchTiming),
toInt8(SocialSourceNetworkID) = -128 ? -127 : toInt8(SocialSourceNetworkID),
toValidUTF8(toString(SocialSourcePage)),
toInt64(ParamPrice) = -9223372036854775808 ? -9223372036854775807 : toInt64(ParamPrice),
toValidUTF8(toString(ParamOrderID)),
toValidUTF8(toString(ParamCurrency)),
toInt16(ParamCurrencyID) = -32768 ? -32767 : toInt16(ParamCurrencyID),
toValidUTF8(toString(OpenstatServiceName)),
toValidUTF8(toString(OpenstatCampaignID)),
toValidUTF8(toString(OpenstatAdID)),
toValidUTF8(toString(OpenstatSourceID)),
toValidUTF8(toString(UTMSource)),
toValidUTF8(toString(UTMMedium)),
toValidUTF8(toString(UTMCampaign)),
toValidUTF8(toString(UTMContent)),
toValidUTF8(toString(UTMTerm)),
toValidUTF8(toString(FromTag)),
toInt8(HasGCLID) = -128 ? -127 : toInt8(HasGCLID),
toInt64(RefererHash) = -9223372036854775808 ? -9223372036854775807 : toInt64(RefererHash),
toInt64(URLHash) = -9223372036854775808 ? -9223372036854775807 : toInt64(URLHash),
toInt32(CLID) = -2147483648 ? -2147483647 : toInt32(CLID)
FROM hits_100m_obfuscated
INTO OUTFILE '/home/milovidov/example_datasets/hits_100m_obfuscated_monetdb.tsv'
FORMAT TSV;
```
Note that MonetDB does not support the most negative value of each integer type (such as -128 for TINYINT), so we have to shift such values up by one.
This makes it impossible to store the full range of 64-bit identifiers in BIGINT.
Maybe it is a trick to optimize NULL storage?
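As a quick illustration of this shift, the ternary used in the query above maps the minimum value to minimum-plus-one (runnable in clickhouse-client):
```
$ clickhouse-client --query "SELECT toInt8(-128) = -128 ? -127 : toInt8(-128)"
-127
```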
Upload the data:
```
$ mclient -u monetdb -d test
```
When prompted, type the password: monetdb
```
COPY INTO hits FROM '/home/milovidov/example_datasets/hits_100m_obfuscated_monetdb.tsv' USING DELIMITERS '\t';
```
It takes 28 minutes 02 seconds on a server (Linux Ubuntu, Xeon E5-2560v2, 32 logical CPU, 128 GiB RAM, 8xHDD RAID-5, 40 TB).
That is roughly 60 000 rows per second.
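If the row count is known in advance, MonetDB's `COPY n RECORDS` form lets the loader preallocate; a variant like the following might load faster (a sketch based on MonetDB's documented COPY syntax, not measured here):
```
COPY 100000000 RECORDS INTO hits FROM '/home/milovidov/example_datasets/hits_100m_obfuscated_monetdb.tsv' USING DELIMITERS '\t';
```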
Validate the data:
```
SELECT count(*) FROM hits;
```
Create an index:
```
CREATE INDEX hits_idx ON hits ("CounterID", "EventDate");
```
(it takes 5 seconds)
Run the benchmark:
```
./benchmark.sh | tee log.txt
```
You can find the log in the `log.txt` file.
Postprocess the data:
```
grep clk log.txt | tr -d '\r' | awk '{ if ($3 == "ms") { print $2 / 1000; } else if ($3 == "sec") { print $2 } else { print } }'
```
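For illustration, here is what this pipeline produces for a couple of sample `clk` lines (the sample input is made up):
```
$ printf 'clk: 1.262 ms\nclk: 1.530 sec\n' | grep clk | tr -d '\r' | awk '{ if ($3 == "ms") { print $2 / 1000; } else if ($3 == "sec") { print $2 } else { print } }'
0.001262
1.530
```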
Then manually convert any values reported with "min" (minutes) timing to seconds and save the result to `tmp.txt`.
Then convert it to JSON format:
```
awk '{
if (i % 3 == 0) { a = $1 }
else if (i % 3 == 1) { b = $1 }
else if (i % 3 == 2) { c = $1; print "[" a ", " b ", " c "]," };
++i; }' < tmp.txt
```
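Each group of three lines becomes one JSON array. For example, given the three timings of a single query (hypothetical values in seconds):
```
$ printf '1.530\n1.489\n1.490\n' | awk '{
  if (i % 3 == 0) { a = $1 }
  else if (i % 3 == 1) { b = $1 }
  else if (i % 3 == 2) { c = $1; print "[" a ", " b ", " c "]," };
  ++i; }'
[1.530, 1.489, 1.490],
```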
And paste to `/website/benchmark/dbms/results/005_monetdb.json` in the repository.
View File
@ -1,341 +0,0 @@
3
SELECT count(*) FROM hits;
1 tuple
clk: 1.262 ms
1 tuple
clk: 1.420 ms
1 tuple
clk: 1.190 ms
3
SELECT count(*) FROM hits WHERE "AdvEngineID" <> 0;
1 tuple
clk: 1.530 sec
1 tuple
clk: 1.489 sec
1 tuple
clk: 1.490 sec
3
SELECT sum("AdvEngineID"), count(*), avg("ResolutionWidth") FROM hits;
1 tuple
clk: 597.512 ms
1 tuple
clk: 579.383 ms
1 tuple
clk: 598.220 ms
3
SELECT sum("UserID") FROM hits;
overflow in calculation.
clk: 568.003 ms
overflow in calculation.
clk: 554.572 ms
overflow in calculation.
clk: 552.076 ms
3
SELECT COUNT(DISTINCT "UserID") FROM hits;
1 tuple
clk: 6.688 sec
1 tuple
clk: 6.689 sec
1 tuple
clk: 6.652 sec
3
SELECT COUNT(DISTINCT "SearchPhrase") FROM hits;
1 tuple
clk: 15.702 sec
1 tuple
clk: 17.189 sec
1 tuple
clk: 15.514 sec
3
SELECT min("EventDate"), max("EventDate") FROM hits;
1 tuple
clk: 697.770 ms
1 tuple
clk: 711.870 ms
1 tuple
clk: 697.177 ms
3
SELECT "AdvEngineID", count(*) FROM hits WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY count(*) DESC;
18 tuples
clk: 1.536 sec
18 tuples
clk: 1.505 sec
18 tuples
clk: 1.492 sec
3
SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM hits GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
10 tuples
clk: 9.965 sec
10 tuples
clk: 10.106 sec
10 tuples
clk: 10.136 sec
3
SELECT "RegionID", sum("AdvEngineID"), count(*) AS c, avg("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM hits GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
10 tuples
clk: 8.329 sec
10 tuples
clk: 8.601 sec
10 tuples
clk: 8.039 sec
3
SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
10 tuples
clk: 3.385 sec
10 tuples
clk: 3.321 sec
10 tuples
clk: 3.326 sec
3
SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
10 tuples
clk: 3.510 sec
10 tuples
clk: 3.431 sec
10 tuples
clk: 3.382 sec
3
SELECT "SearchPhrase", count(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
10 tuples
clk: 10.891 sec
10 tuples
clk: 11.483 sec
10 tuples
clk: 10.352 sec
3
SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
10 tuples
clk: 15.711 sec
10 tuples
clk: 15.444 sec
10 tuples
clk: 15.503 sec
3
SELECT "SearchEngineID", "SearchPhrase", count(*) AS c FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
10 tuples
clk: 11.433 sec
10 tuples
clk: 11.399 sec
10 tuples
clk: 11.285 sec
3
SELECT "UserID", count(*) FROM hits GROUP BY "UserID" ORDER BY count(*) DESC LIMIT 10;
10 tuples
clk: 7.184 sec
10 tuples
clk: 7.015 sec
10 tuples
clk: 6.849 sec
3
SELECT "UserID", "SearchPhrase", count(*) FROM hits GROUP BY "UserID", "SearchPhrase" ORDER BY count(*) DESC LIMIT 10;
10 tuples
clk: 29.096 sec
10 tuples
clk: 28.328 sec
10 tuples
clk: 29.247 sec
3
SELECT "UserID", "SearchPhrase", count(*) FROM hits GROUP BY "UserID", "SearchPhrase" LIMIT 10;
10 tuples
clk: 29.457 sec
10 tuples
clk: 29.364 sec
10 tuples
clk: 29.269 sec
3
SELECT "UserID", extract(minute FROM "EventTime") AS m, "SearchPhrase", count(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY count(*) DESC LIMIT 10;
10 tuples
clk: 47.141 sec
10 tuples
clk: 46.495 sec
10 tuples
clk: 46.472 sec
3
SELECT "UserID" FROM hits WHERE "UserID" = -6101065172474983726;
0 tuples
clk: 783.332 ms
0 tuples
clk: 771.157 ms
0 tuples
clk: 783.082 ms
3
SELECT count(*) FROM hits WHERE "URL" LIKE '%metrika%';
1 tuple
clk: 3.963 sec
1 tuple
clk: 3.930 sec
1 tuple
clk: 3.964 sec
3
SELECT "SearchPhrase", min("URL"), count(*) AS c FROM hits WHERE "URL" LIKE '%metrika%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
10 tuples
clk: 3.925 sec
10 tuples
clk: 3.817 sec
10 tuples
clk: 3.802 sec
3
SELECT "SearchPhrase", min("URL"), min("Title"), count(*) AS c, COUNT(DISTINCT "UserID") FROM hits WHERE "Title" LIKE '%Яндекс%' AND "URL" NOT LIKE '%.yandex.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
10 tuples
clk: 6.067 sec
10 tuples
clk: 6.120 sec
10 tuples
clk: 6.012 sec
3
SELECT * FROM hits WHERE "URL" LIKE '%metrika%' ORDER BY "EventTime" LIMIT 10;
10 tuples !87 columns dropped, 29 fields truncated!
clk: 4.251 sec
10 tuples !87 columns dropped, 29 fields truncated!
clk: 4.190 sec
10 tuples !87 columns dropped, 29 fields truncated!
clk: 4.379 sec
3
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
10 tuples
clk: 6.699 sec
10 tuples
clk: 6.718 sec
10 tuples
clk: 6.802 sec
3
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
10 tuples
clk: 6.887 sec
10 tuples
clk: 6.838 sec
10 tuples
clk: 6.844 sec
3
SELECT "SearchPhrase" FROM hits WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
10 tuples
clk: 6.806 sec
10 tuples
clk: 6.878 sec
10 tuples
clk: 6.807 sec
3
SELECT "CounterID", avg(length("URL")) AS l, count(*) AS c FROM hits WHERE "URL" <> '' GROUP BY "CounterID" HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
25 tuples
clk: 1:01 min
25 tuples
clk: 55.553 sec
25 tuples
clk: 56.188 sec
3
SELECT sys.getdomain("Referer") AS key, avg(length("Referer")) AS l, count(*) AS c, min("Referer") FROM hits WHERE "Referer" <> '' GROUP BY key HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
clk: 1:00 min
clk: 1:00 min
clk: 1:00 min
3
SELECT sum("ResolutionWidth"), sum("ResolutionWidth" + 1), sum("ResolutionWidth" + 2), sum("ResolutionWidth" + 3), sum("ResolutionWidth" + 4), sum("ResolutionWidth" + 5), sum("ResolutionWidth" + 6), sum("ResolutionWidth" + 7), sum("ResolutionWidth" + 8), sum("ResolutionWidth" + 9), sum("ResolutionWidth" + 10), sum("ResolutionWidth" + 11), sum("ResolutionWidth" + 12), sum("ResolutionWidth" + 13), sum("ResolutionWidth" + 14), sum("ResolutionWidth" + 15), sum("ResolutionWidth" + 16), sum("ResolutionWidth" + 17), sum("ResolutionWidth" + 18), sum("ResolutionWidth" + 19), sum("ResolutionWidth" + 20), sum("ResolutionWidth" + 21), sum("ResolutionWidth" + 22), sum("ResolutionWidth" + 23), sum("ResolutionWidth" + 24), sum("ResolutionWidth" + 25), sum("ResolutionWidth" + 26), sum("ResolutionWidth" + 27), sum("ResolutionWidth" + 28), sum("ResolutionWidth" + 29), sum("ResolutionWidth" + 30), sum("ResolutionWidth" + 31), sum("ResolutionWidth" + 32), sum("ResolutionWidth" + 33), sum("ResolutionWidth" + 34), sum("ResolutionWidth" + 35), sum("ResolutionWidth" + 36), sum("ResolutionWidth" + 37), sum("ResolutionWidth" + 38), sum("ResolutionWidth" + 39), sum("ResolutionWidth" + 40), sum("ResolutionWidth" + 41), sum("ResolutionWidth" + 42), sum("ResolutionWidth" + 43), sum("ResolutionWidth" + 44), sum("ResolutionWidth" + 45), sum("ResolutionWidth" + 46), sum("ResolutionWidth" + 47), sum("ResolutionWidth" + 48), sum("ResolutionWidth" + 49), sum("ResolutionWidth" + 50), sum("ResolutionWidth" + 51), sum("ResolutionWidth" + 52), sum("ResolutionWidth" + 53), sum("ResolutionWidth" + 54), sum("ResolutionWidth" + 55), sum("ResolutionWidth" + 56), sum("ResolutionWidth" + 57), sum("ResolutionWidth" + 58), sum("ResolutionWidth" + 59), sum("ResolutionWidth" + 60), sum("ResolutionWidth" + 61), sum("ResolutionWidth" + 62), sum("ResolutionWidth" + 63), sum("ResolutionWidth" + 64), sum("ResolutionWidth" + 65), sum("ResolutionWidth" + 66), sum("ResolutionWidth" + 67), sum("ResolutionWidth" + 68), sum("ResolutionWidth" + 69), sum("ResolutionWidth" + 70), sum("ResolutionWidth" + 71), sum("ResolutionWidth" + 72), sum("ResolutionWidth" + 73), sum("ResolutionWidth" + 74), sum("ResolutionWidth" + 75), sum("ResolutionWidth" + 76), sum("ResolutionWidth" + 77), sum("ResolutionWidth" + 78), sum("ResolutionWidth" + 79), sum("ResolutionWidth" + 80), sum("ResolutionWidth" + 81), sum("ResolutionWidth" + 82), sum("ResolutionWidth" + 83), sum("ResolutionWidth" + 84), sum("ResolutionWidth" + 85), sum("ResolutionWidth" + 86), sum("ResolutionWidth" + 87), sum("ResolutionWidth" + 88), sum("ResolutionWidth" + 89) FROM hits;
1 tuple !77 columns dropped!
clk: 6.221 sec
1 tuple !77 columns dropped!
clk: 6.170 sec
1 tuple !77 columns dropped!
clk: 6.382 sec
3
SELECT "SearchEngineID", "ClientIP", count(*) AS c, sum("Refresh"), avg("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
10 tuples
clk: 5.684 sec
10 tuples
clk: 5.585 sec
10 tuples
clk: 5.463 sec
3
SELECT "WatchID", "ClientIP", count(*) AS c, sum("Refresh"), avg("ResolutionWidth") FROM hits WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
10 tuples
clk: 6.281 sec
10 tuples
clk: 6.574 sec
10 tuples
clk: 6.243 sec
3
SELECT "WatchID", "ClientIP", count(*) AS c, sum("Refresh"), avg("ResolutionWidth") FROM hits GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
10 tuples
clk: 44.641 sec
10 tuples
clk: 41.904 sec
10 tuples
clk: 43.218 sec
3
SELECT "URL", count(*) AS c FROM hits GROUP BY "URL" ORDER BY c DESC LIMIT 10;
10 tuples
clk: 1:24 min
10 tuples
clk: 1:31 min
10 tuples
clk: 1:24 min
3
SELECT 1, "URL", count(*) AS c FROM hits GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
10 tuples
clk: 1:24 min
10 tuples
clk: 1:25 min
10 tuples
clk: 1:24 min
3
SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, count(*) AS c FROM hits GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
10 tuples
clk: 26.438 sec
10 tuples
clk: 26.033 sec
10 tuples
clk: 26.147 sec
3
SELECT "URL", count(*) AS "PageViews" FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "Refresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY "PageViews" DESC LIMIT 10;
10 tuples
clk: 4.825 sec
10 tuples
clk: 4.618 sec
10 tuples
clk: 4.623 sec
3
SELECT "Title", count(*) AS "PageViews" FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "Refresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY "PageViews" DESC LIMIT 10;
10 tuples
clk: 4.380 sec
10 tuples
clk: 4.418 sec
10 tuples
clk: 4.413 sec
3
SELECT "URL", count(*) AS "PageViews" FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "Refresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY "PageViews" DESC LIMIT 1000;
1000 tuples
clk: 4.259 sec
1000 tuples
clk: 4.195 sec
1000 tuples
clk: 4.195 sec
3
SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, count(*) AS "PageViews" FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "Refresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END, "URL" ORDER BY "PageViews" DESC LIMIT 1000;
1000 tuples
clk: 3.233 sec
1000 tuples
clk: 3.180 sec
1000 tuples
clk: 3.181 sec
3
SELECT "URLHash", "EventDate", count(*) AS "PageViews" FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "Refresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 686716256552154761 GROUP BY "URLHash", "EventDate" ORDER BY "PageViews" DESC LIMIT 100;
0 tuples
clk: 2.656 sec
0 tuples
clk: 2.557 sec
0 tuples
clk: 2.561 sec
3
SELECT "WindowClientWidth", "WindowClientHeight", count(*) AS "PageViews" FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "Refresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 686716256552154761 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY "PageViews" DESC LIMIT 10000;
0 tuples
clk: 4.161 sec
0 tuples
clk: 4.243 sec
0 tuples
clk: 4.166 sec
3
SELECT DATE_TRUNC('minute', "EventTime") AS "Minute", count(*) AS "PageViews" FROM hits WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-02' AND "Refresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', "EventTime") ORDER BY DATE_TRUNC('minute', "EventTime");
0 tuples
clk: 4.199 sec
0 tuples
clk: 4.211 sec
0 tuples
clk: 4.190 sec
View File
@ -1,43 +0,0 @@
SELECT count(*) FROM {table};
SELECT count(*) FROM {table} WHERE "AdvEngineID" <> 0;
SELECT sum("AdvEngineID"), count(*), avg("ResolutionWidth") FROM {table};
SELECT sum("UserID") FROM {table};
SELECT COUNT(DISTINCT "UserID") FROM {table};
SELECT COUNT(DISTINCT "SearchPhrase") FROM {table};
SELECT min("EventDate"), max("EventDate") FROM {table};
SELECT "AdvEngineID", count(*) FROM {table} WHERE "AdvEngineID" <> 0 GROUP BY "AdvEngineID" ORDER BY count(*) DESC;
SELECT "RegionID", COUNT(DISTINCT "UserID") AS u FROM {table} GROUP BY "RegionID" ORDER BY u DESC LIMIT 10;
SELECT "RegionID", sum("AdvEngineID"), count(*) AS c, avg("ResolutionWidth"), COUNT(DISTINCT "UserID") FROM {table} GROUP BY "RegionID" ORDER BY c DESC LIMIT 10;
SELECT "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM {table} WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
SELECT "MobilePhone", "MobilePhoneModel", COUNT(DISTINCT "UserID") AS u FROM {table} WHERE "MobilePhoneModel" <> '' GROUP BY "MobilePhone", "MobilePhoneModel" ORDER BY u DESC LIMIT 10;
SELECT "SearchPhrase", count(*) AS c FROM {table} WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
SELECT "SearchPhrase", COUNT(DISTINCT "UserID") AS u FROM {table} WHERE "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY u DESC LIMIT 10;
SELECT "SearchEngineID", "SearchPhrase", count(*) AS c FROM {table} WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "SearchPhrase" ORDER BY c DESC LIMIT 10;
SELECT "UserID", count(*) FROM {table} GROUP BY "UserID" ORDER BY count(*) DESC LIMIT 10;
SELECT "UserID", "SearchPhrase", count(*) FROM {table} GROUP BY "UserID", "SearchPhrase" ORDER BY count(*) DESC LIMIT 10;
SELECT "UserID", "SearchPhrase", count(*) FROM {table} GROUP BY "UserID", "SearchPhrase" LIMIT 10;
SELECT "UserID", extract(minute FROM "EventTime") AS m, "SearchPhrase", count(*) FROM {table} GROUP BY "UserID", m, "SearchPhrase" ORDER BY count(*) DESC LIMIT 10;
SELECT "UserID" FROM {table} WHERE "UserID" = -6101065172474983726;
SELECT count(*) FROM {table} WHERE "URL" LIKE '%metrika%';
SELECT "SearchPhrase", min("URL"), count(*) AS c FROM {table} WHERE "URL" LIKE '%metrika%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
SELECT "SearchPhrase", min("URL"), min("Title"), count(*) AS c, COUNT(DISTINCT "UserID") FROM {table} WHERE "Title" LIKE '%Яндекс%' AND "URL" NOT LIKE '%.yandex.%' AND "SearchPhrase" <> '' GROUP BY "SearchPhrase" ORDER BY c DESC LIMIT 10;
SELECT * FROM {table} WHERE "URL" LIKE '%metrika%' ORDER BY "EventTime" LIMIT 10;
SELECT "SearchPhrase" FROM {table} WHERE "SearchPhrase" <> '' ORDER BY "EventTime" LIMIT 10;
SELECT "SearchPhrase" FROM {table} WHERE "SearchPhrase" <> '' ORDER BY "SearchPhrase" LIMIT 10;
SELECT "SearchPhrase" FROM {table} WHERE "SearchPhrase" <> '' ORDER BY "EventTime", "SearchPhrase" LIMIT 10;
SELECT "CounterID", avg(length("URL")) AS l, count(*) AS c FROM {table} WHERE "URL" <> '' GROUP BY "CounterID" HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT sys.getdomain("Referer") AS key, avg(length("Referer")) AS l, count(*) AS c, min("Referer") FROM {table} WHERE "Referer" <> '' GROUP BY key HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT sum("ResolutionWidth"), sum("ResolutionWidth" + 1), sum("ResolutionWidth" + 2), sum("ResolutionWidth" + 3), sum("ResolutionWidth" + 4), sum("ResolutionWidth" + 5), sum("ResolutionWidth" + 6), sum("ResolutionWidth" + 7), sum("ResolutionWidth" + 8), sum("ResolutionWidth" + 9), sum("ResolutionWidth" + 10), sum("ResolutionWidth" + 11), sum("ResolutionWidth" + 12), sum("ResolutionWidth" + 13), sum("ResolutionWidth" + 14), sum("ResolutionWidth" + 15), sum("ResolutionWidth" + 16), sum("ResolutionWidth" + 17), sum("ResolutionWidth" + 18), sum("ResolutionWidth" + 19), sum("ResolutionWidth" + 20), sum("ResolutionWidth" + 21), sum("ResolutionWidth" + 22), sum("ResolutionWidth" + 23), sum("ResolutionWidth" + 24), sum("ResolutionWidth" + 25), sum("ResolutionWidth" + 26), sum("ResolutionWidth" + 27), sum("ResolutionWidth" + 28), sum("ResolutionWidth" + 29), sum("ResolutionWidth" + 30), sum("ResolutionWidth" + 31), sum("ResolutionWidth" + 32), sum("ResolutionWidth" + 33), sum("ResolutionWidth" + 34), sum("ResolutionWidth" + 35), sum("ResolutionWidth" + 36), sum("ResolutionWidth" + 37), sum("ResolutionWidth" + 38), sum("ResolutionWidth" + 39), sum("ResolutionWidth" + 40), sum("ResolutionWidth" + 41), sum("ResolutionWidth" + 42), sum("ResolutionWidth" + 43), sum("ResolutionWidth" + 44), sum("ResolutionWidth" + 45), sum("ResolutionWidth" + 46), sum("ResolutionWidth" + 47), sum("ResolutionWidth" + 48), sum("ResolutionWidth" + 49), sum("ResolutionWidth" + 50), sum("ResolutionWidth" + 51), sum("ResolutionWidth" + 52), sum("ResolutionWidth" + 53), sum("ResolutionWidth" + 54), sum("ResolutionWidth" + 55), sum("ResolutionWidth" + 56), sum("ResolutionWidth" + 57), sum("ResolutionWidth" + 58), sum("ResolutionWidth" + 59), sum("ResolutionWidth" + 60), sum("ResolutionWidth" + 61), sum("ResolutionWidth" + 62), sum("ResolutionWidth" + 63), sum("ResolutionWidth" + 64), sum("ResolutionWidth" + 65), sum("ResolutionWidth" + 66), sum("ResolutionWidth" + 67), sum("ResolutionWidth" + 68), sum("ResolutionWidth" + 69), sum("ResolutionWidth" + 70), sum("ResolutionWidth" + 71), sum("ResolutionWidth" + 72), sum("ResolutionWidth" + 73), sum("ResolutionWidth" + 74), sum("ResolutionWidth" + 75), sum("ResolutionWidth" + 76), sum("ResolutionWidth" + 77), sum("ResolutionWidth" + 78), sum("ResolutionWidth" + 79), sum("ResolutionWidth" + 80), sum("ResolutionWidth" + 81), sum("ResolutionWidth" + 82), sum("ResolutionWidth" + 83), sum("ResolutionWidth" + 84), sum("ResolutionWidth" + 85), sum("ResolutionWidth" + 86), sum("ResolutionWidth" + 87), sum("ResolutionWidth" + 88), sum("ResolutionWidth" + 89) FROM {table};
SELECT "SearchEngineID", "ClientIP", count(*) AS c, sum("Refresh"), avg("ResolutionWidth") FROM {table} WHERE "SearchPhrase" <> '' GROUP BY "SearchEngineID", "ClientIP" ORDER BY c DESC LIMIT 10;
SELECT "WatchID", "ClientIP", count(*) AS c, sum("Refresh"), avg("ResolutionWidth") FROM {table} WHERE "SearchPhrase" <> '' GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
SELECT "WatchID", "ClientIP", count(*) AS c, sum("Refresh"), avg("ResolutionWidth") FROM {table} GROUP BY "WatchID", "ClientIP" ORDER BY c DESC LIMIT 10;
SELECT "URL", count(*) AS c FROM {table} GROUP BY "URL" ORDER BY c DESC LIMIT 10;
SELECT 1, "URL", count(*) AS c FROM {table} GROUP BY 1, "URL" ORDER BY c DESC LIMIT 10;
SELECT "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3, count(*) AS c FROM {table} GROUP BY "ClientIP", "ClientIP" - 1, "ClientIP" - 2, "ClientIP" - 3 ORDER BY c DESC LIMIT 10;
SELECT "URL", count(*) AS "PageViews" FROM {table} WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "Refresh" = 0 AND "URL" <> '' GROUP BY "URL" ORDER BY "PageViews" DESC LIMIT 10;
SELECT "Title", count(*) AS "PageViews" FROM {table} WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "DontCountHits" = 0 AND "Refresh" = 0 AND "Title" <> '' GROUP BY "Title" ORDER BY "PageViews" DESC LIMIT 10;
SELECT "URL", count(*) AS "PageViews" FROM {table} WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "Refresh" = 0 AND "IsLink" <> 0 AND "IsDownload" = 0 GROUP BY "URL" ORDER BY "PageViews" DESC LIMIT 1000;
SELECT "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END AS Src, "URL" AS Dst, count(*) AS "PageViews" FROM {table} WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "Refresh" = 0 GROUP BY "TraficSourceID", "SearchEngineID", "AdvEngineID", CASE WHEN ("SearchEngineID" = 0 AND "AdvEngineID" = 0) THEN "Referer" ELSE '' END, "URL" ORDER BY "PageViews" DESC LIMIT 1000;
SELECT "URLHash", "EventDate", count(*) AS "PageViews" FROM {table} WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "Refresh" = 0 AND "TraficSourceID" IN (-1, 6) AND "RefererHash" = 686716256552154761 GROUP BY "URLHash", "EventDate" ORDER BY "PageViews" DESC LIMIT 100;
SELECT "WindowClientWidth", "WindowClientHeight", count(*) AS "PageViews" FROM {table} WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-31' AND "Refresh" = 0 AND "DontCountHits" = 0 AND "URLHash" = 686716256552154761 GROUP BY "WindowClientWidth", "WindowClientHeight" ORDER BY "PageViews" DESC LIMIT 10000;
SELECT DATE_TRUNC('minute', "EventTime") AS "Minute", count(*) AS "PageViews" FROM {table} WHERE "CounterID" = 62 AND "EventDate" >= '2013-07-01' AND "EventDate" <= '2013-07-02' AND "Refresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', "EventTime") ORDER BY DATE_TRUNC('minute', "EventTime");
View File
@ -1,19 +0,0 @@
#!/usr/bin/expect
# Set timeout
set timeout 600
# Get arguments
set query [lindex $argv 0]
spawn mclient -u monetdb -d test --timer=clock
expect "password:"
send "monetdb\r"
expect "sql>"
send "$query;\r"
expect "sql>"
send "\\q\r"
expect eof
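# Usage (assuming this script is saved as send-query and made executable):
#   ./send-query 'SELECT count(*) FROM hits' 2>&1 | grep clk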
File diff suppressed because it is too large
View File

@ -1,17 +0,0 @@
#!/bin/bash

# Run every non-comment query from queries.sql against the `hits` table.
grep -v -P '^#' queries.sql | sed -e 's/{table}/hits/' | while read query; do
    # Drop the OS page cache and restart the server so that the first run is cold.
    echo 3 | sudo tee /proc/sys/vm/drop_caches
    sudo systemctl restart omnisci_server
    # Poll until the server accepts queries again.
    for i in {1..1000}; do
        /opt/omnisci/bin/omnisql -t -p HyperInteractive <<< "SELECT 1;" 2>&1 | grep -q '1 rows returned' && break;
        sleep 0.1;
    done
    sleep 10;
    echo "$query";
    # Three runs per query: the first is cold, the next two are warm.
    for i in {1..3}; do
        /opt/omnisci/bin/omnisql -t -p HyperInteractive <<< "$query" 2>&1 | grep -P 'Exception:|Execution time:';
    done;
done;
View File
@ -1,332 +0,0 @@
# Instructions for running the OmniSci benchmark on the web-analytics dataset
OmniSci (formerly named "MapD") is an open-source (open-core) in-memory analytical DBMS with support for GPU processing.
It can also run on CPU without a GPU, and it can show competitive performance on simple queries (such as a simple aggregation on a single column).
# How to install
https://docs.omnisci.com/installation-and-configuration/installation/installing-on-ubuntu
# Caveats
- Dataset (at least the needed columns) must fit in memory.
- It does not support data compression (only dictionary encoding for strings).
- The first query execution is very slow, because uncompressed data is read from disk.
- It does not support indexes for quick range queries.
- It does not support NOT NULL for data types.
- It does not support BLOB.
- No support for UNSIGNED data types (which is OK according to the SQL standard).
- Lack of string processing functions.
- Strings are limited to 32767 bytes.
- GROUP BY on a text data type is supported only if it has dictionary encoding.
`Exception: Cannot group by string columns which are not dictionary encoded`
- Some aggregate functions are not supported for strings at all.
`Aggregate on TEXT is not supported yet.`
- Sometimes I hit a bug where a query runs in an infinite loop and does not finish (after a retry it finished successfully).
- One query took hours to execute even with retries.
- Sorting is slow and is disabled with default settings for large result sets.
`Exception: Sorting the result would be too slow`
`Cast from dictionary-encoded string to none-encoded would be slow`
- There is an approximate count distinct function, but its precision is not documented.
To enable sorting of large result sets, see:
https://stackoverflow.com/questions/62977734/omnissci-sorting-the-result-would-be-too-slow
The list of known issues is here:
https://github.com/omnisci/omniscidb/issues?q=is%3Aissue+author%3Aalexey-milovidov
# How to prepare data
Download the 100 million row dataset from here and insert it into ClickHouse:
https://clickhouse.tech/docs/en/getting-started/example-datasets/metrica/
Convert the CREATE TABLE query:
```
clickhouse-client --query "SHOW CREATE TABLE hits_100m" --format TSVRaw |
tr '`' '"' |
sed -r -e '
s/U?Int64/BIGINT/;
s/U?Int32/INTEGER/;
s/U?Int16/SMALLINT/;
s/U?Int8/TINYINT/;
s/DateTime/TIMESTAMP ENCODING FIXED(32)/;
s/ Date/ DATE ENCODING DAYS(16)/;
s/FixedString\(2\)/TEXT ENCODING DICT(16)/;
s/FixedString\(3\)/TEXT ENCODING DICT/;
s/FixedString\(\d+\)/TEXT ENCODING DICT/;
s/String/TEXT ENCODING DICT/;'
```
And cut off the `ENGINE` part.
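As a sanity check, a single line of the ClickHouse schema goes through the substitutions like this (a hypothetical one-line input):
```
$ echo '`WatchID` UInt64,' | tr '`' '"' | sed -r -e 's/U?Int64/BIGINT/'
"WatchID" BIGINT,
```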
The resulting CREATE TABLE query:
```
CREATE TABLE hits
(
"WatchID" BIGINT,
"JavaEnable" TINYINT,
"Title" TEXT ENCODING DICT,
"GoodEvent" SMALLINT,
"EventTime" TIMESTAMP ENCODING FIXED(32),
"EventDate" ENCODING DAYS(16) Date,
"CounterID" INTEGER,
"ClientIP" INTEGER,
"RegionID" INTEGER,
"UserID" BIGINT,
"CounterClass" TINYINT,
"OS" TINYINT,
"UserAgent" TINYINT,
"URL" TEXT ENCODING DICT,
"Referer" TEXT ENCODING DICT,
"Refresh" TINYINT,
"RefererCategoryID" SMALLINT,
"RefererRegionID" INTEGER,
"URLCategoryID" SMALLINT,
"URLRegionID" INTEGER,
"ResolutionWidth" SMALLINT,
"ResolutionHeight" SMALLINT,
"ResolutionDepth" TINYINT,
"FlashMajor" TINYINT,
"FlashMinor" TINYINT,
"FlashMinor2" TEXT ENCODING DICT,
"NetMajor" TINYINT,
"NetMinor" TINYINT,
"UserAgentMajor" SMALLINT,
"UserAgentMinor" TEXT ENCODING DICT(16),
"CookieEnable" TINYINT,
"JavascriptEnable" TINYINT,
"IsMobile" TINYINT,
"MobilePhone" TINYINT,
"MobilePhoneModel" TEXT ENCODING DICT,
"Params" TEXT ENCODING DICT,
"IPNetworkID" INTEGER,
"TraficSourceID" TINYINT,
"SearchEngineID" SMALLINT,
"SearchPhrase" TEXT ENCODING DICT,
"AdvEngineID" TINYINT,
"IsArtifical" TINYINT,
"WindowClientWidth" SMALLINT,
"WindowClientHeight" SMALLINT,
"ClientTimeZone" SMALLINT,
"ClientEventTime" TIMESTAMP ENCODING FIXED(32),
"SilverlightVersion1" TINYINT,
"SilverlightVersion2" TINYINT,
"SilverlightVersion3" INTEGER,
"SilverlightVersion4" SMALLINT,
"PageCharset" TEXT ENCODING DICT,
"CodeVersion" INTEGER,
"IsLink" TINYINT,
"IsDownload" TINYINT,
"IsNotBounce" TINYINT,
"FUniqID" BIGINT,
"OriginalURL" TEXT ENCODING DICT,
"HID" INTEGER,
"IsOldCounter" TINYINT,
"IsEvent" TINYINT,
"IsParameter" TINYINT,
"DontCountHits" TINYINT,
"WithHash" TINYINT,
"HitColor" TEXT ENCODING DICT(8),
"LocalEventTime" TIMESTAMP ENCODING FIXED(32),
"Age" TINYINT,
"Sex" TINYINT,
"Income" TINYINT,
"Interests" SMALLINT,
"Robotness" TINYINT,
"RemoteIP" INTEGER,
"WindowName" INTEGER,
"OpenerName" INTEGER,
"HistoryLength" SMALLINT,
"BrowserLanguage" TEXT ENCODING DICT(16),
"BrowserCountry" TEXT ENCODING DICT(16),
"SocialNetwork" TEXT ENCODING DICT,
"SocialAction" TEXT ENCODING DICT,
"HTTPError" SMALLINT,
"SendTiming" INTEGER,
"DNSTiming" INTEGER,
"ConnectTiming" INTEGER,
"ResponseStartTiming" INTEGER,
"ResponseEndTiming" INTEGER,
"FetchTiming" INTEGER,
"SocialSourceNetworkID" TINYINT,
"SocialSourcePage" TEXT ENCODING DICT,
"ParamPrice" BIGINT,
"ParamOrderID" TEXT ENCODING DICT,
"ParamCurrency" TEXT ENCODING DICT,
"ParamCurrencyID" SMALLINT,
"OpenstatServiceName" TEXT ENCODING DICT,
"OpenstatCampaignID" TEXT ENCODING DICT,
"OpenstatAdID" TEXT ENCODING DICT,
"OpenstatSourceID" TEXT ENCODING DICT,
"UTMSource" TEXT ENCODING DICT,
"UTMMedium" TEXT ENCODING DICT,
"UTMCampaign" TEXT ENCODING DICT,
"UTMContent" TEXT ENCODING DICT,
"UTMTerm" TEXT ENCODING DICT,
"FromTag" TEXT ENCODING DICT,
"HasGCLID" TINYINT,
"RefererHash" BIGINT,
"URLHash" BIGINT,
"CLID" INTEGER
);
```
Convert the dataset, prepare the list of fields for SELECT:
```
clickhouse-client --query "SHOW CREATE TABLE hits_100m" --format TSVRaw |
tr '`' '"' |
sed -r -e '
s/"(\w+)" U?Int([0-9]+)/toInt\2(\1)/;
s/"(\w+)" (Fixed)?String(\([0-9]+\))?/toValidUTF8(toString(\1))/;
s/"(\w+)" \w+/\1/'
```
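Again, a sanity check on a single line (hypothetical input):
```
$ echo '`WatchID` UInt64,' | tr '`' '"' | sed -r -e 's/"(\w+)" U?Int([0-9]+)/toInt\2(\1)/'
toInt64(WatchID),
```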
The resulting SELECT query for data preparation:
```
SELECT
toInt64(WatchID),
toInt8(JavaEnable),
toValidUTF8(toString(Title)),
toInt16(GoodEvent),
EventTime,
EventDate,
toInt32(CounterID),
toInt32(ClientIP),
toInt32(RegionID),
toInt64(UserID),
toInt8(CounterClass),
toInt8(OS),
toInt8(UserAgent),
toValidUTF8(toString(URL)),
toValidUTF8(toString(Referer)),
toInt8(Refresh),
toInt16(RefererCategoryID),
toInt32(RefererRegionID),
toInt16(URLCategoryID),
toInt32(URLRegionID),
toInt16(ResolutionWidth),
toInt16(ResolutionHeight),
toInt8(ResolutionDepth),
toInt8(FlashMajor),
toInt8(FlashMinor),
toValidUTF8(toString(FlashMinor2)),
toInt8(NetMajor),
toInt8(NetMinor),
toInt16(UserAgentMajor),
toValidUTF8(toString(UserAgentMinor)),
toInt8(CookieEnable),
toInt8(JavascriptEnable),
toInt8(IsMobile),
toInt8(MobilePhone),
toValidUTF8(toString(MobilePhoneModel)),
toValidUTF8(toString(Params)),
toInt32(IPNetworkID),
toInt8(TraficSourceID),
toInt16(SearchEngineID),
toValidUTF8(toString(SearchPhrase)),
toInt8(AdvEngineID),
toInt8(IsArtifical),
toInt16(WindowClientWidth),
toInt16(WindowClientHeight),
toInt16(ClientTimeZone),
ClientEventTime,
toInt8(SilverlightVersion1),
toInt8(SilverlightVersion2),
toInt32(SilverlightVersion3),
toInt16(SilverlightVersion4),
toValidUTF8(toString(PageCharset)),
toInt32(CodeVersion),
toInt8(IsLink),
toInt8(IsDownload),
toInt8(IsNotBounce),
toInt64(FUniqID),
toValidUTF8(toString(OriginalURL)),
toInt32(HID),
toInt8(IsOldCounter),
toInt8(IsEvent),
toInt8(IsParameter),
toInt8(DontCountHits),
toInt8(WithHash),
toValidUTF8(toString(HitColor)),
LocalEventTime,
toInt8(Age),
toInt8(Sex),
toInt8(Income),
toInt16(Interests),
toInt8(Robotness),
toInt32(RemoteIP),
toInt32(WindowName),
toInt32(OpenerName),
toInt16(HistoryLength),
toValidUTF8(toString(BrowserLanguage)),
toValidUTF8(toString(BrowserCountry)),
toValidUTF8(toString(SocialNetwork)),
toValidUTF8(toString(SocialAction)),
toInt16(HTTPError),
toInt32(SendTiming),
toInt32(DNSTiming),
toInt32(ConnectTiming),
toInt32(ResponseStartTiming),
toInt32(ResponseEndTiming),
toInt32(FetchTiming),
toInt8(SocialSourceNetworkID),
toValidUTF8(toString(SocialSourcePage)),
toInt64(ParamPrice),
toValidUTF8(toString(ParamOrderID)),
toValidUTF8(toString(ParamCurrency)),
toInt16(ParamCurrencyID),
toValidUTF8(toString(OpenstatServiceName)),
toValidUTF8(toString(OpenstatCampaignID)),
toValidUTF8(toString(OpenstatAdID)),
toValidUTF8(toString(OpenstatSourceID)),
toValidUTF8(toString(UTMSource)),
toValidUTF8(toString(UTMMedium)),
toValidUTF8(toString(UTMCampaign)),
toValidUTF8(toString(UTMContent)),
toValidUTF8(toString(UTMTerm)),
toValidUTF8(toString(FromTag)),
toInt8(HasGCLID),
toInt64(RefererHash),
toInt64(URLHash),
toInt32(CLID)
FROM hits_100m_obfuscated
INTO OUTFILE '/home/milovidov/example_datasets/hits_100m_obfuscated.csv'
FORMAT CSV;
```
Upload the data to OmniSci:
```
/opt/omnisci/bin/omnisql -t -p HyperInteractive
```
Run the CREATE TABLE statement above, then run:
```
COPY hits FROM '/home/milovidov/example_datasets/hits_100m_obfuscated.csv' WITH (HEADER = 'false');
```
Data loading took
```
336639 ms
```
on a server (Linux Ubuntu, Xeon E5-2560v2, 32 logical CPU, 128 GiB RAM, 8xHDD RAID-5, 40 TB).
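Before running the benchmark, it is worth sanity-checking the row count (the expected figure is 100 million, assuming the load finished without errors):
```
echo "SELECT count(*) FROM hits;" | /opt/omnisci/bin/omnisql -t -p HyperInteractive
```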
Run the benchmark (the postprocessing step below reads `log.txt`):
```
./benchmark.sh | tee log.txt
```
Prepare the result to paste into JSON:
```
grep -oP 'Total time: \d+' log.txt |
grep -oP '\d+' |
awk '{
if (i % 3 == 0) { a = $1 }
else if (i % 3 == 1) { b = $1 }
else if (i % 3 == 2) { c = $1; print "[" a / 1000 ", " b / 1000 ", " c / 1000 "]," };
++i; }'
```
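For a single query's three `Total time` values, the output looks like this (hypothetical millisecond values):
```
$ printf 'Total time: 23471\nTotal time: 43\nTotal time: 35\n' | grep -oP '\d+' | awk '{
  if (i % 3 == 0) { a = $1 }
  else if (i % 3 == 1) { b = $1 }
  else if (i % 3 == 2) { c = $1; print "[" a / 1000 ", " b / 1000 ", " c / 1000 "]," };
  ++i; }'
[23.471, 0.043, 0.035],
```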
And fill out `[null, null, null]` for missing runs.
View File
@ -1,210 +0,0 @@
3
SELECT count(*) FROM hits;
Execution time: 23471 ms, Total time: 23471 ms
Execution time: 42 ms, Total time: 43 ms
Execution time: 35 ms, Total time: 35 ms
3
SELECT count(*) FROM hits WHERE AdvEngineID != 0;
Execution time: 17328 ms, Total time: 17329 ms
Execution time: 58 ms, Total time: 59 ms
Execution time: 57 ms, Total time: 59 ms
3
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM hits;
Execution time: 17309 ms, Total time: 17310 ms
Execution time: 115 ms, Total time: 115 ms
Execution time: 129 ms, Total time: 130 ms
3
SELECT sum(UserID) FROM hits;
Execution time: 26091 ms, Total time: 26091 ms
Execution time: 88 ms, Total time: 89 ms
Execution time: 71 ms, Total time: 72 ms
3
SELECT APPROX_COUNT_DISTINCT(UserID) FROM hits;
Execution time: 21720 ms, Total time: 21720 ms
Execution time: 364 ms, Total time: 364 ms
Execution time: 344 ms, Total time: 345 ms
3
SELECT APPROX_COUNT_DISTINCT(SearchPhrase) FROM hits;
Execution time: 19314 ms, Total time: 19315 ms
Execution time: 385 ms, Total time: 386 ms
Execution time: 382 ms, Total time: 382 ms
3
SELECT min(EventDate), max(EventDate) FROM hits;
Execution time: 19431 ms, Total time: 19432 ms
Execution time: 130 ms, Total time: 131 ms
Execution time: 147 ms, Total time: 148 ms
3
SELECT AdvEngineID, count(*) FROM hits WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
Execution time: 20660 ms, Total time: 20661 ms
Execution time: 63 ms, Total time: 64 ms
Execution time: 88 ms, Total time: 89 ms
3
SELECT RegionID, APPROX_COUNT_DISTINCT(UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;
Execution time: 21364 ms, Total time: 21472 ms
Execution time: 1387 ms, Total time: 1504 ms
Execution time: 1443 ms, Total time: 1505 ms
3
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), APPROX_COUNT_DISTINCT(UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;
Execution time: 22205 ms, Total time: 22285 ms
Execution time: 1590 ms, Total time: 1655 ms
Execution time: 1591 ms, Total time: 1658 ms
3
SELECT MobilePhoneModel, APPROX_COUNT_DISTINCT(UserID) AS u FROM hits WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
Execution time: 22343 ms, Total time: 22344 ms
Execution time: 122 ms, Total time: 123 ms
Execution time: 117 ms, Total time: 118 ms
3
SELECT MobilePhone, MobilePhoneModel, APPROX_COUNT_DISTINCT(UserID) AS u FROM hits WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
Execution time: 21681 ms, Total time: 21695 ms
Execution time: 299 ms, Total time: 310 ms
Execution time: 275 ms, Total time: 292 ms
3
SELECT SearchPhrase, count(*) AS c FROM hits WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
Execution time: 23346 ms, Total time: 23360 ms
Execution time: 613 ms, Total time: 631 ms
Execution time: 606 ms, Total time: 624 ms
3
SELECT SearchPhrase, APPROX_COUNT_DISTINCT(UserID) AS u FROM hits WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
Execution time: 66014 ms, Total time: 68618 ms
Execution time: 44309 ms, Total time: 47296 ms
Execution time: 44019 ms, Total time: 46866 ms
3
SELECT SearchEngineID, SearchPhrase, count(*) AS c FROM hits WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
Execution time: 25853 ms, Total time: 25984 ms
Execution time: 2590 ms, Total time: 2728 ms
Execution time: 2652 ms, Total time: 2789 ms
3
SELECT UserID, count(*) FROM hits GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
Execution time: 26581 ms, Total time: 26953 ms
Execution time: 5843 ms, Total time: 6158 ms
Execution time: 5970 ms, Total time: 6286 ms
3
SELECT UserID, SearchPhrase, count(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
Execution time: 33007 ms, Total time: 33581 ms
Execution time: 9943 ms, Total time: 10509 ms
Execution time: 9470 ms, Total time: 10047 ms
3
SELECT UserID, SearchPhrase, count(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;
Execution time: 39009 ms, Total time: 39575 ms
Execution time: 8151 ms, Total time: 8785 ms
Execution time: 8037 ms, Total time: 8665 ms
3
SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, count(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
Execution time: 56207 ms, Total time: 57764 ms
Execution time: 26653 ms, Total time: 28199 ms
Execution time: 25614 ms, Total time: 27336 ms
3
SELECT UserID FROM hits WHERE UserID = -6101065172474983726;
Execution time: 18975 ms, Total time: 18976 ms
Execution time: 136 ms, Total time: 136 ms
Execution time: 136 ms, Total time: 136 ms
3
SELECT count(*) FROM hits WHERE URL LIKE '%metrika%';
Execution time: 32444 ms, Total time: 32445 ms
Execution time: 125 ms, Total time: 126 ms
Execution time: 134 ms, Total time: 136 ms
3
SELECT SearchPhrase, min(URL), count(*) AS c FROM hits WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
Exception: Aggregate on TEXT is not supported yet.
Exception: Aggregate on TEXT is not supported yet.
Exception: Aggregate on TEXT is not supported yet.
3
SELECT SearchPhrase, min(URL), min(Title), count(*) AS c, APPROX_COUNT_DISTINCT(UserID) FROM hits WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
Exception: Aggregate on TEXT is not supported yet.
Exception: Aggregate on TEXT is not supported yet.
Exception: Aggregate on TEXT is not supported yet.
3
SELECT * FROM hits WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
Execution time: 96163 ms, Total time: 96166 ms
Execution time: 312 ms, Total time: 314 ms
Execution time: 303 ms, Total time: 305 ms
3
SELECT SearchPhrase FROM hits WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
Execution time: 27493 ms, Total time: 27494 ms
Execution time: 216 ms, Total time: 216 ms
Execution time: 221 ms, Total time: 222 ms
3
SELECT SearchPhrase FROM hits WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
Execution time: 38230 ms, Total time: 38308 ms
Execution time: 17175 ms, Total time: 17256 ms
Execution time: 17225 ms, Total time: 17310 ms
3
SELECT SearchPhrase FROM hits WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
Execution time: 115614 ms, Total time: 115714 ms
Execution time: 95944 ms, Total time: 96041 ms
Execution time: 94274 ms, Total time: 94383 ms
3
SELECT CounterID, avg(length(URL)) AS l, count(*) AS c FROM hits WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;
Execution time: 31775 ms, Total time: 31779 ms
Execution time: 2643 ms, Total time: 2647 ms
Execution time: 2933 ms, Total time: 2937 ms
3
SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count(*) AS c, min(Referer) FROM hits WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;
Exception: Exception occurred: org.apache.calcite.runtime.CalciteContextException: From line 1, column 8 to line 1, column 36: No match found for function signature domainWithoutWWW(<CHARACTER>)
Exception: Exception occurred: org.apache.calcite.runtime.CalciteContextException: From line 1, column 8 to line 1, column 36: No match found for function signature domainWithoutWWW(<CHARACTER>)
Exception: Exception occurred: org.apache.calcite.runtime.CalciteContextException: From line 1, column 8 to line 1, column 36: No match found for function signature domainWithoutWWW(<CHARACTER>)
3
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM hits;
Execution time: 28853 ms, Total time: 28854 ms
Execution time: 5654 ms, Total time: 5655 ms
Execution time: 5579 ms, Total time: 5581 ms
3
SELECT SearchEngineID, ClientIP, count(*) AS c, sum("Refresh"), avg(ResolutionWidth) FROM hits WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
Execution time: 31694 ms, Total time: 31925 ms
Execution time: 3872 ms, Total time: 4142 ms
Execution time: 3928 ms, Total time: 4162 ms
3
SELECT WatchID, ClientIP, count(*) AS c, sum("Refresh"), avg(ResolutionWidth) FROM hits WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
Execution time: 43690 ms, Total time: 44297 ms
Execution time: 8221 ms, Total time: 8825 ms
Execution time: 8115 ms, Total time: 8711 ms
3
SELECT URL, count(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;
Execution time: 29669 ms, Total time: 29715 ms
Execution time: 1623 ms, Total time: 1669 ms
Execution time: 1534 ms, Total time: 1586 ms
3
SELECT 1, URL, count(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
Execution time: 34860 ms, Total time: 35201 ms
Execution time: 7075 ms, Total time: 7414 ms
Execution time: 7164 ms, Total time: 7567 ms
3
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
Execution time: 26467 ms, Total time: 26724 ms
Execution time: 5740 ms, Total time: 6026 ms
Execution time: 5667 ms, Total time: 5920 ms
3
SELECT URL, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND "Refresh" = 0 AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
Execution time: 31899 ms, Total time: 31908 ms
Execution time: 1141 ms, Total time: 1154 ms
Execution time: 1155 ms, Total time: 1168 ms
3
SELECT Title, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND "Refresh" = 0 AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
Execution time: 27991 ms, Total time: 27997 ms
Execution time: 719 ms, Total time: 724 ms
Execution time: 737 ms, Total time: 744 ms
3
SELECT URL, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND IsLink != 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
Execution time: 34651 ms, Total time: 34661 ms
Execution time: 1182 ms, Total time: 1200 ms
Execution time: 1142 ms, Total time: 1159 ms
3
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
Execution time: 30130 ms, Total time: 30136 ms
Execution time: 461 ms, Total time: 467 ms
Execution time: 445 ms, Total time: 451 ms
3
SELECT URLHash, EventDate, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 686716256552154761 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100;
Execution time: 19989 ms, Total time: 19991 ms
Execution time: 326 ms, Total time: 327 ms
Execution time: 325 ms, Total time: 326 ms
3
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND DontCountHits = 0 AND URLHash = 686716256552154761 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
Execution time: 18658 ms, Total time: 18660 ms
Execution time: 265 ms, Total time: 266 ms
Execution time: 254 ms, Total time: 255 ms
3
SELECT DATE_TRUNC(minute, EventTime) AS "Minute", count(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-02' AND "Refresh" = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC(minute, EventTime) ORDER BY DATE_TRUNC(minute, EventTime);
Execution time: 25225 ms, Total time: 25227 ms
Execution time: 210 ms, Total time: 212 ms
Execution time: 199 ms, Total time: 200 ms
View File
@ -1,43 +0,0 @@
SELECT count(*) FROM {table};
SELECT count(*) FROM {table} WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM {table};
SELECT sum(UserID) FROM {table};
SELECT APPROX_COUNT_DISTINCT(UserID) FROM {table};
SELECT APPROX_COUNT_DISTINCT(SearchPhrase) FROM {table};
SELECT min(EventDate), max(EventDate) FROM {table};
SELECT AdvEngineID, count(*) FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
SELECT RegionID, APPROX_COUNT_DISTINCT(UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), APPROX_COUNT_DISTINCT(UserID) FROM {table} GROUP BY RegionID ORDER BY c DESC LIMIT 10;
SELECT MobilePhoneModel, APPROX_COUNT_DISTINCT(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, APPROX_COUNT_DISTINCT(UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, count(*) AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, APPROX_COUNT_DISTINCT(UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, count(*) AS c FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT UserID, count(*) FROM {table} GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM {table} GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, count(*) FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID FROM {table} WHERE UserID = -6101065172474983726;
SELECT count(*) FROM {table} WHERE URL LIKE '%metrika%';
SELECT SearchPhrase, min(URL), count(*) AS c FROM {table} WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT SearchPhrase, min(URL), min(Title), count(*) AS c, APPROX_COUNT_DISTINCT(UserID) FROM {table} WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
SELECT * FROM {table} WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, avg(length(URL)) AS l, count(*) AS c FROM {table} WHERE URL != '' GROUP BY CounterID HAVING c > 100000 ORDER BY l DESC LIMIT 25;
SELECT domainWithoutWWW(Referer) AS key, avg(length(Referer)) AS l, count(*) AS c, min(Referer) FROM {table} WHERE Referer != '' GROUP BY key HAVING c > 100000 ORDER BY l DESC LIMIT 25;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table};
SELECT SearchEngineID, ClientIP, count(*) AS c, sum("Refresh"), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT WatchID, ClientIP, count(*) AS c, sum("Refresh"), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
#SELECT WatchID, ClientIP, count(*) AS c, sum("Refresh"), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
SELECT URL, count(*) AS c FROM {table} GROUP BY URL ORDER BY c DESC LIMIT 10;
SELECT 1, URL, count(*) AS c FROM {table} GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) AS c FROM {table} GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND "Refresh" = 0 AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND "Refresh" = 0 AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND IsLink != 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 686716256552154761 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100;
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND "Refresh" = 0 AND DontCountHits = 0 AND URLHash = 686716256552154761 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT DATE_TRUNC('minute', EventTime) AS "Minute", count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-02' AND "Refresh" = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime);
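The {table} placeholder in these queries is substituted before execution. A minimal sketch of producing a runnable file for one table (hits_100m is just an example name) while dropping the commented-out query:

sed -e "s/{table}/hits_100m/g" -e '/^#/d' queries.sql > queries_hits_100m.sql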

View File

@ -1,40 +0,0 @@
Quick installation instructions
-------------------------------
Register on my.vertica.com
https://my.vertica.com/download-community-edition/
Download HP Vertica 7.1.1 Analytic Database Server (the Debian/Ubuntu 14.04 version).
sudo apt-get install sysstat pstack mcelog
sudo dpkg -i vertica_7.1.1-0_amd64.deb
sudo sh -c "echo 'export TZ=Europe/Moscow' >> /home/dbadmin/.bash_profile"
# Don't specify localhost due to poor support of IPv6.
sudo /opt/vertica/sbin/install_vertica --hosts=127.0.0.1 --failure-threshold=NONE
sudo mkdir /opt/vertica-data/
sudo chown dbadmin /opt/vertica-data/
sudo su dbadmin
/opt/vertica/bin/adminTools
In the menu: Configuration Menu -> Create Database.
Database name: default
Password: leave empty
Catalog and data directories: /opt/vertica-data/ for both
Then: Main Menu -> Exit.
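If a scripted setup is preferred, admintools can usually create the database non-interactively as well. A sketch; the tool and flag names below are assumptions worth checking against admintools --help for your Vertica version:

sudo su - dbadmin
/opt/vertica/bin/admintools -t create_db -s 127.0.0.1 -d default -c /opt/vertica-data/ -D /opt/vertica-data/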
How to prepare data
-------------------
Prepare dumps with the create_dump.sh script for the tables hits_10m, hits_100m and hits_1000m. This takes about 5 hours in total (1m41.882s, 25m11.103s and 276m36.388s respectively).
Start the vsql command-line client:
/opt/vertica/bin/vsql -U dbadmin
Create the tables with the queries from hits_define_schema.sql.
Time to insert data:
hits_10m: 91 sec.
hits_100m: 774 sec.
hits_1000m: 13769 sec.
Validate the number of rows after each load with SELECT count(*).
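For example, a quick check of hits_10m (vsql accepts psql-style flags, so -c runs a single command):

/opt/vertica/bin/vsql -U dbadmin -c "SELECT count(*) FROM hits_10m;"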

View File

@ -1,24 +0,0 @@
#!/usr/bin/env bash

QUERIES_FILE="queries.sql"
TABLE=$1
TRIES=3

# Substitute the table name into each query and run it TRIES times,
# printing a bracketed list of timings in seconds per query.
sed "s/{table}/${TABLE}/g" "$QUERIES_FILE" | while read -r query; do
    # Drop the page cache before each query so every first try starts cold.
    sync
    echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null

    echo -n "["
    for i in $(seq 1 $TRIES); do
        # With \timing enabled, vsql prints "All rows formatted: <n> ms";
        # extract the number and normalize the decimal separator.
        RES=$((echo '\timing'; echo "$query") |
            /opt/vertica/bin/vsql -U dbadmin |
            grep -oP 'All rows formatted: [^ ]+ ms' |
            sed -r -e 's/^All rows formatted: ([0-9,]+) ms$/\1/' |
            tr ',' '.')
        [[ "$?" == "0" ]] && echo -n "$(perl -e "print ${RES} / 1000")" || echo -n "null"
        [[ "$i" != "$TRIES" ]] && echo -n ", "
    done
    echo "],"
done
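The script takes the table name as its only argument and prints, for each query, a bracketed list of per-try timings in seconds (null when a run fails). A usage sketch; the log file name is arbitrary:

./benchmark.sh hits_100m | tee vertica_hits_100m.log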

View File

@ -1,339 +0,0 @@
\timing
create table hits_10m
(
WatchID INTEGER,
JavaEnable INTEGER,
Title VARCHAR(1024),
GoodEvent INTEGER,
EventTime DATETIME,
EventDate DATE,
CounterID INTEGER,
ClientIP INTEGER,
RegionID INTEGER,
UserID INTEGER,
CounterClass INTEGER,
OS INTEGER,
UserAgent INTEGER,
URL VARCHAR(6072),
Referer VARCHAR(2048),
Refresh INTEGER,
RefererCategoryID INTEGER,
RefererRegionID INTEGER,
URLCategoryID INTEGER,
URLRegionID INTEGER,
ResolutionWidth INTEGER,
ResolutionHeight INTEGER,
ResolutionDepth INTEGER,
FlashMajor INTEGER,
FlashMinor INTEGER,
FlashMinor2 VARCHAR(256),
NetMajor INTEGER,
NetMinor INTEGER,
UserAgentMajor INTEGER,
UserAgentMinor CHAR(2),
CookieEnable INTEGER,
JavascriptEnable INTEGER,
IsMobile INTEGER,
MobilePhone INTEGER,
MobilePhoneModel VARCHAR(80),
Params VARCHAR(2048),
IPNetworkID INTEGER,
TraficSourceID INTEGER,
SearchEngineID INTEGER,
SearchPhrase VARCHAR(1024),
AdvEngineID INTEGER,
IsArtifical INTEGER,
WindowClientWidth INTEGER,
WindowClientHeight INTEGER,
ClientTimeZone INTEGER,
ClientEventTime DATETIME,
SilverlightVersion1 INTEGER,
SilverlightVersion2 INTEGER,
SilverlightVersion3 INTEGER,
SilverlightVersion4 INTEGER,
PageCharset VARCHAR(80),
CodeVersion INTEGER,
IsLink INTEGER,
IsDownload INTEGER,
IsNotBounce INTEGER,
FUniqID INTEGER,
OriginalURL VARCHAR(6072),
HID INTEGER,
IsOldCounter INTEGER,
IsEvent INTEGER,
IsParameter INTEGER,
DontCountHits INTEGER,
WithHash INTEGER,
HitColor CHAR(1),
LocalEventTime DATETIME,
Age INTEGER,
Sex INTEGER,
Income INTEGER,
Interests INTEGER,
Robotness INTEGER,
RemoteIP INTEGER,
WindowName INTEGER,
OpenerName INTEGER,
HistoryLength INTEGER,
BrowserLanguage CHAR(2),
BrowserCountry CHAR(2),
SocialNetwork VARCHAR(128),
SocialAction VARCHAR(128),
HTTPError INTEGER,
SendTiming INTEGER,
DNSTiming INTEGER,
ConnectTiming INTEGER,
ResponseStartTiming INTEGER,
ResponseEndTiming INTEGER,
FetchTiming INTEGER,
SocialSourceNetworkID INTEGER,
SocialSourcePage VARCHAR(128),
ParamPrice INTEGER,
ParamOrderID VARCHAR(80),
ParamCurrency CHAR(3),
ParamCurrencyID INTEGER,
OpenstatServiceName VARCHAR(80),
OpenstatCampaignID VARCHAR(80),
OpenstatAdID VARCHAR(80),
OpenstatSourceID VARCHAR(80),
UTMSource VARCHAR(256),
UTMMedium VARCHAR(256),
UTMCampaign VARCHAR(256),
UTMContent VARCHAR(256),
UTMTerm VARCHAR(256),
FromTag VARCHAR(256),
HasGCLID INTEGER,
RefererHash INTEGER,
URLHash INTEGER,
CLID INTEGER
) ORDER BY CounterID, EventDate, UserID, EventTime;
\set input_file '''/opt/dumps/hits_10m_corrected.tsv'''
COPY hits_10m FROM :input_file DELIMITER E'\t' DIRECT;
create table hits_100m
(
WatchID INTEGER,
JavaEnable INTEGER,
Title VARCHAR(1024),
GoodEvent INTEGER,
EventTime DATETIME,
EventDate DATE,
CounterID INTEGER,
ClientIP INTEGER,
RegionID INTEGER,
UserID INTEGER,
CounterClass INTEGER,
OS INTEGER,
UserAgent INTEGER,
URL VARCHAR(6072),
Referer VARCHAR(2048),
Refresh INTEGER,
RefererCategoryID INTEGER,
RefererRegionID INTEGER,
URLCategoryID INTEGER,
URLRegionID INTEGER,
ResolutionWidth INTEGER,
ResolutionHeight INTEGER,
ResolutionDepth INTEGER,
FlashMajor INTEGER,
FlashMinor INTEGER,
FlashMinor2 VARCHAR(256),
NetMajor INTEGER,
NetMinor INTEGER,
UserAgentMajor INTEGER,
UserAgentMinor CHAR(2),
CookieEnable INTEGER,
JavascriptEnable INTEGER,
IsMobile INTEGER,
MobilePhone INTEGER,
MobilePhoneModel VARCHAR(80),
Params VARCHAR(2048),
IPNetworkID INTEGER,
TraficSourceID INTEGER,
SearchEngineID INTEGER,
SearchPhrase VARCHAR(1024),
AdvEngineID INTEGER,
IsArtifical INTEGER,
WindowClientWidth INTEGER,
WindowClientHeight INTEGER,
ClientTimeZone INTEGER,
ClientEventTime DATETIME,
SilverlightVersion1 INTEGER,
SilverlightVersion2 INTEGER,
SilverlightVersion3 INTEGER,
SilverlightVersion4 INTEGER,
PageCharset VARCHAR(80),
CodeVersion INTEGER,
IsLink INTEGER,
IsDownload INTEGER,
IsNotBounce INTEGER,
FUniqID INTEGER,
OriginalURL VARCHAR(6072),
HID INTEGER,
IsOldCounter INTEGER,
IsEvent INTEGER,
IsParameter INTEGER,
DontCountHits INTEGER,
WithHash INTEGER,
HitColor CHAR(1),
LocalEventTime DATETIME,
Age INTEGER,
Sex INTEGER,
Income INTEGER,
Interests INTEGER,
Robotness INTEGER,
RemoteIP INTEGER,
WindowName INTEGER,
OpenerName INTEGER,
HistoryLength INTEGER,
BrowserLanguage CHAR(2),
BrowserCountry CHAR(2),
SocialNetwork VARCHAR(128),
SocialAction VARCHAR(128),
HTTPError INTEGER,
SendTiming INTEGER,
DNSTiming INTEGER,
ConnectTiming INTEGER,
ResponseStartTiming INTEGER,
ResponseEndTiming INTEGER,
FetchTiming INTEGER,
SocialSourceNetworkID INTEGER,
SocialSourcePage VARCHAR(128),
ParamPrice INTEGER,
ParamOrderID VARCHAR(80),
ParamCurrency CHAR(3),
ParamCurrencyID INTEGER,
OpenstatServiceName VARCHAR(80),
OpenstatCampaignID VARCHAR(80),
OpenstatAdID VARCHAR(80),
OpenstatSourceID VARCHAR(80),
UTMSource VARCHAR(256),
UTMMedium VARCHAR(256),
UTMCampaign VARCHAR(256),
UTMContent VARCHAR(256),
UTMTerm VARCHAR(256),
FromTag VARCHAR(256),
HasGCLID INTEGER,
RefererHash INTEGER,
URLHash INTEGER,
CLID INTEGER
) ORDER BY CounterID, EventDate, UserID, EventTime;
\set input_file '''/opt/dumps/hits_100m_corrected.tsv'''
COPY hits_100m FROM :input_file DELIMITER E'\t' DIRECT;
create table hits_1000m
(
WatchID INTEGER,
JavaEnable INTEGER,
Title VARCHAR(1024),
GoodEvent INTEGER,
EventTime DATETIME,
EventDate DATE,
CounterID INTEGER,
ClientIP INTEGER,
RegionID INTEGER,
UserID INTEGER,
CounterClass INTEGER,
OS INTEGER,
UserAgent INTEGER,
URL VARCHAR(6072),
Referer VARCHAR(2048),
Refresh INTEGER,
RefererCategoryID INTEGER,
RefererRegionID INTEGER,
URLCategoryID INTEGER,
URLRegionID INTEGER,
ResolutionWidth INTEGER,
ResolutionHeight INTEGER,
ResolutionDepth INTEGER,
FlashMajor INTEGER,
FlashMinor INTEGER,
FlashMinor2 VARCHAR(256),
NetMajor INTEGER,
NetMinor INTEGER,
UserAgentMajor INTEGER,
UserAgentMinor CHAR(2),
CookieEnable INTEGER,
JavascriptEnable INTEGER,
IsMobile INTEGER,
MobilePhone INTEGER,
MobilePhoneModel VARCHAR(80),
Params VARCHAR(2048),
IPNetworkID INTEGER,
TraficSourceID INTEGER,
SearchEngineID INTEGER,
SearchPhrase VARCHAR(1024),
AdvEngineID INTEGER,
IsArtifical INTEGER,
WindowClientWidth INTEGER,
WindowClientHeight INTEGER,
ClientTimeZone INTEGER,
ClientEventTime DATETIME,
SilverlightVersion1 INTEGER,
SilverlightVersion2 INTEGER,
SilverlightVersion3 INTEGER,
SilverlightVersion4 INTEGER,
PageCharset VARCHAR(80),
CodeVersion INTEGER,
IsLink INTEGER,
IsDownload INTEGER,
IsNotBounce INTEGER,
FUniqID INTEGER,
OriginalURL VARCHAR(6072),
HID INTEGER,
IsOldCounter INTEGER,
IsEvent INTEGER,
IsParameter INTEGER,
DontCountHits INTEGER,
WithHash INTEGER,
HitColor CHAR(1),
LocalEventTime DATETIME,
Age INTEGER,
Sex INTEGER,
Income INTEGER,
Interests INTEGER,
Robotness INTEGER,
RemoteIP INTEGER,
WindowName INTEGER,
OpenerName INTEGER,
HistoryLength INTEGER,
BrowserLanguage CHAR(2),
BrowserCountry CHAR(2),
SocialNetwork VARCHAR(128),
SocialAction VARCHAR(128),
HTTPError INTEGER,
SendTiming INTEGER,
DNSTiming INTEGER,
ConnectTiming INTEGER,
ResponseStartTiming INTEGER,
ResponseEndTiming INTEGER,
FetchTiming INTEGER,
SocialSourceNetworkID INTEGER,
SocialSourcePage VARCHAR(128),
ParamPrice INTEGER,
ParamOrderID VARCHAR(80),
ParamCurrency CHAR(3),
ParamCurrencyID INTEGER,
OpenstatServiceName VARCHAR(80),
OpenstatCampaignID VARCHAR(80),
OpenstatAdID VARCHAR(80),
OpenstatSourceID VARCHAR(80),
UTMSource VARCHAR(256),
UTMMedium VARCHAR(256),
UTMCampaign VARCHAR(256),
UTMContent VARCHAR(256),
UTMTerm VARCHAR(256),
FromTag VARCHAR(256),
HasGCLID INTEGER,
RefererHash INTEGER,
URLHash INTEGER,
CLID INTEGER
) ORDER BY CounterID, EventDate, UserID, EventTime;
\set input_file '''/opt/dumps/hits_1000m_corrected.tsv'''
COPY hits_1000m FROM :input_file DELIMITER E'\t' DIRECT;
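Assuming the statements above are saved as hits_define_schema.sql (the file name used in the instructions earlier), the whole schema plus the three loads can be applied in one shot:

/opt/vertica/bin/vsql -U dbadmin -f hits_define_schema.sql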

View File

@ -1,43 +0,0 @@
SELECT count(*) FROM {table};
SELECT count(*) FROM {table} WHERE AdvEngineID != 0;
SELECT sum(AdvEngineID), count(*), avg(ResolutionWidth) FROM {table};
SELECT sum_float(UserID) FROM {table};
SELECT COUNT(DISTINCT UserID) FROM {table};
SELECT COUNT(DISTINCT SearchPhrase) FROM {table};
SELECT min(EventDate), max(EventDate) FROM {table};
SELECT AdvEngineID, count(*) FROM {table} WHERE AdvEngineID != 0 GROUP BY AdvEngineID ORDER BY count(*) DESC;
SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM {table} GROUP BY RegionID ORDER BY u DESC LIMIT 10;
SELECT RegionID, sum(AdvEngineID), count(*) AS c, avg(ResolutionWidth), COUNT(DISTINCT UserID) FROM {table} GROUP BY RegionID ORDER BY count(*) DESC LIMIT 10;
SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM {table} WHERE MobilePhoneModel != '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
SELECT SearchPhrase, count(*) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM {table} WHERE SearchPhrase != '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
SELECT SearchEngineID, SearchPhrase, count(*) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, count(*) FROM {table} GROUP BY UserID ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM {table} GROUP BY UserID, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, count(*) FROM {table} GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, Minute(EventTime) AS m, SearchPhrase, count(*) FROM {table} GROUP BY UserID, m, SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT UserID FROM {table} WHERE UserID = 12345678901234567890;
SELECT count(*) FROM {table} WHERE URL LIKE '%metrika%';
SELECT SearchPhrase, MAX(URL), count(*) FROM {table} WHERE URL LIKE '%metrika%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT SearchPhrase, MAX(URL), MAX(Title), count(*) AS c, COUNT(DISTINCT UserID) FROM {table} WHERE Title LIKE '%Яндекс%' AND URL NOT LIKE '%.yandex.%' AND SearchPhrase != '' GROUP BY SearchPhrase ORDER BY count(*) DESC LIMIT 10;
SELECT * FROM {table} WHERE URL LIKE '%metrika%' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY SearchPhrase LIMIT 10;
SELECT SearchPhrase FROM {table} WHERE SearchPhrase != '' ORDER BY EventTime, SearchPhrase LIMIT 10;
SELECT CounterID, avg(OCTET_LENGTH(URL)) AS l, count(*) FROM {table} WHERE URL != '' GROUP BY CounterID HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT SUBSTRB(SUBSTRB(Referer, POSITIONB(Referer, '//') + 2), 1, GREATEST(0, POSITIONB(SUBSTRB(Referer, POSITIONB(Referer, '//') + 2), '/') - 1)) AS key, avg(OCTET_LENGTH(Referer)) AS l, count(*) AS c, MAX(Referer) FROM {table} WHERE Referer != '' GROUP BY key HAVING count(*) > 100000 ORDER BY l DESC LIMIT 25;
SELECT sum(ResolutionWidth), sum(ResolutionWidth + 1), sum(ResolutionWidth + 2), sum(ResolutionWidth + 3), sum(ResolutionWidth + 4), sum(ResolutionWidth + 5), sum(ResolutionWidth + 6), sum(ResolutionWidth + 7), sum(ResolutionWidth + 8), sum(ResolutionWidth + 9), sum(ResolutionWidth + 10), sum(ResolutionWidth + 11), sum(ResolutionWidth + 12), sum(ResolutionWidth + 13), sum(ResolutionWidth + 14), sum(ResolutionWidth + 15), sum(ResolutionWidth + 16), sum(ResolutionWidth + 17), sum(ResolutionWidth + 18), sum(ResolutionWidth + 19), sum(ResolutionWidth + 20), sum(ResolutionWidth + 21), sum(ResolutionWidth + 22), sum(ResolutionWidth + 23), sum(ResolutionWidth + 24), sum(ResolutionWidth + 25), sum(ResolutionWidth + 26), sum(ResolutionWidth + 27), sum(ResolutionWidth + 28), sum(ResolutionWidth + 29), sum(ResolutionWidth + 30), sum(ResolutionWidth + 31), sum(ResolutionWidth + 32), sum(ResolutionWidth + 33), sum(ResolutionWidth + 34), sum(ResolutionWidth + 35), sum(ResolutionWidth + 36), sum(ResolutionWidth + 37), sum(ResolutionWidth + 38), sum(ResolutionWidth + 39), sum(ResolutionWidth + 40), sum(ResolutionWidth + 41), sum(ResolutionWidth + 42), sum(ResolutionWidth + 43), sum(ResolutionWidth + 44), sum(ResolutionWidth + 45), sum(ResolutionWidth + 46), sum(ResolutionWidth + 47), sum(ResolutionWidth + 48), sum(ResolutionWidth + 49), sum(ResolutionWidth + 50), sum(ResolutionWidth + 51), sum(ResolutionWidth + 52), sum(ResolutionWidth + 53), sum(ResolutionWidth + 54), sum(ResolutionWidth + 55), sum(ResolutionWidth + 56), sum(ResolutionWidth + 57), sum(ResolutionWidth + 58), sum(ResolutionWidth + 59), sum(ResolutionWidth + 60), sum(ResolutionWidth + 61), sum(ResolutionWidth + 62), sum(ResolutionWidth + 63), sum(ResolutionWidth + 64), sum(ResolutionWidth + 65), sum(ResolutionWidth + 66), sum(ResolutionWidth + 67), sum(ResolutionWidth + 68), sum(ResolutionWidth + 69), sum(ResolutionWidth + 70), sum(ResolutionWidth + 71), sum(ResolutionWidth + 72), sum(ResolutionWidth + 73), sum(ResolutionWidth + 74), sum(ResolutionWidth + 75), sum(ResolutionWidth + 76), sum(ResolutionWidth + 77), sum(ResolutionWidth + 78), sum(ResolutionWidth + 79), sum(ResolutionWidth + 80), sum(ResolutionWidth + 81), sum(ResolutionWidth + 82), sum(ResolutionWidth + 83), sum(ResolutionWidth + 84), sum(ResolutionWidth + 85), sum(ResolutionWidth + 86), sum(ResolutionWidth + 87), sum(ResolutionWidth + 88), sum(ResolutionWidth + 89) FROM {table};
SELECT SearchEngineID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY SearchEngineID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT WatchID, ClientIP, count(*) AS c, sum(Refresh), avg(ResolutionWidth) FROM {table} GROUP BY WatchID, ClientIP ORDER BY count(*) DESC LIMIT 10;
SELECT URL, count(*) FROM {table} GROUP BY URL ORDER BY count(*) DESC LIMIT 10;
SELECT 1, URL, count(*) FROM {table} GROUP BY 1, URL ORDER BY count(*) DESC LIMIT 10;
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, count(*) FROM {table} GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY count(*) DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND URL != '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
SELECT Title, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT DontCountHits AND NOT Refresh AND Title != '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
SELECT URL, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND IsLink AND NOT IsDownload GROUP BY URL ORDER BY PageViews DESC LIMIT 1000;
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN SearchEngineID = 0 AND AdvEngineID = 0 THEN Referer ELSE '' END AS Src, URL AS Dst, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 1000;
SELECT URLHash, EventDate, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND TraficSourceID IN (-1, 6) AND RefererHash = 6202628419148573758 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 100000;
SELECT WindowClientWidth, WindowClientHeight, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-31') AND NOT Refresh AND NOT DontCountHits AND URLHash = 6202628419148573758 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10000;
SELECT TIME_SLICE(EventTime, 1, 'MINUTE') AS Minute, count(*) AS PageViews FROM {table} WHERE CounterID = 62 AND EventDate >= DATE('2013-07-01') AND EventDate <= DATE('2013-07-02') AND NOT Refresh AND NOT DontCountHits GROUP BY Minute ORDER BY Minute;

46 build_bin.sh Executable file
View File

@ -0,0 +1,46 @@
#!/bin/bash
set -e
set -x

PROJECT="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"

# Drop typhoon-blade from PATH so it does not shadow the system toolchain.
export PATH=$(echo "$PATH" | sed -e 's/:\/opt\/tiger\/typhoon-blade//')

rm -rf output/
mkdir -p output

export CMAKE_BUILD_TYPE=${CUSTOM_CMAKE_BUILD_TYPE:-RelWithDebInfo}
CMAKE_FLAGS="-DCMAKE_INSTALL_PREFIX=../output -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DUSE_BYTEDANCE_RDKAFKA=${CUSTOM_USE_BYTEDANCE_RDKAFKA:-1} ${CMAKE_FLAGS}"
CMAKE_FLAGS="-DENABLE_BREAKPAD=ON $CMAKE_FLAGS" # enable minidump
[[ -n "$CUSTOM_SANITIZE" ]] && CMAKE_FLAGS="-DSANITIZE=$CUSTOM_SANITIZE $CMAKE_FLAGS"
[[ -n "$CUSTOM_MAX_LINKING_JOBS" ]] && CMAKE_FLAGS="-DPARALLEL_LINK_JOBS=${CUSTOM_MAX_LINKING_JOBS} ${CMAKE_FLAGS}"
[[ -n "$CUSTOM_MAX_COMPILE_JOBS" ]] && CMAKE_FLAGS="-DPARALLEL_COMPILE_JOBS=${CUSTOM_MAX_COMPILE_JOBS} ${CMAKE_FLAGS}"
export CMAKE_FLAGS

rm -rf build && mkdir build && cd build

source /etc/os-release
if [ "$NAME" == "CentOS Linux" ] && [ "$VERSION_ID" == "7" ] && hash scl 2>/dev/null; then
    echo "Found CentOS 7 and scl"
    scl enable devtoolset-9 "CC=clang CXX=clang++ cmake3 ${CMAKE_FLAGS} -DCMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/ninja ../"
    scl enable devtoolset-9 "ninja"
    scl enable devtoolset-9 "ninja install"
else
    export CC=/usr/bin/clang
    export CXX=/usr/bin/clang++
    # install as well, so that the output/ layout used below exists
    cmake ../ ${CMAKE_FLAGS} && ninja && ninja install
fi

# copy shared libraries
cp ${PROJECT}/contrib/foundationdb/lib/libfdb_c.so ../output/lib
# create the `usr/bin` directory to keep the layout the same as the old version
mkdir -p ../output/usr
mv ../output/bin ../output/usr/
# create a symlink to keep CI tests happy
cd ../output
ln -s usr/bin bin
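A typical invocation, using the CUSTOM_* environment knobs the script reads (the values here are only examples); the binaries end up in output/usr/bin, with a compatibility symlink at output/bin:

CUSTOM_CMAKE_BUILD_TYPE=Release CUSTOM_MAX_COMPILE_JOBS=16 CUSTOM_MAX_LINKING_JOBS=4 ./build_bin.sh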

Some files were not shown because too many files have changed in this diff.