mirror of https://github.com/ByConity/ByConity
Compare commits
26 Commits
742a9c438e
...
016de1746f
Author | SHA1 | Date |
---|---|---|
Doga Uzuncukoglu | 016de1746f | |
Wei Wang | 47df57f28d | |
fredwang | 9f730be7ba | |
fredwang | da5a5ae346 | |
李开希 | b3db1670af | |
石宇泽 | 2f5f064312 | |
勾王敏浩 | a15cb38d81 | |
魏祥威 | 70da7d1bb7 | |
许海涛 | 131bf36a8c | |
连文珑 | 77cf321a73 | |
贾硕 | d76d265f0f | |
高大月 | a539619a72 | |
冯开宇 | 5dcfbd8b8e | |
黄子淳 | 7fcad97ca2 | |
连薛超 | 9dd300f6f9 | |
吴健 | 8300dfb78e | |
杜峰 | f7f177b015 | |
Wuwen Wang | 35b01aafe6 | |
左闯 | b4b3e95874 | |
魏祥威 | 280d97a386 | |
李金志 | 5e0ac2bee1 | |
刘毫强 | a02cb343de | |
高远宁 | 900329c591 | |
景鹏 | 8f383e3969 | |
Yuan Zhu | 151a5cd02b | |
dogauzuncukoglu | e153d8a93d | |
@@ -1 +0,0 @@

```bash
docker-compose exec server-0 /opt/byconity/bin/clickhouse client --port 52145 --host 127.0.0.1
```
@@ -1,174 +0,0 @@

```yaml
version: "3"

services:
  # After upgrade to docker-compose v2, we could use `include` instead of `extend`.
  hdfs-namenode:
    extends:
      file: ./common/hdfs.yml
      service: hdfs-namenode
  hdfs-datanode:
    extends:
      file: ./common/hdfs.yml
      service: hdfs-datanode
  fdb:
    extends:
      file: ./common/fdb.yml
      service: fdb
  my_mysql:
    extends:
      file: ./common/mysql.yml
      service: my_mysql
  tso:
    image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1
    command: bash -c "fdbcli -C /config/fdb.cluster --exec \"configure new single ssd\"; tso-server --config-file /config/tso.yml"
    depends_on:
      - fdb
      - hdfs-namenode
    volumes:
      - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro
      - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro
      - ./nexusfs/:/config/:ro
      - ./test_output/tso/:/var/log/byconity/:rw
    environment: &env
      LD_LIBRARY_PATH: /opt/byconity/lib
      PATH: /opt/byconity/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
      ASAN_OPTIONS:
      TSAN_OPTIONS:
      IS_CI_ENV: 1
      CI_PIPELINE_NAME: CI
    cap_add:
      - SYS_PTRACE
    healthcheck:
      test: ["CMD", "curl", "localhost:18845"]
      interval: 5s

  server-0:
    image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1
    command: bash -c "(udf-manager --config-file /config/server.yml & clickhouse-server --config-file /config/server.yml)"
    depends_on:
      tso:
        condition: service_healthy
    ports:
      - "9000:52145"
      - "127.0.0.1:8123:21557"
      - "127.0.0.1:9004:9004"
    environment:
      <<: *env
      SERVER_ID: server-0
    volumes:
      - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro
      - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro
      - ./nexusfs/:/config/:ro
      - ./test_output/server-0/:/var/log/byconity/:rw
      - ./queries/:/opt/byconity/queries/:ro
    cap_add:
      - SYS_PTRACE
    healthcheck:
      test: ["CMD", "curl", "localhost:21557"]
      interval: 5s

  server-1:
    image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1
    command: bash -c "(udf-manager --config-file /config/server.yml & clickhouse-server --config-file /config/server.yml)"
    depends_on:
      tso:
        condition: service_healthy
    ports:
      - "9001:52145"
      - "127.0.0.1:8124:21557"
    environment:
      <<: *env
      SERVER_ID: server-1
    volumes:
      - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro
      - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro
      - ./nexusfs/:/config/:ro
      - ./test_output/server-1/:/var/log/byconity/:rw
      - ./queries/:/opt/byconity/queries/:ro
    cap_add:
      - SYS_PTRACE
    healthcheck:
      test: ["CMD", "curl", "localhost:52145"]
      interval: 5s

  worker-write:
    image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1
    command: bash -c "clickhouse-server --config-file /config/worker.yml"
    depends_on:
      - server-0
      - server-1
    ports:
      - "52149:52145"
    environment:
      <<: *env
      WORKER_GROUP_ID: wg_write
      VIRTUAL_WAREHOUSE_ID: vw_write
      WORKER_ID: w0
    volumes:
      - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro
      - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro
      - ./nexusfs/:/config/:ro
      - ./test_output/worker-write/:/var/log/byconity/:rw
      - ./queries/:/opt/byconity/queries/:ro
    cap_add:
      - SYS_PTRACE

  worker-default:
    image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1
    command: bash -c "(udf-manager --config-file /config/worker.yml & clickhouse-server --config-file /config/worker.yml)"
    depends_on:
      - server-0
      - server-1
    environment:
      <<: *env
      WORKER_GROUP_ID: wg_default
      VIRTUAL_WAREHOUSE_ID: vw_default
      WORKER_ID: r0
    volumes:
      - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro
      - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro
      - ./nexusfs/:/config/:ro
      - ./test_output/worker-default/:/var/log/byconity/:rw
      - ./queries/:/opt/byconity/queries/:ro
    cap_add:
      - SYS_PTRACE

  daemon-manager:
    image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1
    command: bash -c "daemon-manager --config-file ./config/daemon-manager.yml"
    depends_on:
      server-0:
        condition: service_healthy
      server-1:
        condition: service_healthy
    environment:
      <<: *env
    volumes:
      - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro
      - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro
      - ./nexusfs/:/config/:ro
      - ./test_output/daemon-manager/:/var/log/byconity/:rw
    cap_add:
      - SYS_PTRACE
    restart: always

  resource-manager:
    image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1
    command: bash -c "resource-manager --config-file /config/resource-manager.yml"
    depends_on:
      - tso
    volumes:
      - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro
      - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro
      - ./nexusfs/:/config/:ro
      - ./test_output/rm/:/var/log/byconity/:rw
    environment:
      <<: *env
    cap_add:
      - SYS_PTRACE

volumes:
  fdb-data:
    external: false
  hdfs-namenode:
    external: false
  hdfs-datanode:
    external: false
```
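The compose file above wires a full CNCH test cluster (FoundationDB, HDFS, TSO, two servers, two workers, daemon manager, resource manager). A minimal sketch of driving it, assuming `CNCH_BINARY_PATH` and `CNCH_LIBRARY_PATH` point at a local ByConity build (the paths below are illustrative, not part of this change set):

```bash
# Illustrative paths; only the variable names come from the compose file above.
export CNCH_BINARY_PATH=/path/to/byconity/build/programs
export CNCH_LIBRARY_PATH=/path/to/byconity/build/lib

# Bring the stack up; the tso healthcheck gates the servers, which in turn gate the workers.
docker-compose up -d

# Once server-0 passes its healthcheck (curl localhost:21557), connect with the
# client command documented elsewhere in this change set.
docker-compose exec server-0 /opt/byconity/bin/clickhouse client --port 52145 --host 127.0.0.1
```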
@@ -1,228 +0,0 @@

```yaml
logger:
  level: trace
  log: /var/log/byconity/out.log
  errorlog: /var/log/byconity/err.log
  testlog: /var/log/byconity/test.log
  size: 1000M
  count: 10
  console: true
additional_services:
  GIS: 1
  VectorSearch: 1
  FullTextSearch: 1
http_port: 21557
rpc_port: 30605
tcp_port: 52145
ha_tcp_port: 26247
exchange_port: 47447
exchange_status_port: 60611
interserver_http_port: 30491
mysql_port: 9004
listen_host: "0.0.0.0"
prometheus:
  endpoint: "/metrics"
  port: 0
  metrics: true
  events: true
  asynchronous_metrics: true
  part_metrics: false
cnch_type: server
max_connections: 4096
keep_alive_timeout: 3
max_concurrent_queries: 200
uncompressed_cache_size: 8589934592
mark_cache_size: 5368709120
path: /var/byconity/
tmp_path: /var/byconity/tmp_data/
users_config: /config/users.yml
default_profile: default
default_database: default
timezone: Europe/Moscow
mlock_executable: false
enable_tenant_systemdb: false
macros:
  "-incl": macros
  "-optional": true
builtin_dictionaries_reload_interval: 3600
max_session_timeout: 3600
default_session_timeout: 60
dictionaries_config: "*_dictionary.xml"
format_schema_path: /var/byconity/format_schemas/
perQuery: 1
storage_configuration:
  disks:
    hdfs_disk:
      path: /user/clickhouse/
      type: bytehdfs
    local_disk:
      path: /var/byconity/data/
      type: local
  policies:
    default:
      volumes:
        hdfs:
          default: hdfs_disk
          disk: hdfs_disk
        local:
          default: local_disk
          disk: local_disk
cnch_kafka_log:
  database: cnch_system
  table: cnch_kafka_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
cnch_unique_table_log:
  database: cnch_system
  table: cnch_unique_table_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
cnch_query_log:
  database: cnch_system
  table: cnch_query_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
query_log:
  database: system
  table: query_log
  flush_interval_milliseconds: 15000
  partition_by: event_date
part_allocation_algorithm: 1
consistent_hash_ring:
  num_replicas: 16
  num_probes: 21
  load_factor: 1.3
service_discovery:
  mode: local
  cluster: default
  disable_cache: false
  cache_timeout: 5
  server:
    psm: data.cnch.server
    node:
      - host: server-0
        hostname: server-0
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
      - host: server-1
        hostname: server-1
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
  tso:
    psm: data.cnch.tso
    node:
      host: tso-0
      hostname: tso
      ports:
        port:
          - name: PORT0
            value: 18845
          - name: PORT2
            value: 9181
  resource_manager:
    psm: data.cnch.resource_manager
    node:
      host: resource-manager-0
      hostname: resource-manager-0
      ports:
        port:
          name: PORT0
          value: 28989
  daemon_manager:
    psm: data.cnch.daemon_manager
    node:
      host: daemon-manager-0
      hostname: daemon-manager
      ports:
        port:
          name: PORT0
          value: 17553
  vw_psm: data.cnch.vw
  vw:
    psm: data.cnch.vw
    node:
      - host: worker-write-0
        hostname: worker-write
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
        vw_name: vw_write
      - host: worker-default-0
        hostname: worker-default
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
        vw_name: vw_default
catalog:
  name_space: default
catalog_service:
  type: fdb
  fdb:
    cluster_file: /config/fdb.cluster
hdfs_addr: hdfs://hdfs-namenode:9000
udf_path: /var/byconity/data/user_defined
udf_manager_server:
  timeout_ms: 20000
  max_retry: 1
udf_processor:
  count: 3
  uds_path: /dev/shm/udf_processor_server
  timeout_ms: 10000
  max_retry: 1
custom_settings_prefixes: SQL_
restrict_tenanted_users_to_whitelist_settings: false
restrict_tenanted_users_to_privileged_operations: false
sensitive_permission_tenants: 1234
```
@@ -1,202 +0,0 @@

```yaml
logger:
  level: trace
  log: /var/log/byconity/out.log
  errorlog: /var/log/byconity/err.log
  testlog: /var/log/byconity/test.log
  size: 1000M
  count: 10
http_port: 21557
rpc_port: 30605
tcp_port: 52145
ha_tcp_port: 26247
exchange_port: 47447
exchange_status_port: 60611
interserver_http_port: 30491
listen_host: "0.0.0.0"
cnch_type: worker
vw_name: vw_default
max_connections: 4096
keep_alive_timeout: 3
max_concurrent_queries: 200
uncompressed_cache_size: 8589934592
mark_cache_size: 5368709120
path: /var/byconity/
tmp_path: /var/byconity/tmp_data/
users_config: /config/users.yml
default_profile: default
default_database: default
timezone: Europe/Moscow
mlock_executable: false
enable_tenant_systemdb: false
macros:
  "-incl": macros
  "-optional": true
builtin_dictionaries_reload_interval: 3600
max_session_timeout: 3600
default_session_timeout: 60
dictionaries_config: "*_dictionary.xml"
format_schema_path: /var/byconity/format_schemas/
perQuery: 1
storage_configuration:
  disks:
    hdfs_disk:
      path: /user/clickhouse/
      type: bytehdfs
    local_disk:
      path: /var/byconity/data/
      type: local
  policies:
    default:
      volumes:
        hdfs:
          default: hdfs_disk
          disk: hdfs_disk
        local:
          default: local_disk
          disk: local_disk
hdfs_addr: "hdfs://hdfs-namenode:9000"
cnch_unique_table_log:
  database: cnch_system
  table: cnch_unique_table_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
query_log:
  database: system
  table: query_log
  flush_interval_milliseconds: 15000
  partition_by: event_date
service_discovery:
  mode: local
  cluster: default
  disable_cache: false
  cache_timeout: 5
  server:
    psm: data.cnch.server
    node:
      - host: server-0
        hostname: server-0
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
      - host: server-1
        hostname: server-1
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
  tso:
    psm: data.cnch.tso
    node:
      host: tso-0
      hostname: tso
      ports:
        port:
          - name: PORT0
            value: 18845
          - name: PORT2
            value: 9181
  resource_manager:
    psm: data.cnch.resource_manager
    node:
      host: resource-manager-0
      hostname: resource-manager-0
      ports:
        port:
          name: PORT0
          value: 28989
  daemon_manager:
    psm: data.cnch.daemon_manager
    node:
      host: daemon-manager-0
      hostname: daemon-manager
      ports:
        port:
          name: PORT0
          value: 17553
  vw_psm: data.cnch.vw
  vw:
    psm: data.cnch.vw
    node:
      - host: worker-write-0
        hostname: worker-write
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
        vw_name: vw_write
      - host: worker-default-0
        hostname: worker-default
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
        vw_name: vw_default
catalog:
  name_space: default
catalog_service:
  type: fdb
  fdb:
    cluster_file: /config/fdb.cluster
udf_path: /var/byconity/data/user_defined
udf_manager_server:
  timeout_ms: 20000
  max_retry: 1
udf_processor:
  count: 3
  uds_path: /dev/shm/udf_processor_worker
  timeout_ms: 10000
  max_retry: 1
restrict_tenanted_users_to_whitelist_settings: false
restrict_tenanted_users_to_privileged_operations: false
additional_services:
  FullTextSearch: true
sensitive_permission_tenants: 1234
```
@@ -1,252 +0,0 @@

```yaml
logger:
  level: trace
  log: /var/log/byconity/out.log
  errorlog: /var/log/byconity/err.log
  testlog: /var/log/byconity/test.log
  size: 1000M
  count: 10
additional_services:
  GIS: 1
  VectorSearch: 1
  FullTextSearch: 1
http_port: 21557
rpc_port: 30605
tcp_port: 52145
ha_tcp_port: 26247
exchange_port: 47447
exchange_status_port: 60611
interserver_http_port: 30491
mysql_port: 9004
listen_host: "0.0.0.0"
prometheus:
  endpoint: "/metrics"
  port: 0
  metrics: true
  events: true
  asynchronous_metrics: true
  part_metrics: false
cnch_type: server
max_connections: 4096
keep_alive_timeout: 3
max_concurrent_queries: 200
uncompressed_cache_size: 8589934592
mark_cache_size: 5368709120
path: /var/byconity/
tmp_path: /var/byconity/tmp_data/
users_config: /config/users.yml
default_profile: default
default_database: default
timezone: Europe/Moscow
mlock_executable: false
enable_tenant_systemdb: false
macros:
  "-incl": macros
  "-optional": true
builtin_dictionaries_reload_interval: 3600
max_session_timeout: 3600
default_session_timeout: 60
dictionaries_config: "*_dictionary.xml"
format_schema_path: /var/byconity/format_schemas/
perQuery: 1
storage_configuration:
  disks:
    hdfs_disk:
      path: /user/clickhouse/
      type: bytehdfs
    local_disk:
      path: /var/byconity/data/
      type: local
  policies:
    default:
      volumes:
        hdfs:
          default: hdfs_disk
          disk: hdfs_disk
        local:
          default: local_disk
          disk: local_disk
cnch_kafka_log:
  database: cnch_system
  table: cnch_kafka_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
cnch_unique_table_log:
  database: cnch_system
  table: cnch_unique_table_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
cnch_query_log:
  database: cnch_system
  table: cnch_query_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
query_log:
  database: system
  table: query_log
  flush_interval_milliseconds: 15000
  partition_by: event_date
part_allocation_algorithm: 1
consistent_hash_ring:
  num_replicas: 16
  num_probes: 21
  load_factor: 1.3
service_discovery:
  mode: local
  cluster: default
  disable_cache: false
  cache_timeout: 5
  server:
    psm: data.cnch.server
    node:
      - host: server-0
        hostname: server-0
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
      - host: server-1
        hostname: server-1
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
  resource_manager:
    psm: data.cnch.resource_manager
    node:
      host: resource-manager-0
      hostname: resource-manager-0
      ports:
        port:
          name: PORT0
          value: 28989
  daemon_manager:
    psm: data.cnch.daemon_manager
    node:
      host: daemon-manager-0
      hostname: daemon-manager-0
      ports:
        port:
          name: PORT0
          value: 17553
  tso:
    psm: data.cnch.tso
    node:
      host: tso-0
      hostname: tso-0
      ports:
        port:
          - name: PORT0
            value: 18845
          - name: PORT2
            value: 9181
  vw_psm: data.cnch.vw
  vw:
    psm: data.cnch.vw
    node:
      vw_name: vw_write
      host: worker-write
      hostname: worker-write
      ports:
        port:
          - name: PORT2
            value: 21557
          - name: PORT1
            value: 30605
          - name: PORT0
            value: 52145
          - name: PORT4
            value: 27651
          - name: PORT3
            value: 45443
          - name: PORT5
            value: 47447
          - name: PORT6
            value: 60611
    node:
      vw_name: vw_default
      host: worker-default-0
      hostname: worker-default-0
      ports:
        port:
          - name: PORT2
            value: 21557
          - name: PORT1
            value: 30605
          - name: PORT0
            value: 52145
          - name: PORT4
            value: 27651
          - name: PORT3
            value: 45443
          - name: PORT5
            value: 47447
          - name: PORT6
            value: 60611
    node:
      vw_name: vw_default
      host: worker-default-1
      hostname: worker-default-1
      ports:
        port:
          - name: PORT2
            value: 21557
          - name: PORT1
            value: 30605
          - name: PORT0
            value: 52145
          - name: PORT4
            value: 27651
          - name: PORT3
            value: 45443
          - name: PORT5
            value: 47447
          - name: PORT6
            value: 60611
catalog:
  name_space: default
catalog_service:
  type: fdb
  fdb:
    cluster_file: /config/fdb.cluster
external_catalog_mgr:
  type: fdb
  fdb:
    cluster_file: /config/fdb/cluster
hdfs_addr: "hdfs://hdfs-namenode:9000"
udf_path: /var/byconity/data/user_defined
udf_manager_server:
  timeout_ms: 20000
  max_retry: 1
udf_processor:
  count: 3
  uds_path: /dev/shm/udf_processor_server
  timeout_ms: 10000
  max_retry: 1
custom_settings_prefixes: SQL_
restrict_tenanted_users_to_whitelist_settings: false
restrict_tenanted_users_to_privileged_operations: false
sensitive_permission_tenants: 1234
```
@@ -1,6 +0,0 @@

```yaml
catalog:
  name_space: default
catalog_service:
  type: fdb
  fdb:
    cluster_file: /config/fdb.cluster
```
@@ -1,115 +0,0 @@

```yaml
service_discovery:
  mode: local
  cluster: default
  disable_cache: false
  cache_timeout: 5
  server:
    psm: data.cnch.server
    node:
      - host: server-0
        hostname: server-0
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
      - host: server-1
        hostname: server-1
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
  tso:
    psm: data.cnch.tso
    node:
      host: tso
      hostname: tso
      ports:
        port:
          - name: PORT0
            value: 18845
          - name: PORT2
            value: 9181
  resource_manager:
    psm: data.cnch.resource_manager
    node:
      host: resource-manager
      hostname: resource-manager
      ports:
        port:
          name: PORT0
          value: 28989
  daemon_manager:
    psm: data.cnch.daemon_manager
    node:
      host: daemon-manager
      hostname: daemon-manager
      ports:
        port:
          name: PORT0
          value: 17553
  vw_psm: data.cnch.vw
  vw:
    psm: data.cnch.vw
    node:
      - host: worker-write
        hostname: worker-write
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
        vw_name: vw_write
      - host: worker-default
        hostname: worker-default
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
        vw_name: vw_default
```
@@ -1,18 +0,0 @@

```yaml
hdfs_addr: hdfs://hdfs-namenode:9000
storage_configuration:
  disks:
    hdfs_disk:
      path: /user/clickhouse/
      type: bytehdfs
    local_disk:
      path: /var/byconity/data/
      type: local
  policies:
    default:
      volumes:
        hdfs:
          default: hdfs_disk
          disk: hdfs_disk
        local:
          default: local_disk
          disk: local_disk
```
@@ -1,63 +0,0 @@

```yaml
logger:
  level: trace
  log: /var/log/byconity/out.log
  errorlog: /var/log/byconity/err.log
  testlog: /var/log/byconity/test.log
  size: 1000M
  count: 10
http_port: 21557
rpc_port: 30605
tcp_port: 52145
ha_tcp_port: 26247
exchange_port: 47447
exchange_status_port: 60611
interserver_http_port: 30491
listen_host: "0.0.0.0"
cnch_type: server
max_connections: 4096
keep_alive_timeout: 3
max_concurrent_queries: 200
uncompressed_cache_size: 8589934592
mark_cache_size: 5368709120
path: /var/byconity/
tmp_path: /var/byconity/tmp_data/
users_config: /config/users.yml
default_profile: default
default_database: default
timezone: Europe/Moscow
mlock_executable: false
macros:
  "-incl": macros
  "-optional": true
builtin_dictionaries_reload_interval: 3600
max_session_timeout: 3600
default_session_timeout: 60
dictionaries_config: "*_dictionary.xml"
format_schema_path: /var/byconity/format_schemas/
perQuery: 1
daemon_manager:
  port: 17553
  daemon_jobs:
    job:
      - name: PART_GC
        interval: 10000
        disable: 0
      - name: PART_MERGE
        interval: 10000
        disable: 0
      - name: CONSUMER
        interval: 10000
        disable: 0
      - name: GLOBAL_GC
        interval: 5000
        disable: 1
      - name: PART_CLUSTERING
        interval: 30000
        disable: 0
      - name: DEDUP_WORKER
        interval: 3000
        disable: 0
      # Increasing the frequency of recycling in a test environment
      - name: TXN_GC
        interval: 3000
        disable: 0
```
@@ -1 +0,0 @@

```text
docker:docker@fdb:4550
```
@@ -1,29 +0,0 @@

```yaml
logger:
  level: trace
  log: /var/log/byconity/out.log
  errorlog: /var/log/byconity/err.log
  testlog: /var/log/byconity/test.log
  size: 1000M
  count: 10
listen_host: "0.0.0.0"
path: /var/byconity/
timezone: Europe/Moscow
perQuery: 1
resource_manager:
  port: 28989
  vws:
    vw:
      - name: vw_default
        type: default
        num_workers: 1
        worker_groups:
          worker_group:
            name: wg_default
            type: Physical
      - name: vw_write
        type: write
        num_workers: 1
        worker_groups:
          worker_group:
            name: wg_write
            type: Physical
```
@@ -1,105 +0,0 @@

```yaml
logger:
  level: trace
  log: /var/log/byconity/out.log
  errorlog: /var/log/byconity/err.log
  testlog: /var/log/byconity/test.log
  size: 1000M
  count: 10
  console: true
additional_services:
  GIS: 1
  VectorSearch: 1
  FullTextSearch: 1
http_port: 21557
rpc_port: 30605
tcp_port: 52145
ha_tcp_port: 26247
exchange_port: 47447
exchange_status_port: 60611
interserver_http_port: 30491
mysql_port: 9004
listen_host: "0.0.0.0"
prometheus:
  endpoint: "/metrics"
  port: 0
  metrics: true
  events: true
  asynchronous_metrics: true
  part_metrics: false
cnch_type: server
max_connections: 4096
keep_alive_timeout: 3
max_concurrent_queries: 200
uncompressed_cache_size: 8589934592
mark_cache_size: 5368709120
path: /var/byconity/
tmp_path: /var/byconity/tmp_data/
users_config: /config/users.yml
default_profile: default
default_database: default
timezone: Europe/Moscow
mlock_executable: false
enable_tenant_systemdb: false
macros:
  "-incl": macros
  "-optional": true
builtin_dictionaries_reload_interval: 3600
max_session_timeout: 3600
default_session_timeout: 60
dictionaries_config: "*_dictionary.xml"
format_schema_path: /var/byconity/format_schemas/
perQuery: 1
nexus_fs:
  enable: 1
  use_memory_device: 0
  enable_async_io: 0
  cache_size: 5368709120
  region_size: 4194304
  segment_size: 524288
  enable_memory_buffer: 1
  memory_buffer_size: 1073741824
  clean_regions_pool: 16
  clean_region_threads: 4
  num_in_mem_buffers: 32
  reader_threads: 32
merge_tree:
  reorganize_marks_data_layout: 1
  enable_nexus_fs: 1
cnch_kafka_log:
  database: cnch_system
  table: cnch_kafka_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
cnch_unique_table_log:
  database: cnch_system
  table: cnch_unique_table_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
cnch_query_log:
  database: cnch_system
  table: cnch_query_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
query_log:
  database: system
  table: query_log
  flush_interval_milliseconds: 15000
  partition_by: event_date
part_allocation_algorithm: 1
consistent_hash_ring:
  num_replicas: 16
  num_probes: 21
  load_factor: 1.3
udf_path: /var/byconity/data/user_defined
udf_manager_server:
  timeout_ms: 20000
  max_retry: 1
udf_processor:
  count: 3
  uds_path: /dev/shm/udf_processor_server
  timeout_ms: 10000
  max_retry: 1
custom_settings_prefixes: SQL_
restrict_tenanted_users_to_whitelist_settings: false
restrict_tenanted_users_to_privileged_operations: false
sensitive_permission_tenants: 1234
```
@@ -1,22 +0,0 @@

```yaml
logger:
  level: trace
  log: /var/log/byconity/tso.log
  errorlog: /var/log/byconity/tso.err.log
  testlog: /var/log/byconity/tso.test.log
  size: 1000M
  count: 10
  console: false
listen_host: "0.0.0.0"
path: /var/byconity/tso
tmp_path: /var/byconity/tmp
tso_service:
  type: fdb
  fdb:
    cluster_file: /config/fdb.cluster
  port: 18845
  http:
    port: 9181
    receive_timeout: 1800
    send_timeout: 1800
  tso_window_ms: 3000
  tso_get_leader_info_interval_ms: 0
```
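The TSO ports declared here line up with the compose healthcheck shown earlier (`curl localhost:18845`). A quick manual probe could look like the sketch below; the service name and the interpretation of each port are assumptions drawn from the compose file and tso.yml, not from separate documentation:

```bash
# Probe the TSO service from inside the compose network.
docker-compose exec tso curl -s localhost:18845   # port the compose healthcheck targets (tso_service.port)
docker-compose exec tso curl -s localhost:9181    # HTTP port from tso.yml (tso_service.http.port)
```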
@@ -1,38 +0,0 @@

```yaml
profiles:
  default:
    load_balancing: random
    log_queries: 1
    max_execution_time: 180
    exchange_timeout_ms: 300000
    enable_nexus_fs: 1

users:
  default:
    networks:
      ip: ::/0
    password: ""
    profile: default
    quota: default
    access_management: 1
  server:
    networks:
      ip: ::/0
    password: ""
    profile: default
    quota: default
  probe:
    networks:
      ip: ::/0
    password: ""
    profile: default
    quota: default

quotas:
  default:
    interval:
      duration: 3600
      queries: 0
      errors: 0
      result_rows: 0
      read_rows: 0
      execution_time: 0
```
@@ -1,82 +0,0 @@

```yaml
logger:
  level: trace
  log: /var/log/byconity/out.log
  errorlog: /var/log/byconity/err.log
  testlog: /var/log/byconity/test.log
  size: 1000M
  count: 10
http_port: 21557
rpc_port: 30605
tcp_port: 52145
ha_tcp_port: 26247
exchange_port: 47447
exchange_status_port: 60611
interserver_http_port: 30491
listen_host: "0.0.0.0"
cnch_type: worker
vw_name: vw_default
max_connections: 4096
keep_alive_timeout: 3
max_concurrent_queries: 200
uncompressed_cache_size: 8589934592
mark_cache_size: 5368709120
path: /var/byconity/
tmp_path: /var/byconity/tmp_data/
users_config: /config/users.yml
default_profile: default
default_database: default
timezone: Europe/Moscow
mlock_executable: false
enable_tenant_systemdb: false
macros:
  "-incl": macros
  "-optional": true
builtin_dictionaries_reload_interval: 3600
max_session_timeout: 3600
default_session_timeout: 60
dictionaries_config: "*_dictionary.xml"
format_schema_path: /var/byconity/format_schemas/
perQuery: 1
nexus_fs:
  enable: 1
  use_memory_device: 0
  enable_async_io: 0
  cache_size: 5368709120
  region_size: 4194304
  segment_size: 524288
  enable_memory_buffer: 1
  memory_buffer_size: 1073741824
  clean_regions_pool: 16
  clean_region_threads: 4
  num_in_mem_buffers: 32
  reader_threads: 32
merge_tree:
  reorganize_marks_data_layout: 1
  enable_nexus_fs: 1
cnch_unique_table_log:
  database: cnch_system
  table: cnch_unique_table_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
query_log:
  database: system
  table: query_log
  flush_interval_milliseconds: 15000
  partition_by: event_date
udf_path: /var/byconity/data/user_defined
udf_manager_server:
  timeout_ms: 20000
  max_retry: 1
udf_processor:
  count: 3
  uds_path: /dev/shm/udf_processor_worker
  timeout_ms: 10000
  max_retry: 1
restrict_tenanted_users_to_system_tables: false
restrict_tenanted_users_to_whitelist_settings: false
restrict_tenanted_users_to_privileged_operations: false
additional_services:
  FullTextSearch: true
  VectorSearch: true
  GIS: true
sensitive_permission_tenants: 1234
```
@@ -1,236 +0,0 @@

```yaml
# Auto-generated! Please do not modify this file directly. Refer to 'convert-hdfs-configs-to-s3.sh'.
logger:
  level: trace
  log: /var/log/byconity/out.log
  errorlog: /var/log/byconity/err.log
  testlog: /var/log/byconity/test.log
  size: 1000M
  count: 10
  console: true
additional_services:
  GIS: 1
  VectorSearch: 1
  FullTextSearch: 1
http_port: 21557
rpc_port: 30605
tcp_port: 52145
ha_tcp_port: 26247
exchange_port: 47447
exchange_status_port: 60611
interserver_http_port: 30491
mysql_port: 9004
listen_host: "0.0.0.0"
prometheus:
  endpoint: "/metrics"
  port: 0
  metrics: true
  events: true
  asynchronous_metrics: true
  part_metrics: false
cnch_type: server
max_connections: 4096
keep_alive_timeout: 3
max_concurrent_queries: 200
uncompressed_cache_size: 8589934592
mark_cache_size: 5368709120
path: /var/byconity/
tmp_path: /var/byconity/tmp_data/
users_config: /config/users.yml
default_profile: default
default_database: default
timezone: Europe/Moscow
mlock_executable: false
enable_tenant_systemdb: false
macros:
  "-incl": macros
  "-optional": true
builtin_dictionaries_reload_interval: 3600
max_session_timeout: 3600
default_session_timeout: 60
dictionaries_config: "*_dictionary.xml"
format_schema_path: /var/byconity/format_schemas/
perQuery: 1
storage_configuration:
  disks:
    local_disk:
      path: /var/byconity/data/
      type: local
    s3_disk:
      path: data123/
      type: s3
      endpoint: http://minio:9000
      bucket: cnch
      ak_id: minio
      ak_secret: minio123
  policies:
    default:
      volumes:
        local:
          default: local_disk
          disk: local_disk
    cnch_default_hdfs:
      volumes:
        s3:
          default: s3_disk
          disk: s3_disk
# To avoid break hard-coded test cases.
cnch_default_policy: cnch_default_hdfs
cnch_kafka_log:
  database: cnch_system
  table: cnch_kafka_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
cnch_unique_table_log:
  database: cnch_system
  table: cnch_unique_table_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
cnch_query_log:
  database: cnch_system
  table: cnch_query_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
query_log:
  database: system
  table: query_log
  flush_interval_milliseconds: 15000
  partition_by: event_date
part_allocation_algorithm: 1
consistent_hash_ring:
  num_replicas: 16
  num_probes: 21
  load_factor: 1.3
service_discovery:
  mode: local
  cluster: default
  disable_cache: false
  cache_timeout: 5
  server:
    psm: data.cnch.server
    node:
      - host: server-0
        hostname: server-0
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
      - host: server-1
        hostname: server-1
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
  tso:
    psm: data.cnch.tso
    node:
      host: tso-0
      hostname: tso
      ports:
        port:
          - name: PORT0
            value: 18845
          - name: PORT2
            value: 9181
  resource_manager:
    psm: data.cnch.resource_manager
    node:
      host: resource-manager-0
      hostname: resource-manager-0
      ports:
        port:
          name: PORT0
          value: 28989
  daemon_manager:
    psm: data.cnch.daemon_manager
    node:
      host: daemon-manager-0
      hostname: daemon-manager
      ports:
        port:
          name: PORT0
          value: 17553
  vw_psm: data.cnch.vw
  vw:
    psm: data.cnch.vw
    node:
      - host: worker-write-0
        hostname: worker-write
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
        vw_name: vw_write
      - host: worker-default-0
        hostname: worker-default
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
        vw_name: vw_default
catalog:
  name_space: default
catalog_service:
  type: fdb
  fdb:
    cluster_file: /config/fdb.cluster
udf_path: /var/byconity/data/user_defined
udf_manager_server:
  timeout_ms: 20000
  max_retry: 1
udf_processor:
  count: 3
  uds_path: /dev/shm/udf_processor_server
  timeout_ms: 10000
  max_retry: 1
custom_settings_prefixes: SQL_
restrict_tenanted_users_to_whitelist_settings: false
restrict_tenanted_users_to_privileged_operations: false
sensitive_permission_tenants: 1234
```
@@ -1,210 +0,0 @@

```yaml
# Auto-generated! Please do not modify this file directly. Refer to 'convert-hdfs-configs-to-s3.sh'.
logger:
  level: trace
  log: /var/log/byconity/out.log
  errorlog: /var/log/byconity/err.log
  testlog: /var/log/byconity/test.log
  size: 1000M
  count: 10
http_port: 21557
rpc_port: 30605
tcp_port: 52145
ha_tcp_port: 26247
exchange_port: 47447
exchange_status_port: 60611
interserver_http_port: 30491
listen_host: "0.0.0.0"
cnch_type: worker
vw_name: vw_default
max_connections: 4096
keep_alive_timeout: 3
max_concurrent_queries: 200
uncompressed_cache_size: 8589934592
mark_cache_size: 5368709120
path: /var/byconity/
tmp_path: /var/byconity/tmp_data/
users_config: /config/users.yml
default_profile: default
default_database: default
timezone: Europe/Moscow
mlock_executable: false
enable_tenant_systemdb: false
macros:
  "-incl": macros
  "-optional": true
builtin_dictionaries_reload_interval: 3600
max_session_timeout: 3600
default_session_timeout: 60
dictionaries_config: "*_dictionary.xml"
format_schema_path: /var/byconity/format_schemas/
perQuery: 1
storage_configuration:
  disks:
    local_disk:
      path: /var/byconity/data/
      type: local
    s3_disk:
      path: data123/
      type: s3
      endpoint: http://minio:9000
      bucket: cnch
      ak_id: minio
      ak_secret: minio123
  policies:
    default:
      volumes:
        local:
          default: local_disk
          disk: local_disk
    cnch_default_hdfs:
      volumes:
        s3:
          default: s3_disk
          disk: s3_disk
# To avoid break hard-coded test cases.
cnch_default_policy: cnch_default_hdfs
cnch_unique_table_log:
  database: cnch_system
  table: cnch_unique_table_log
  flush_max_row_count: 10000
  flush_interval_milliseconds: 7500
query_log:
  database: system
  table: query_log
  flush_interval_milliseconds: 15000
  partition_by: event_date
service_discovery:
  mode: local
  cluster: default
  disable_cache: false
  cache_timeout: 5
  server:
    psm: data.cnch.server
    node:
      - host: server-0
        hostname: server-0
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
      - host: server-1
        hostname: server-1
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
  tso:
    psm: data.cnch.tso
    node:
      host: tso-0
      hostname: tso
      ports:
        port:
          - name: PORT0
            value: 18845
          - name: PORT2
            value: 9181
  resource_manager:
    psm: data.cnch.resource_manager
    node:
      host: resource-manager-0
      hostname: resource-manager-0
      ports:
        port:
          name: PORT0
          value: 28989
  daemon_manager:
    psm: data.cnch.daemon_manager
    node:
      host: daemon-manager-0
      hostname: daemon-manager
      ports:
        port:
          name: PORT0
          value: 17553
  vw_psm: data.cnch.vw
  vw:
    psm: data.cnch.vw
    node:
      - host: worker-write-0
        hostname: worker-write
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
        vw_name: vw_write
      - host: worker-default-0
        hostname: worker-default
        ports:
          port:
            - name: PORT2
              value: 21557
            - name: PORT1
              value: 30605
            - name: PORT0
              value: 52145
            - name: PORT4
              value: 27651
            - name: PORT3
              value: 45443
            - name: PORT5
              value: 47447
            - name: PORT6
              value: 60611
        vw_name: vw_default
catalog:
  name_space: default
catalog_service:
  type: fdb
  fdb:
    cluster_file: /config/fdb.cluster
udf_path: /var/byconity/data/user_defined
udf_manager_server:
  timeout_ms: 20000
  max_retry: 1
udf_processor:
  count: 3
  uds_path: /dev/shm/udf_processor_worker
  timeout_ms: 10000
  max_retry: 1
restrict_tenanted_users_to_whitelist_settings: false
restrict_tenanted_users_to_privileged_operations: false
additional_services:
  FullTextSearch: true
sensitive_permission_tenants: 1234
```
@@ -4,7 +4,14 @@ profiles:

```yaml
    log_queries: 1
    max_execution_time: 180
    exchange_timeout_ms: 300000
    cnch_max_cached_storage : 50000
  point_lookup:
    max_threads: 1
    exchange_source_pipeline_threads: 1
    enable_plan_cache: true
    query_worker_fault_tolerance: false
    send_cacheable_table_definitions: true
    optimize_skip_unused_shards: true
    enable_prune_source_plan_segment: true
  readonly:
    readonly: 1
```
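The new `point_lookup` profile pins a handful of per-query settings. A sketch of exercising the same settings ad hoc from the test harness, assuming the compose stack above is running (the table name is hypothetical and only used for illustration):

```bash
# Settings mirror the point_lookup profile added in this hunk; test.kv is a hypothetical table.
docker-compose exec server-0 /opt/byconity/bin/clickhouse client --port 52145 --host 127.0.0.1 \
  --query "SELECT * FROM test.kv WHERE key = 42 SETTINGS max_threads = 1, exchange_source_pipeline_threads = 1"
```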
@@ -10,7 +10,7 @@ RUN wget -qO foundationdb-clients.deb https://github.com/apple/foundationdb/rele

```dockerfile
RUN wget -qO tini https://github.com/krallin/tini/releases/download/v0.19.0/tini

# Debian builder
FROM debian:11.6 as debian-builder
FROM debian:11.9 as debian-builder
LABEL description="Debian image for compiling"
LABEL org.opencontainers.image.source="https://github.com/ByConity/ByConity"
```
@@ -39,7 +39,7 @@ RUN ldconfig

```dockerfile
ENV CC=clang-11 CXX=clang++-11

# Base runner image
FROM debian:11.6-slim as debian-runner
FROM debian:11.9-slim as debian-runner
LABEL description="Base Debian image for runtime"
LABEL org.opencontainers.image.source="https://github.com/ByConity/ByConity"
```
@@ -4,7 +4,15 @@ profiles:

```yaml
    log_queries: 1
    max_execution_time: 180
    exchange_timeout_ms: 300000

    enable_auto_query_forwarding: true
  point_lookup:
    max_threads: 1
    exchange_source_pipeline_threads: 1
    enable_plan_cache: true
    query_worker_fault_tolerance: false
    send_cacheable_table_definitions: true
    optimize_skip_unused_shards: true
    enable_prune_source_plan_segment: true
users:
  default:
    networks:
```
@@ -35,4 +43,4 @@ quotas:

```yaml
      result_rows: 0
      read_rows: 0
      execution_time: 0
cnch_config: "/config/cnch-config.yml"
cnch_config: "/config/cnch-config.yml"
```
@@ -4,6 +4,14 @@ profiles:

```yaml
    log_queries: 1
    max_execution_time: 180
    exchange_timeout_ms: 300000
  point_lookup:
    max_threads: 1
    exchange_source_pipeline_threads: 1
    enable_plan_cache: true
    query_worker_fault_tolerance: false
    send_cacheable_table_definitions: true
    optimize_skip_unused_shards: true
    enable_prune_source_plan_segment: true

users:
  default:
```
@@ -35,4 +43,4 @@ quotas:

```yaml
      result_rows: 0
      read_rows: 0
      execution_time: 0
cnch_config: "/config/cnch-config.yml"
cnch_config: "/config/cnch-config.yml"
```
@@ -4,7 +4,14 @@ profiles:

```yaml
    log_queries: 1
    max_execution_time: 180
    exchange_timeout_ms: 300000

  point_lookup:
    max_threads: 1
    exchange_source_pipeline_threads: 1
    enable_plan_cache: true
    query_worker_fault_tolerance: false
    send_cacheable_table_definitions: true
    optimize_skip_unused_shards: true
    enable_prune_source_plan_segment: true
users:
  default:
    networks:
```
@@ -1337,144 +1337,3 @@ Result:

│ 2,"good" │
└───────────────────────────────────────────┘
```

## snowflakeToDateTime {#snowflakeToDateTime}

extract time from snowflake id as DateTime format.

**Syntax**

``` sql
snowflakeToDateTime(value [, time_zone])
```

**Parameters**

- `value` — `snowflake id`, Int64 value.
- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md).

**Returned value**

- value converted to the `DateTime` data type.

**Example**

Query:

``` sql
SELECT snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC');
```

Result:

``` text
┌─snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC')─┐
│ 2021-08-15 10:57:56                                               │
└──────────────────────────────────────────────────────────────────┘
```

## snowflakeToDateTime64 {#snowflakeToDateTime64}

extract time from snowflake id as DateTime64 format.

**Syntax**

``` sql
snowflakeToDateTime64(value [, time_zone])
```

**Parameters**

- `value` — `snowflake id`, Int64 value.
- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md).

**Returned value**

- value converted to the `DateTime64` data type.

**Example**

Query:

``` sql
SELECT snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC');
```

Result:

``` text
┌─snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC')─┐
│ 2021-08-15 10:58:19.841                                             │
└─────────────────────────────────────────────────────────────────────┘
```

## dateTimeToSnowflake {#dateTimeToSnowflake}

convert DateTime to the first snowflake id at the giving time.

**Syntax**

``` sql
dateTimeToSnowflake(value)
```

**Parameters**

- `value` — Date and time. [DateTime](../../sql-reference/data-types/datetime.md).

**Returned value**

- `value` converted to the `Int64` data type as the first snowflake id at that time.

**Example**

Query:

``` sql
SELECT dateTimeToSnowflake(CAST('2021-08-15 18:57:56', 'DateTime'));
```

Result:

``` text
┌─dateTimeToSnowflake(CAST('2021-08-15 18:57:56', 'DateTime'))─┐
│ 1426860702823350272                                           │
└───────────────────────────────────────────────────────────────┘
```

## dateTime64ToSnowflake {#dateTime64ToSnowflake}

convert DateTime64 to the first snowflake id at the giving time.

**Syntax**

``` sql
dateTime64ToSnowflake(value)
```

**Parameters**

- `value` — Date and time. [DateTime64](../../sql-reference/data-types/datetime64.md).

**Returned value**

- `value` converted to the `Int64` data type as the first snowflake id at that time.

**Example**

Query:

``` sql
SELECT dateTime64ToSnowflake(CAST('2021-08-15 18:57:56.073', 'DateTime64'));
```

Result:

``` text
┌─dateTime64ToSnowflake(CAST('2021-08-15 18:57:56.073', 'DateTime64'))─┐
│ 1426860703129534464                                                   │
└───────────────────────────────────────────────────────────────────────┘
```
@ -0,0 +1,925 @@
|
|||
---
|
||||
toc_priority: 53
|
||||
toc_title: UUID
|
||||
---
|
||||
|
||||
# Functions for Working with UUIDs
|
||||
|
||||
## generateUUIDv4
|
||||
|
||||
Generates a [version 4](https://tools.ietf.org/html/rfc4122#section-4.4) [UUID](../data-types/uuid.md).
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
generateUUIDv4([expr])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `expr` — An arbitrary [expression](../syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned UUID. Optional.
|
||||
|
||||
**Returned value**
|
||||
|
||||
A value of type UUIDv4.
|
||||
|
||||
**Example**
|
||||
|
||||
First, create a table with a column of type UUID, then insert a generated UUIDv4 into the table.
|
||||
|
||||
``` sql
|
||||
CREATE TABLE tab (uuid UUID) ENGINE = Memory;
|
||||
|
||||
INSERT INTO tab SELECT generateUUIDv4();
|
||||
|
||||
SELECT * FROM tab;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─────────────────────────────────uuid─┐
|
||||
│ f4bf890f-f9dc-4332-ad5c-0c18e73f28e9 │
|
||||
└──────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Example with multiple UUIDs generated per row**
|
||||
|
||||
```sql
|
||||
SELECT generateUUIDv4(1), generateUUIDv4(2);
|
||||
|
||||
┌─generateUUIDv4(1)────────────────────┬─generateUUIDv4(2)────────────────────┐
|
||||
│ 2d49dc6e-ddce-4cd0-afb8-790956df54c1 │ 8abf8c13-7dea-4fdf-af3e-0e18767770e6 │
|
||||
└──────────────────────────────────────┴──────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## generateUUIDv7 {#generateUUIDv7}
|
||||
|
||||
Generates a [version 7](https://datatracker.ietf.org/doc/html/draft-peabody-dispatch-new-uuid-format-04) [UUID](../data-types/uuid.md).
|
||||
|
||||
The generated UUID contains the current Unix timestamp in milliseconds (48 bits), followed by version "7" (4 bits), a counter (42 bit) to distinguish UUIDs within a millisecond (including a variant field "2", 2 bit), and a random field (32 bits).
|
||||
For any given timestamp (unix_ts_ms), the counter starts at a random value and is incremented by 1 for each new UUID until the timestamp changes.
|
||||
In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to a random new start value.
|
||||
|
||||
Function `generateUUIDv7` guarantees that the counter field within a timestamp increments monotonically across all function invocations in concurrently running threads and queries.
|
||||
|
||||
```
|
||||
0 1 2 3
|
||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
||||
├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
|
||||
| unix_ts_ms |
|
||||
├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
|
||||
| unix_ts_ms | ver | counter_high_bits |
|
||||
├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
|
||||
|var| counter_low_bits |
|
||||
├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
|
||||
| rand_b |
|
||||
└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘
|
||||
```
|
||||
|
||||
:::note
|
||||
As of April 2024, version 7 UUIDs are in draft status and their layout may change in future.
|
||||
:::
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
generateUUIDv7([expr])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `expr` — An arbitrary [expression](../syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned UUID. Optional.
|
||||
|
||||
**Returned value**
|
||||
|
||||
A value of type UUIDv7.
|
||||
|
||||
**Example**
|
||||
|
||||
First, create a table with a column of type UUID, then insert a generated UUIDv7 into the table.
|
||||
|
||||
``` sql
|
||||
CREATE TABLE tab (uuid UUID) ENGINE = Memory;
|
||||
|
||||
INSERT INTO tab SELECT generateUUIDv7();
|
||||
|
||||
SELECT * FROM tab;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─────────────────────────────────uuid─┐
|
||||
│ 018f05af-f4a8-778f-beee-1bedbc95c93b │
|
||||
└──────────────────────────────────────┘
|
||||
```
|
||||
|
||||
**Example with multiple UUIDs generated per row**
|
||||
|
||||
```sql
|
||||
SELECT generateUUIDv7(1), generateUUIDv7(2);
|
||||
|
||||
┌─generateUUIDv7(1)────────────────────┬─generateUUIDv7(2)────────────────────┐
|
||||
│ 018f05c9-4ab8-7b86-b64e-c9f03fbd45d1 │ 018f05c9-4ab8-7b86-b64e-c9f12efb7e16 │
|
||||
└──────────────────────────────────────┴──────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## empty
|
||||
|
||||
Checks whether the input UUID is empty.
|
||||
|
||||
**Syntax**
|
||||
|
||||
```sql
|
||||
empty(UUID)
|
||||
```
|
||||
|
||||
The UUID is considered empty if it contains all zeros (zero UUID).
|
||||
|
||||
The function also works for [Arrays](array-functions.md#function-empty) and [Strings](string-functions.md#empty).
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `x` — A UUID. [UUID](../data-types/uuid.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Returns `1` for an empty UUID or `0` for a non-empty UUID. [UInt8](../data-types/int-uint.md).
|
||||
|
||||
**Example**
|
||||
|
||||
To generate the UUID value, ClickHouse provides the [generateUUIDv4](#generateuuidv4) function.
|
||||
|
||||
Query:
|
||||
|
||||
```sql
|
||||
SELECT empty(generateUUIDv4());
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─empty(generateUUIDv4())─┐
|
||||
│ 0 │
|
||||
└─────────────────────────┘
|
||||
```
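Conversely, the zero UUID is reported as empty. The sketch below produces one with `toUUIDOrZero` (described further down this page) and should return `1`:

```sql
SELECT empty(toUUIDOrZero('not-a-uuid'));
```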
|
||||
|
||||
## notEmpty
|
||||
|
||||
Checks whether the input UUID is non-empty.
|
||||
|
||||
**Syntax**
|
||||
|
||||
```sql
|
||||
notEmpty(UUID)
|
||||
```
|
||||
|
||||
The UUID is considered empty if it contains all zeros (zero UUID).
|
||||
|
||||
The function also works for [Arrays](array-functions.md#function-notempty) or [Strings](string-functions.md#notempty).
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `x` — A UUID. [UUID](../data-types/uuid.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Returns `1` for a non-empty UUID or `0` for an empty UUID. [UInt8](../data-types/int-uint.md).
|
||||
|
||||
**Example**
|
||||
|
||||
To generate the UUID value, ClickHouse provides the [generateUUIDv4](#generateuuidv4) function.
|
||||
|
||||
Query:
|
||||
|
||||
```sql
|
||||
SELECT notEmpty(generateUUIDv4());
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─notEmpty(generateUUIDv4())─┐
|
||||
│ 1 │
|
||||
└────────────────────────────┘
|
||||
```
|
||||
|
||||
## toUUID
|
||||
|
||||
Converts a value of type String to a UUID.
|
||||
|
||||
``` sql
|
||||
toUUID(string)
|
||||
```
|
||||
|
||||
**Returned value**
|
||||
|
||||
The UUID type value.
|
||||
|
||||
**Usage example**
|
||||
|
||||
``` sql
|
||||
SELECT toUUID('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─────────────────────────────────uuid─┐
|
||||
│ 61f0c404-5cb3-11e7-907b-a6006ad3dba0 │
|
||||
└──────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## toUUIDOrDefault

Converts a string to a UUID. If the string cannot be parsed as a UUID, returns the provided default value instead.

**Syntax**

``` sql
toUUIDOrDefault(string, default)
```

**Arguments**

- `string` — String of 36 characters or FixedString(36). [String](../syntax.md#string).
- `default` — UUID to be used as the default if the first argument cannot be converted to a UUID type. [UUID](../data-types/uuid.md).

**Returned value**

The UUID type value.
|
||||
|
||||
**Usage examples**
|
||||
|
||||
This first example returns the first argument converted to a UUID type as it can be converted:
|
||||
|
||||
``` sql
|
||||
SELECT toUUIDOrDefault('61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID));
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─toUUIDOrDefault('61f0c404-5cb3-11e7-907b-a6006ad3dba0', CAST('59f0c404-5cb3-11e7-907b-a6006ad3dba0', 'UUID'))─┐
|
||||
│ 61f0c404-5cb3-11e7-907b-a6006ad3dba0 │
|
||||
└───────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
This second example returns the second argument (the provided default UUID) as the first argument cannot be converted to a UUID type:
|
||||
|
||||
```sql
|
||||
SELECT toUUIDOrDefault('-----61f0c404-5cb3-11e7-907b-a6006ad3dba0', cast('59f0c404-5cb3-11e7-907b-a6006ad3dba0' as UUID));
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─toUUIDOrDefault('-----61f0c404-5cb3-11e7-907b-a6006ad3dba0', CAST('59f0c404-5cb3-11e7-907b-a6006ad3dba0', 'UUID'))─┐
|
||||
│ 59f0c404-5cb3-11e7-907b-a6006ad3dba0 │
|
||||
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## toUUIDOrNull
|
||||
|
||||
Takes an argument of type String and tries to parse it as a UUID. If parsing fails, returns NULL.
|
||||
|
||||
``` sql
|
||||
toUUIDOrNull(string)
|
||||
```
|
||||
|
||||
**Returned value**
|
||||
|
||||
The Nullable(UUID) type value.
|
||||
|
||||
**Usage example**
|
||||
|
||||
``` sql
|
||||
SELECT toUUIDOrNull('61f0c404-5cb3-11e7-907b-a6006ad3dba0T') AS uuid
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─uuid─┐
|
||||
│ ᴺᵁᴸᴸ │
|
||||
└──────┘
|
||||
```
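For a well-formed string the same call should return the parsed value wrapped in `Nullable(UUID)` — a minimal sketch:

```sql
SELECT toUUIDOrNull('61f0c404-5cb3-11e7-907b-a6006ad3dba0') AS uuid;
```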
|
||||
|
||||
## toUUIDOrZero
|
||||
|
||||
Takes an argument of type String and tries to parse it as a UUID. If parsing fails, returns the zero UUID.
|
||||
|
||||
``` sql
|
||||
toUUIDOrZero(string)
|
||||
```
|
||||
|
||||
**Returned value**
|
||||
|
||||
The UUID type value.
|
||||
|
||||
**Usage example**
|
||||
|
||||
``` sql
|
||||
SELECT toUUIDOrZero('61f0c404-5cb3-11e7-907b-a6006ad3dba0T') AS uuid
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─────────────────────────────────uuid─┐
|
||||
│ 00000000-0000-0000-0000-000000000000 │
|
||||
└──────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## UUIDStringToNum
|
||||
|
||||
Accepts `string` containing 36 characters in the format `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, and returns a [FixedString(16)](../data-types/fixedstring.md) as its binary representation, with its format optionally specified by `variant` (`Big-endian` by default).
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
UUIDStringToNum(string[, variant = 1])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `string` — A [String](../syntax.md#syntax-string-literal) of 36 characters or [FixedString](../syntax.md#syntax-string-literal)
|
||||
- `variant` — Integer, representing a variant as specified by [RFC4122](https://datatracker.ietf.org/doc/html/rfc4122#section-4.1.1). 1 = `Big-endian` (default), 2 = `Microsoft`.
|
||||
|
||||
**Returned value**
|
||||
|
||||
FixedString(16)
|
||||
|
||||
**Usage examples**
|
||||
|
||||
``` sql
|
||||
SELECT
|
||||
'612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid,
|
||||
UUIDStringToNum(uuid) AS bytes
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─uuid─────────────────────────────────┬─bytes────────────┐
|
||||
│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │
|
||||
└──────────────────────────────────────┴──────────────────┘
|
||||
```
|
||||
|
||||
``` sql
|
||||
SELECT
|
||||
'612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid,
|
||||
UUIDStringToNum(uuid, 2) AS bytes
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─uuid─────────────────────────────────┬─bytes────────────┐
|
||||
│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ @</a;]~!p{jTj={) │
|
||||
└──────────────────────────────────────┴──────────────────┘
|
||||
```
|
||||
|
||||
## UUIDNumToString
|
||||
|
||||
Accepts `binary` containing a binary representation of a UUID, with its format optionally specified by `variant` (`Big-endian` by default), and returns a string containing 36 characters in text format.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
UUIDNumToString(binary[, variant = 1])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `binary` — [FixedString(16)](../data-types/fixedstring.md) as a binary representation of a UUID.
|
||||
- `variant` — Integer, representing a variant as specified by [RFC4122](https://datatracker.ietf.org/doc/html/rfc4122#section-4.1.1). 1 = `Big-endian` (default), 2 = `Microsoft`.
|
||||
|
||||
**Returned value**
|
||||
|
||||
String.
|
||||
|
||||
**Usage example**
|
||||
|
||||
``` sql
|
||||
SELECT
|
||||
'a/<@];!~p{jTj={)' AS bytes,
|
||||
UUIDNumToString(toFixedString(bytes, 16)) AS uuid
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─bytes────────────┬─uuid─────────────────────────────────┐
|
||||
│ a/<@];!~p{jTj={) │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │
|
||||
└──────────────────┴──────────────────────────────────────┘
|
||||
```
|
||||
|
||||
``` sql
|
||||
SELECT
|
||||
'@</a;]~!p{jTj={)' AS bytes,
|
||||
UUIDNumToString(toFixedString(bytes, 16), 2) AS uuid
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─bytes────────────┬─uuid─────────────────────────────────┐
|
||||
│ @</a;]~!p{jTj={) │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │
|
||||
└──────────────────┴──────────────────────────────────────┘
|
||||
```
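Because `UUIDStringToNum` and `UUIDNumToString` are inverses of each other (for the same `variant`), a round trip should reproduce the original string — a minimal sketch:

```sql
SELECT UUIDNumToString(UUIDStringToNum('612f3c40-5d3b-217e-707b-6a546a3d7b29')) AS round_trip;
```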
|
||||
|
||||
## UUIDToNum
|
||||
|
||||
Accepts a [UUID](../data-types/uuid.md) and returns its binary representation as a [FixedString(16)](../data-types/fixedstring.md), with its format optionally specified by `variant` (`Big-endian` by default). This function replaces the combination `UUIDStringToNum(toString(uuid))`, so no intermediate conversion from UUID to string is needed to extract bytes from a UUID.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
UUIDToNum(uuid[, variant = 1])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `uuid` — [UUID](../data-types/uuid.md).
|
||||
- `variant` — Integer, representing a variant as specified by [RFC4122](https://datatracker.ietf.org/doc/html/rfc4122#section-4.1.1). 1 = `Big-endian` (default), 2 = `Microsoft`.
|
||||
|
||||
**Returned value**
|
||||
|
||||
The binary representation of the UUID.
|
||||
|
||||
**Usage examples**
|
||||
|
||||
``` sql
|
||||
SELECT
|
||||
toUUID('612f3c40-5d3b-217e-707b-6a546a3d7b29') AS uuid,
|
||||
UUIDToNum(uuid) AS bytes
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─uuid─────────────────────────────────┬─bytes────────────┐
|
||||
│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ a/<@];!~p{jTj={) │
|
||||
└──────────────────────────────────────┴──────────────────┘
|
||||
```
|
||||
|
||||
``` sql
|
||||
SELECT
|
||||
toUUID('612f3c40-5d3b-217e-707b-6a546a3d7b29') AS uuid,
|
||||
UUIDToNum(uuid, 2) AS bytes
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─uuid─────────────────────────────────┬─bytes────────────┐
|
||||
│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ @</a;]~!p{jTj={) │
|
||||
└──────────────────────────────────────┴──────────────────┘
|
||||
```
|
||||
|
||||
## UUIDv7ToDateTime
|
||||
|
||||
Returns the timestamp component of a UUID version 7.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
UUIDv7ToDateTime(uuid[, timezone])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `uuid` — [UUID](../data-types/uuid.md) of version 7.
|
||||
- `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../data-types/string.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Timestamp with milliseconds precision. If the UUID is not a valid version 7 UUID, it returns 1970-01-01 00:00:00.000. [DateTime64(3)](../data-types/datetime64.md).
|
||||
|
||||
**Usage examples**
|
||||
|
||||
``` sql
|
||||
SELECT UUIDv7ToDateTime(toUUID('018f05c9-4ab8-7b86-b64e-c9f03fbd45d1'))
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─UUIDv7ToDateTime(toUUID('018f05c9-4ab8-7b86-b64e-c9f03fbd45d1'))─┐
|
||||
│ 2024-04-22 15:30:29.048 │
|
||||
└──────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
``` sql
|
||||
SELECT UUIDv7ToDateTime(toUUID('018f05c9-4ab8-7b86-b64e-c9f03fbd45d1'), 'America/New_York')
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─UUIDv7ToDateTime(toUUID('018f05c9-4ab8-7b86-b64e-c9f03fbd45d1'), 'America/New_York')─┐
|
||||
│ 2024-04-22 08:30:29.048 │
|
||||
└──────────────────────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## serverUUID
|
||||
|
||||
Returns the random UUID generated during the first start of the ClickHouse server. The UUID is stored in file `uuid` in the ClickHouse server directory (e.g. `/var/lib/clickhouse/`) and retained between server restarts.
|
||||
|
||||
**Syntax**
|
||||
|
||||
```sql
|
||||
serverUUID()
|
||||
```
|
||||
|
||||
**Returned value**
|
||||
|
||||
- The UUID of the server. [UUID](../data-types/uuid.md).
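**Example**

The returned value is installation-specific, so only the query is shown here:

```sql
SELECT serverUUID();
```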
|
||||
|
||||
## generateSnowflakeID
|
||||
|
||||
Generates a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID).
|
||||
|
||||
The generated Snowflake ID contains the current Unix timestamp in milliseconds (41 + 1 top zero bits), followed by a machine id (10 bits), and a counter (12 bits) to distinguish IDs within a millisecond.
|
||||
For any given timestamp (unix_ts_ms), the counter starts at 0 and is incremented by 1 for each new Snowflake ID until the timestamp changes.
|
||||
In case the counter overflows, the timestamp field is incremented by 1 and the counter is reset to 0.
|
||||
|
||||
Function `generateSnowflakeID` guarantees that the counter field within a timestamp increments monotonically across all function invocations in concurrently running threads and queries.
|
||||
|
||||
:::note
|
||||
The generated Snowflake IDs are based on the UNIX epoch 1970-01-01.
|
||||
While no standard or recommendation exists for the epoch of Snowflake IDs, implementations in other systems may use a different epoch, e.g. Twitter/X (2010-11-04) or Mastodon (2015-01-01).
|
||||
:::
|
||||
|
||||
```
|
||||
0 1 2 3
|
||||
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
||||
├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
|
||||
|0| timestamp |
|
||||
├─┼ ┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
|
||||
| | machine_id | machine_seq_num |
|
||||
└─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘
|
||||
```
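Since the top bits carry the generation timestamp, decoding a freshly generated ID with `snowflakeIDToDateTime` (documented further down this page) should return a time close to `now()`. A minimal sketch — values depend on the server clock:

```sql
SELECT
    generateSnowflakeID() AS id,
    snowflakeIDToDateTime(id) AS embedded_ts,  -- timestamp from the top bits of the ID
    now() AS query_ts;
```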
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
generateSnowflakeID([expr, [machine_id]])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `expr` — An arbitrary [expression](../../sql-reference/syntax.md#syntax-expressions) used to bypass [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in a query. The value of the expression has no effect on the returned Snowflake ID. Optional.
|
||||
- `machine_id` — A machine ID, the lowest 10 bits are used. [Int64](../data-types/int-uint.md). Optional.
|
||||
|
||||
**Returned value**
|
||||
|
||||
A value of type UInt64.
|
||||
|
||||
**Example**
|
||||
|
||||
First, create a table with a column of type UInt64, then insert a generated Snowflake ID into the table.
|
||||
|
||||
``` sql
|
||||
CREATE TABLE tab (id UInt64) ENGINE = Memory;
|
||||
|
||||
INSERT INTO tab SELECT generateSnowflakeID();
|
||||
|
||||
SELECT * FROM tab;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌──────────────────id─┐
|
||||
│ 7199081390080409600 │
|
||||
└─────────────────────┘
|
||||
```
|
||||
|
||||
**Example with multiple Snowflake IDs generated per row**
|
||||
|
||||
```sql
|
||||
SELECT generateSnowflakeID(1), generateSnowflakeID(2);
|
||||
|
||||
┌─generateSnowflakeID(1)─┬─generateSnowflakeID(2)─┐
|
||||
│ 7199081609652224000 │ 7199081609652224001 │
|
||||
└────────────────────────┴────────────────────────┘
|
||||
```
|
||||
|
||||
**Example with expression and a machine ID**
|
||||
|
||||
```sql
|
||||
SELECT generateSnowflakeID('expr', 1);
|
||||
|
||||
┌─generateSnowflakeID('expr', 1)─┐
|
||||
│ 7201148511606784002 │
|
||||
└────────────────────────────────┘
|
||||
```
|
||||
|
||||
## snowflakeToDateTime
|
||||
|
||||
:::warning
|
||||
This function is deprecated and can only be used if setting [allow_deprecated_snowflake_conversion_functions](../../operations/settings/settings.md#allow_deprecated_snowflake_conversion_functions) is enabled.
|
||||
The function will be removed at some point in future.
|
||||
:::
|
||||
|
||||
Extracts the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) in [DateTime](../data-types/datetime.md) format.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
snowflakeToDateTime(value[, time_zone])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `value` — Snowflake ID. [Int64](../data-types/int-uint.md).
|
||||
- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone) for the returned value. Optional. [String](../data-types/string.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- The timestamp component of `value` as a [DateTime](../data-types/datetime.md) value.
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC');
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
|
||||
┌─snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC')─┐
|
||||
│ 2021-08-15 10:57:56 │
|
||||
└──────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## snowflakeToDateTime64
|
||||
|
||||
:::warning
|
||||
This function is deprecated and can only be used if setting [allow_deprecated_snowflake_conversion_functions](../../operations/settings/settings.md#allow_deprecated_snowflake_conversion_functions) is enabled.
|
||||
The function will be removed at some point in future.
|
||||
:::
|
||||
|
||||
Extracts the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) in [DateTime64](../data-types/datetime64.md) format.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
snowflakeToDateTime64(value[, time_zone])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `value` — Snowflake ID. [Int64](../data-types/int-uint.md).
|
||||
- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone) for the returned value. Optional. [String](../data-types/string.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- The timestamp component of `value` as a [DateTime64](../data-types/datetime64.md) with scale = 3, i.e. millisecond precision.
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
SELECT snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC');
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
|
||||
┌─snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC')─┐
|
||||
│ 2021-08-15 10:58:19.841 │
|
||||
└────────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## dateTimeToSnowflake
|
||||
|
||||
:::warning
|
||||
This function is deprecated and can only be used if setting [allow_deprecated_snowflake_conversion_functions](../../operations/settings/settings.md#allow_deprecated_snowflake_conversion_functions) is enabled.
|
||||
The function will be removed at some point in future.
|
||||
:::
|
||||
|
||||
Converts a [DateTime](../data-types/datetime.md) value to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the given time.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
dateTimeToSnowflake(value)
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `value` — Date with time. [DateTime](../data-types/datetime.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Input value converted to the [Int64](../data-types/int-uint.md) data type as the first Snowflake ID at that time.
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
WITH toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt SELECT dateTimeToSnowflake(dt);
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─dateTimeToSnowflake(dt)─┐
|
||||
│ 1426860702823350272 │
|
||||
└─────────────────────────┘
|
||||
```
|
||||
|
||||
## dateTime64ToSnowflake
|
||||
|
||||
:::warning
|
||||
This function is deprecated and can only be used if setting [allow_deprecated_snowflake_conversion_functions](../../operations/settings/settings.md#allow_deprecated_snowflake_conversion_functions) is enabled.
|
||||
The function will be removed at some point in future.
|
||||
:::
|
||||
|
||||
Converts a [DateTime64](../data-types/datetime64.md) to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the given time.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
dateTime64ToSnowflake(value)
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `value` — Date with time. [DateTime64](../data-types/datetime64.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Input value converted to the [Int64](../data-types/int-uint.md) data type as the first Snowflake ID at that time.
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
``` sql
|
||||
WITH toDateTime64('2021-08-15 18:57:56.492', 3, 'Asia/Shanghai') AS dt64 SELECT dateTime64ToSnowflake(dt64);
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─dateTime64ToSnowflake(dt64)─┐
|
||||
│ 1426860704886947840 │
|
||||
└─────────────────────────────┘
|
||||
```
|
||||
|
||||
## snowflakeIDToDateTime
|
||||
|
||||
Returns the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) as a value of type [DateTime](../data-types/datetime.md).
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
snowflakeIDToDateTime(value[, epoch[, time_zone]])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `value` — Snowflake ID. [UInt64](../data-types/int-uint.md).
|
||||
- `epoch` — Epoch of the Snowflake ID in milliseconds since 1970-01-01. Defaults to 0 (1970-01-01). For the Twitter/X epoch (2010-11-04), provide 1288834974657. Optional. [UInt*](../data-types/int-uint.md).
|
||||
- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone) for the returned value. Optional. [String](../data-types/string.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- The timestamp component of `value` as a [DateTime](../data-types/datetime.md) value.
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
```sql
|
||||
SELECT snowflakeIDToDateTime(7204436857747984384) AS res
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─────────────────res─┐
|
||||
│ 2024-06-06 10:59:58 │
|
||||
└─────────────────────┘
|
||||
```
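The `epoch` argument makes it possible to decode IDs that are not based on the 1970-01-01 epoch. As a hedged sketch, the Twitter-epoch ID produced by `dateTimeToSnowflake` elsewhere on this page should decode to the same timestamp once the Twitter/X epoch is passed explicitly (expected output: `2021-08-15 10:57:56`):

```sql
SELECT snowflakeIDToDateTime(1426860702823350272, 1288834974657, 'UTC') AS res;
```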
|
||||
|
||||
## snowflakeIDToDateTime64
|
||||
|
||||
Returns the timestamp component of a [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) as a value of type [DateTime64](../data-types/datetime64.md).
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
snowflakeIDToDateTime64(value[, epoch[, time_zone]])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `value` — Snowflake ID. [UInt64](../data-types/int-uint.md).
|
||||
- `epoch` — Epoch of the Snowflake ID in milliseconds since 1970-01-01. Defaults to 0 (1970-01-01). For the Twitter/X epoch (2010-11-04), provide 1288834974657. Optional. [UInt*](../data-types/int-uint.md).
|
||||
- `time_zone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md/#server_configuration_parameters-timezone) for the returned value. Optional. [String](../data-types/string.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- The timestamp component of `value` as a [DateTime64](../data-types/datetime64.md) with scale = 3, i.e. millisecond precision.
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
```sql
|
||||
SELECT snowflakeIDToDateTime64(7204436857747984384) AS res
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌─────────────────res─┐
|
||||
│ 2024-06-06 10:59:58 │
|
||||
└─────────────────────┘
|
||||
```
|
||||
|
||||
## dateTimeToSnowflakeID
|
||||
|
||||
Converts a [DateTime](../data-types/datetime.md) value to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the given time.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
dateTimeToSnowflakeID(value[, epoch])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `value` — Date with time. [DateTime](../data-types/datetime.md).
|
||||
- `epoch` — Epoch of the Snowflake ID in milliseconds since 1970-01-01. Defaults to 0 (1970-01-01). For the Twitter/X epoch (2010-11-04), provide 1288834974657. Optional. [UInt*](../data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Input value converted to [UInt64](../data-types/int-uint.md) as the first Snowflake ID at that time.
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
```sql
|
||||
SELECT toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt, dateTimeToSnowflakeID(dt) AS res;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌──────────────────dt─┬─────────────────res─┐
|
||||
│ 2021-08-15 18:57:56 │ 6832626392367104000 │
|
||||
└─────────────────────┴─────────────────────┘
|
||||
```
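Round-tripping through `snowflakeIDToDateTime` should recover the original timestamp — a minimal sketch, expected to return `1`:

```sql
WITH toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt
SELECT snowflakeIDToDateTime(dateTimeToSnowflakeID(dt), 0, 'Asia/Shanghai') = dt AS matches;
```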
|
||||
|
||||
## dateTime64ToSnowflakeID
|
||||
|
||||
Converts a [DateTime64](../data-types/datetime64.md) to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the given time.
|
||||
|
||||
**Syntax**
|
||||
|
||||
``` sql
|
||||
dateTime64ToSnowflakeID(value[, epoch])
|
||||
```
|
||||
|
||||
**Arguments**
|
||||
|
||||
- `value` — Date with time. [DateTime64](../data-types/datetime64.md).
|
||||
- `epoch` — Epoch of the Snowflake ID in milliseconds since 1970-01-01. Defaults to 0 (1970-01-01). For the Twitter/X epoch (2010-11-04), provide 1288834974657. Optional. [UInt*](../data-types/int-uint.md).
|
||||
|
||||
**Returned value**
|
||||
|
||||
- Input value converted to [UInt64](../data-types/int-uint.md) as the first Snowflake ID at that time.
|
||||
|
||||
**Example**
|
||||
|
||||
Query:
|
||||
|
||||
```sql
|
||||
SELECT toDateTime64('2021-08-15 18:57:56.493', 3, 'Asia/Shanghai') AS dt, dateTime64ToSnowflakeID(dt) AS res;
|
||||
```
|
||||
|
||||
Result:
|
||||
|
||||
```response
|
||||
┌──────────────────────dt─┬─────────────────res─┐
|
||||
│ 2021-08-15 18:57:56.493 │ 6832626394434895872 │
|
||||
└─────────────────────────┴─────────────────────┘
|
||||
```
|
||||
|
||||
## See also
|
||||
|
||||
- [dictGetUUID](../functions/ext-dict-functions.md#ext_dict_functions-other)
|
|
@ -61,6 +61,7 @@ option (ENABLE_CLICKHOUSE_DUMPER "Enable clickhouse-dumper" ${ENABLE_CLICKHOUSE_
|
|||
option (ENABLE_CLICKHOUSE_RESOURCE_MANAGER "Service that manage worker resources" ${ENABLE_CLICKHOUSE_ALL})
|
||||
option (ENABLE_CLICKHOUSE_META_INSPECTOR "Enable meta-inspector in CNCH" ${ENABLE_CLICKHOUSE_ALL})
|
||||
option (ENABLE_CLICKHOUSE_STORAGE_TOOLS "Enable storage-tools in CNCH" ${ENABLE_CLICKHOUSE_ALL})
|
||||
option (ENABLE_CLICKHOUSE_SCHEMA_ADVISOR "Data schema advisor" ${ENABLE_CLICKHOUSE_ALL})
|
||||
|
||||
if (NOT USE_NURAFT)
|
||||
# RECONFIGURE_MESSAGE_LEVEL should not be used here,
|
||||
|
@ -187,6 +188,12 @@ else()
|
|||
message(STATUS "ClickHouse tso-server mode: OFF")
|
||||
endif()
|
||||
|
||||
if (ENABLE_CLICKHOUSE_SCHEMA_ADVISOR)
|
||||
message(STATUS "ClickHouse schema-advisor mode: ON")
|
||||
else()
|
||||
message(STATUS "ClickHouse schema-advisor mode: OFF")
|
||||
endif()
|
||||
|
||||
if(NOT (MAKE_STATIC_LIBRARIES OR SPLIT_SHARED_LIBRARIES))
|
||||
set(CLICKHOUSE_ONE_SHARED ON)
|
||||
endif()
|
||||
|
@ -281,6 +288,10 @@ if (ENABLE_CLICKHOUSE_LIBRARY_BRIDGE)
|
|||
add_subdirectory (library-bridge)
|
||||
endif ()
|
||||
|
||||
if (ENABLE_CLICKHOUSE_SCHEMA_ADVISOR)
|
||||
add_subdirectory (schema-advisor)
|
||||
endif()
|
||||
|
||||
set (JAVA_EXTENSIONS_JVM_SEARCH_PATH "/usr/lib/jvm/default-java/lib:/usr/lib/jvm/default-java/lib/server:/usr/lib/jvm/default-java/jre/lib/amd64:/usr/lib/jvm/default-java/jre/lib/amd64/server:/usr/lib/jvm/java-8-byteopenjdk-amd64/jre/lib/amd64:/usr/lib/jvm/java-8-byteopenjdk-amd64/jre/lib/amd64/server" CACHE STRING "Runtime search path for libjvm.so")
|
||||
|
||||
if (CLICKHOUSE_ONE_SHARED)
|
||||
|
@ -298,7 +309,8 @@ if (CLICKHOUSE_ONE_SHARED)
|
|||
${CLICKHOUSE_ODBC_BRIDGE_SOURCES}
|
||||
${CLICKHOUSE_KEEPER_SOURCES}
|
||||
${CLICKHOUSE_PART_TOOLKIT_SOURCES}
|
||||
${CLICKHOUSE_META_INSPECTOR_SOURCES})
|
||||
${CLICKHOUSE_META_INSPECTOR_SOURCES}
|
||||
${CLICKHOUSE_SCHEMA_ADVISOR_SOURCES})
|
||||
|
||||
target_link_libraries(clickhouse-lib
|
||||
${CLICKHOUSE_SERVER_LINK}
|
||||
|
@ -315,7 +327,8 @@ if (CLICKHOUSE_ONE_SHARED)
|
|||
${CLICKHOUSE_KEEPER_LINK}
|
||||
${CLICKHOUSE_KEEPER_CONVERTER_LINK}
|
||||
${CLICKHOUSE_PART_TOOLKIT_LINK}
|
||||
${CLICKHOUSE_META_INSPECTOR_LINK})
|
||||
${CLICKHOUSE_META_INSPECTOR_LINK}
|
||||
${CLICKHOUSE_SCHEMA_ADVISOR_LINK})
|
||||
|
||||
target_include_directories(clickhouse-lib
|
||||
${CLICKHOUSE_SERVER_INCLUDE}
|
||||
|
@ -352,6 +365,7 @@ if (CLICKHOUSE_SPLIT_BINARY)
|
|||
clickhouse-dumper
|
||||
clickhouse-meta-inspector
|
||||
clickhouse-storage-tools
|
||||
clickhouse-schema-advisor
|
||||
)
|
||||
|
||||
if (ENABLE_CLICKHOUSE_ODBC_BRIDGE)
|
||||
|
@ -370,6 +384,10 @@ if (CLICKHOUSE_SPLIT_BINARY)
|
|||
list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-keeper-converter)
|
||||
endif ()
|
||||
|
||||
if (ENABLE_CLICKHOUSE_SCHEMA_ADVISOR)
|
||||
list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-schema-advisor)
|
||||
endif ()
|
||||
|
||||
set_target_properties(${CLICKHOUSE_ALL_TARGETS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..)
|
||||
set(RPATH "$ORIGIN/../../lib")
|
||||
if (USE_JAVA_EXTENSIONS)
|
||||
|
@ -450,6 +468,9 @@ else ()
|
|||
if (ENABLE_CLICKHOUSE_STORAGE_TOOLS)
|
||||
clickhouse_target_link_split_lib(clickhouse storage-tools)
|
||||
endif()
|
||||
if (ENABLE_CLICKHOUSE_SCHEMA_ADVISOR)
|
||||
clickhouse_target_link_split_lib(clickhouse schema-advisor)
|
||||
endif ()
|
||||
|
||||
set (CLICKHOUSE_BUNDLE)
|
||||
if (ENABLE_CLICKHOUSE_SERVER)
|
||||
|
@ -566,6 +587,11 @@ else ()
|
|||
install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-storage_tools" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
|
||||
list(APPEND CLICKHOUSE_BUNDLE clickhouse-storage_tools)
|
||||
endif ()
|
||||
if (ENABLE_CLICKHOUSE_SCHEMA_ADVISOR)
|
||||
add_custom_target (clickhouse-schema-advisor ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-schema-advisor DEPENDS clickhouse)
|
||||
install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-schema-advisor" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
|
||||
list(APPEND CLICKHOUSE_BUNDLE clickhouse-schema-advisor)
|
||||
endif ()
|
||||
|
||||
install (TARGETS clickhouse RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
|
||||
|
||||
|
|
|
@ -25,3 +25,4 @@
|
|||
#cmakedefine01 ENABLE_CLICKHOUSE_META_INSPECTOR
|
||||
#cmakedefine01 ENABLE_CLICKHOUSE_DUMPER
|
||||
#cmakedefine01 ENABLE_CLICKHOUSE_STORAGE_TOOLS
|
||||
#cmakedefine01 ENABLE_CLICKHOUSE_SCHEMA_ADVISOR
|
|
@ -124,6 +124,9 @@ int mainEntryClickHouseTSOServer(int argc, char ** argv);
|
|||
#if ENABLE_CLICKHOUSE_STORAGE_TOOLS
|
||||
int mainEntryStorageTools(int argc, char ** argv);
|
||||
#endif
|
||||
#if ENABLE_CLICKHOUSE_SCHEMA_ADVISOR
|
||||
int mainEntryClickHouseSchemaAdvisor(int argc, char ** argv);
|
||||
#endif
|
||||
|
||||
int mainEntryClickHouseHashBinary(int, char **)
|
||||
{
|
||||
|
@ -218,6 +221,9 @@ std::pair<const char *, MainFunc> clickhouse_applications[] =
|
|||
{"storage-tools", mainEntryStorageTools},
|
||||
{"storage_tools", mainEntryStorageTools},
|
||||
#endif
|
||||
#if ENABLE_CLICKHOUSE_SCHEMA_ADVISOR
|
||||
{"schema-advisor", mainEntryClickHouseSchemaAdvisor},
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
set(CLICKHOUSE_SCHEMA_ADVISOR_SOURCES
|
||||
SchemaAdvisor.cpp
|
||||
CodecAdvisor.cpp
|
||||
TypeAdvisor.cpp
|
||||
IndexAdvisor.cpp
|
||||
PrewhereAdvisor.cpp
|
||||
SampleColumnReader.cpp
|
||||
Statistics.cpp
|
||||
CompressedStatisticsCollectBuffer.cpp
|
||||
ColumnUsageExtractor.cpp
|
||||
MockGlobalContext.cpp
|
||||
MockEnvironment.cpp
|
||||
)
|
||||
|
||||
set(CLICKHOUSE_SCHEMA_ADVISOR_LINK
|
||||
PRIVATE
|
||||
boost::program_options
|
||||
clickhouse_functions
|
||||
clickhouse_aggregate_functions
|
||||
clickhouse_parsers
|
||||
dbms
|
||||
clickhouse_storages_system
|
||||
)
|
||||
|
||||
if (CLICKHOUSE_SPLIT_BINARY)
|
||||
list(APPEND CLICKHOUSE_SCHEMA_ADVISOR_LINK $<TARGET_OBJECTS:protobuf::libprotobuf>)
|
||||
endif()
|
||||
|
||||
clickhouse_program_add(schema-advisor)
|
|
@ -0,0 +1,212 @@
|
|||
#include "CodecAdvisor.h"
|
||||
#include "CompressedStatisticsCollectBuffer.h"
|
||||
|
||||
#include <boost/algorithm/string/join.hpp>
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include <Common/ThreadPool.h>
|
||||
#include <Compression/CompressedReadBufferFromFile.h>
|
||||
#include <Compression/CompressionFactory.h>
|
||||
#include <DataTypes/MapHelpers.h>
|
||||
#include <IO/copyData.h>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
#include <IO/ReadBufferFromFileDescriptor.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteBufferFromFile.h>
|
||||
#include <IO/WriteBufferFromFileDescriptor.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Parsers/ASTCreateQuery.h>
|
||||
#include <Parsers/ParserCreateQuery.h>
|
||||
#include <Parsers/parseQuery.h>
|
||||
#include <Parsers/queryToString.h>
|
||||
#include <Poco/DirectoryIterator.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
CodecAdvisor::CodecAdvisor(
|
||||
const po::variables_map & options,
|
||||
const ColumnsDescription & column_descs,
|
||||
std::string absolute_part_path_,
|
||||
size_t sample_row_number_,
|
||||
size_t max_threads_)
|
||||
: absolute_part_path(std::move(absolute_part_path_))
|
||||
, sample_row_number(sample_row_number_)
|
||||
, max_threads(max_threads_)
|
||||
{
|
||||
parseCodecCandidates(options);
|
||||
setSamplingColumnFiles(absolute_part_path, column_descs);
|
||||
}
|
||||
|
||||
void CodecAdvisor::parseCodecCandidates(const po::variables_map & options)
|
||||
{
|
||||
block_size = options["block-size"].as<unsigned>();
|
||||
|
||||
bool use_lz4hc = options.count("hc");
|
||||
bool use_zstd = options.count("zstd");
|
||||
std::vector<std::string> combi_codec;
|
||||
if (options.count("codec"))
|
||||
combi_codec = options["codec"].as<std::vector<std::string>>();
|
||||
|
||||
if (!use_lz4hc && !use_zstd && combi_codec.empty())
|
||||
throw Exception(
|
||||
"Missing options, either --hc or --zstd or --codec options is required", ErrorCodes::BAD_ARGUMENTS);
|
||||
if ((use_lz4hc || use_zstd) && !combi_codec.empty())
|
||||
throw Exception(
|
||||
"Wrong options, codec flags like --zstd and --codec options are mutually exclusive", ErrorCodes::BAD_ARGUMENTS);
|
||||
if (!combi_codec.empty() && options.count("level"))
|
||||
throw Exception("Wrong options, --level is not compatible with --codec list", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
std::string method_family;
|
||||
if (use_lz4hc)
|
||||
method_family = "LZ4HC";
|
||||
else if (use_zstd)
|
||||
method_family = "ZSTD";
|
||||
|
||||
std::optional<int> level = std::nullopt;
|
||||
if (options.count("level"))
|
||||
level = options["level"].as<int>();
|
||||
|
||||
CompressionCodecPtr codec;
|
||||
if (!combi_codec.empty())
|
||||
{
|
||||
ParserCodec codec_parser;
|
||||
std::string combi_codec_line = boost::algorithm::join(combi_codec, ",");
|
||||
auto ast = parseQuery(codec_parser, "(" + combi_codec_line + ")", 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
|
||||
codec = CompressionCodecFactory::instance().get(ast, nullptr);
|
||||
}
|
||||
else
|
||||
codec = CompressionCodecFactory::instance().get(method_family, level);
|
||||
|
||||
codecs_to_compare.push_back(codec);
|
||||
}
|
||||
|
||||
/// Select column files to sample and estimate profit
|
||||
void CodecAdvisor::setSamplingColumnFiles(const std::string & part_path, const ColumnsDescription & column_descs)
|
||||
{
|
||||
Poco::DirectoryIterator end;
|
||||
for (Poco::DirectoryIterator it(part_path); it != end; ++it)
|
||||
{
|
||||
if (it->isFile() && endsWith(it->path(), ".bin"))
|
||||
{
|
||||
std::string file_path = it->path();
|
||||
std::string file_name = it.name();
|
||||
std::string column_name;
|
||||
if (isMapImplicitKey(file_name) && !isMapBaseFile(file_name))
|
||||
column_name = parseMapNameFromImplicitFileName(file_name);
|
||||
else if (endsWith(it->path(), ".null.bin"))
|
||||
column_name = unescapeForFileName(file_name.substr(0, file_name.size() - 9));
|
||||
else
|
||||
column_name = unescapeForFileName(file_name.substr(0, file_name.size() - 4));
|
||||
|
||||
if (column_descs.has(column_name))
|
||||
column_files_to_sample.push_back(std::make_shared<SamplingColumnFile>(file_path, column_name));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CodecAdvisor::execute()
|
||||
{
|
||||
size_t part_row_count;
|
||||
std::string part_count_path = absolute_part_path + "/count.txt";
|
||||
{
|
||||
ReadBufferFromFile in(part_count_path, METADATA_FILE_BUFFER_SIZE);
|
||||
readIntText(part_row_count, in);
|
||||
assertEOF(in);
|
||||
}
|
||||
|
||||
auto run_estimate_task = [&](const SamplingColumnFilePtr & column_file_to_sample) {
|
||||
std::string file_path = column_file_to_sample->file_path;
|
||||
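        /// Scale the on-disk file size down to the sampled fraction of the part's rows.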
column_file_to_sample->origin_file_size = std::filesystem::file_size(file_path) * sample_row_number / part_row_count;
|
||||
|
||||
CompressedReadBufferFromFile from(std::make_unique<ReadBufferFromFile>(file_path), true, 0, column_file_to_sample->origin_file_size, true);
|
||||
CompressedStatisticsCollectBuffer to(codecs_to_compare[0], block_size); /// TODO(weiping.qw): support comparing multiple codecs after FSST is imported.
|
||||
copyData(from, to);
|
||||
|
||||
column_file_to_sample->optimized_file_size = to.getCompressedBytes();
|
||||
};
|
||||
|
||||
ExceptionHandler exception_handler;
|
||||
/// make the queue size large enough to hold all tasks.
|
||||
ThreadPool pool(max_threads, max_threads, 100000);
|
||||
|
||||
for (const auto & file : column_files_to_sample)
|
||||
{
|
||||
pool.trySchedule(
|
||||
createExceptionHandledJob(
|
||||
[&, column_file_to_sample = file]() { run_estimate_task(column_file_to_sample); }
|
||||
, exception_handler
|
||||
)
|
||||
);
|
||||
}
|
||||
pool.wait();
|
||||
/// throw if exception during collecting compression info.
|
||||
exception_handler.throwIfException();
|
||||
}
|
||||
|
||||
void CodecAdvisor::serializeJson(WriteBuffer & buf, bool verbose)
|
||||
{
|
||||
size_t total_origin_file_size = 0;
|
||||
size_t total_optimized_file_size = 0;
|
||||
|
||||
std::unordered_map<std::string, size_t> column_origin_file_sizes;
|
||||
std::unordered_map<std::string, size_t> column_optimized_file_sizes;
|
||||
for (const auto & file : column_files_to_sample)
|
||||
{
|
||||
/// skip column without potential compression profit
|
||||
if (file->origin_file_size <= file->optimized_file_size)
|
||||
continue;
|
||||
|
||||
total_origin_file_size += file->origin_file_size;
|
||||
total_optimized_file_size += file->optimized_file_size;
|
||||
if (verbose)
|
||||
{
|
||||
if (column_origin_file_sizes.find(file->column_name) == column_origin_file_sizes.end())
|
||||
{
|
||||
column_origin_file_sizes.emplace(file->column_name, file->origin_file_size);
|
||||
column_optimized_file_sizes.emplace(file->column_name, file->optimized_file_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
column_origin_file_sizes[file->column_name] += file->origin_file_size;
|
||||
column_optimized_file_sizes[file->column_name] += file->optimized_file_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (verbose)
|
||||
{
|
||||
bool first = true;
|
||||
writeString("\"columns\":[", buf);
|
||||
for (const auto & entry : column_origin_file_sizes)
|
||||
{
|
||||
if (first)
|
||||
first = false;
|
||||
else
|
||||
writeString(",", buf);
|
||||
std::string column_name = entry.first;
|
||||
writeString("{\"name\":\"", buf);
|
||||
writeString(column_name, buf);
|
||||
writeString("\",", buf);
|
||||
size_t column_origin_file_size = entry.second;
|
||||
size_t column_optimized_file_size = column_optimized_file_sizes[column_name];
|
||||
double column_estimated_profit =
|
||||
(column_origin_file_size == 0 || column_origin_file_size <= column_optimized_file_size)
|
||||
? 0 : (column_origin_file_size - column_optimized_file_size) * 100.0 / column_origin_file_size;
|
||||
writeString("\"codec\":{\"", buf);
|
||||
writeString(queryToString(codecs_to_compare[0]->getCodecDesc()), buf);
|
||||
writeString("\":{\"compression ratio\":", buf);
|
||||
writeFloatText(column_estimated_profit, buf);
|
||||
writeString("}}}", buf);
|
||||
}
|
||||
writeString("],", buf);
|
||||
}
|
||||
|
||||
double estimated_profit = (total_origin_file_size - total_optimized_file_size) * 100.0 / total_origin_file_size;
|
||||
|
||||
writeString("\"codec\":{\"compression ratio\":", buf);
|
||||
writeFloatText(estimated_profit, buf);
|
||||
writeString("}", buf);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
#pragma once
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include "SchemaAdvisorHelpers.h"
|
||||
|
||||
#include <Compression/ICompressionCodec.h>
|
||||
#include <IO/WriteBuffer.h>
|
||||
#include <Storages/ColumnsDescription.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
class CodecAdvisor
|
||||
{
|
||||
private:
|
||||
SamplingColumnFiles column_files_to_sample;
|
||||
Codecs codecs_to_compare;
|
||||
const std::string absolute_part_path;
|
||||
const size_t sample_row_number;
|
||||
const size_t max_threads;
|
||||
unsigned block_size;
|
||||
|
||||
void parseCodecCandidates(const po::variables_map & options);
|
||||
void setSamplingColumnFiles(const std::string & part_path, const ColumnsDescription & column_descs);
|
||||
|
||||
public:
|
||||
CodecAdvisor(
|
||||
const po::variables_map & options,
|
||||
const ColumnsDescription & column_descs,
|
||||
std::string absolute_part_path,
|
||||
size_t sample_row_number,
|
||||
size_t max_threads);
|
||||
|
||||
virtual ~CodecAdvisor() = default;
|
||||
|
||||
void execute();
|
||||
void serializeJson(WriteBuffer & buf, bool verbose = false);
|
||||
};
|
||||
|
||||
}
|
|
@ -0,0 +1,223 @@
|
|||
#include "ColumnUsageExtractor.h"
|
||||
#include "SchemaAdvisorHelpers.h"
|
||||
#include "MockEnvironment.h"
|
||||
|
||||
#include <Advisor/ColumnUsage.h>
|
||||
#include <Advisor/WorkloadQuery.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/ThreadPool.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <Interpreters/DatabaseCatalog.h>
|
||||
#include <Parsers/formatAST.h>
|
||||
#include <Storages/KeyDescription.h>
|
||||
#include <boost/algorithm/string/replace.hpp>
|
||||
#include <bthread/mutex.h>
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace
|
||||
{
|
||||
WorkloadQueries buildWorkloadQueriesCollectException(const std::vector<std::string> & queries,
|
||||
ContextPtr from_context,
|
||||
ThreadPool & query_thread_pool,
|
||||
MessageCollector & collector)
|
||||
{
|
||||
WorkloadQueries res(queries.size());
|
||||
for (size_t i = 0; i < queries.size(); ++i)
|
||||
{
|
||||
query_thread_pool.scheduleOrThrowOnError([&, i] {
|
||||
setThreadName("BuildQuery");
|
||||
try
|
||||
{
|
||||
res[i] = WorkloadQuery::build("q" + std::to_string(i), queries[i], from_context);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
std::string msg = "failed to build query " + std::to_string(i) + "\nreason: " + getCurrentExceptionMessage(true)
|
||||
+ "\nsql: " + queries[i] + "\n";
|
||||
collector.collect(std::move(msg));
|
||||
}
|
||||
});
|
||||
}
|
||||
query_thread_pool.wait();
|
||||
res.erase(std::remove(res.begin(), res.end(), nullptr), res.end());
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
ColumnUsages ColumnUsageExtractor::extractColumnUsages(const std::vector<std::string> & queries) const
|
||||
{
|
||||
ThreadPool query_thread_pool{std::min<size_t>(max_threads, queries.size())};
|
||||
MessageCollector collector;
|
||||
WorkloadQueries workload_queries = buildWorkloadQueriesCollectException(queries, context, query_thread_pool, collector);
|
||||
if (workload_queries.empty())
|
||||
throw Exception("No valid query has been extracted", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
LOG_DEBUG(getLogger("ColumnUsageExtractor"), "Successfully planned {} / {} queries", workload_queries.size(), queries.size());
|
||||
collector.logCollectedError();
|
||||
|
||||
return buildColumnUsages(workload_queries);
|
||||
}
|
||||
|
||||
ColumnUsageExtractor::ColumnToScannedUsages ColumnUsageExtractor::extractUsageForLowCardinality(const ColumnUsages & column_usages) const
|
||||
{
|
||||
ColumnToScannedUsages res;
|
||||
for (const auto & [column, info] : column_usages)
|
||||
{
|
||||
if (MockEnvironment::isPrimaryKey(column, context))
|
||||
{
|
||||
LOG_DEBUG(getLogger("ColumnUsageExtractor"), "Column {} skipped because it is a primary key", column.getFullName());
|
||||
continue;
|
||||
}
|
||||
|
||||
auto scanned = info.getUsages(ColumnUsageType::SCANNED, /*only_source_table=*/false);
|
||||
if (scanned.empty())
|
||||
continue;
|
||||
|
||||
res.emplace(column, scanned.size());
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
ColumnUsageExtractor::ColumnToEqualityAndInUsages ColumnUsageExtractor::extractUsageForSkipIndex(const ColumnUsages & column_usages) const
|
||||
{
|
||||
/// if only interested in a specific table, do it here
|
||||
// std::erase_if(column_usages, [&](const auto & pair) { return pair.first.database != database || pair.first.table != table;});
|
||||
|
||||
ColumnToEqualityAndInUsages res;
|
||||
for (const auto & [column, info] : column_usages)
|
||||
{
|
||||
if (MockEnvironment::isPrimaryKey(column, context))
|
||||
{
|
||||
LOG_DEBUG(getLogger("ColumnUsageExtractor"), "Column {} skipped because it is a primary key", column.getFullName());
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t arraysetfunc_count = info.getFrequency(ColumnUsageType::ARRAY_SET_FUNCTION, /*only_source_table=*/false);
|
||||
size_t others_count = info.getFrequency(ColumnUsageType::OTHER_PREDICATE, /*only_source_table=*/false);
|
||||
|
||||
if (arraysetfunc_count)
|
||||
{
|
||||
auto arraysetfuncs = info.getUsages(ColumnUsageType::ARRAY_SET_FUNCTION, /*only_source_table=*/false);
|
||||
size_t total_count = arraysetfuncs.size() + others_count;
|
||||
/// TODO: Optimize the ColumnToEqualityAndInUsages struct?
|
||||
res.emplace(column, std::make_tuple(std::move(arraysetfuncs), std::vector<ColumnUsage>{}, total_count));
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
auto equalities = info.getUsages(ColumnUsageType::EQUALITY_PREDICATE, /*only_source_table=*/false);
|
||||
auto ins = info.getUsages(ColumnUsageType::IN_PREDICATE, /*only_source_table=*/false);
|
||||
size_t ranges_count = info.getFrequency(ColumnUsageType::RANGE_PREDICATE, /*only_source_table=*/false);
|
||||
|
||||
size_t total_count = equalities.size() + ins.size() + ranges_count + others_count;
|
||||
if (total_count == 0)
|
||||
{
|
||||
LOG_DEBUG(
|
||||
getLogger("ColumnUsageExtractor"),
|
||||
"Column {} skipped, total count: {}",
|
||||
column.getFullName(),
|
||||
total_count);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Keep the set size threshold limit on
|
||||
// Remove in lists whose in set size is larger than IN_LIST_SIZE_UPPER_BOUND
|
||||
std::erase_if(ins, [](const ColumnUsage & usage) {
|
||||
if (auto func = dynamic_pointer_cast<const ASTFunction>(usage.expression); func && func->name == "in")
|
||||
if (auto expr_list = dynamic_pointer_cast<const ASTExpressionList>(func->arguments); expr_list && expr_list->children.size() == 2)
|
||||
if (auto tuple = dynamic_pointer_cast<const ASTFunction>(expr_list->children[1]); tuple && tuple->name == "tuple")
|
||||
if (auto tuple_expr = dynamic_pointer_cast<const ASTExpressionList>(tuple->arguments))
|
||||
return tuple_expr->children.size() > IN_LIST_SIZE_UPPER_BOUND;
|
||||
return true;
|
||||
});
|
||||
|
||||
size_t eq_in_count = equalities.size() + ins.size();
|
||||
if (eq_in_count == 0)
|
||||
{
|
||||
LOG_DEBUG(
|
||||
getLogger("ColumnUsageExtractor"),
|
||||
"Column {} skipped, eq & in count: {}, total count: {}",
|
||||
column.getFullName(),
|
||||
eq_in_count,
|
||||
total_count);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Temporarily loosen the restriction on the in+equality predicate proportion
|
||||
if (eq_in_count * 1.0 < total_count * EQUALITY_AND_IN_PREDICATE_THRESHOLD)
|
||||
{
|
||||
LOG_DEBUG(
|
||||
getLogger("ColumnUsageExtractor"),
|
||||
"Column {} maybe skipped, eq & in count: {}, total count: {}",
|
||||
column.getFullName(),
|
||||
eq_in_count,
|
||||
total_count);
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG_DEBUG(
|
||||
getLogger("ColumnUsageExtractor"),
|
||||
"Column {} added, eq & in count: {}, total count: {}",
|
||||
column.getFullName(),
|
||||
eq_in_count,
|
||||
total_count);
|
||||
|
||||
res.emplace(column, std::make_tuple(std::move(equalities), std::move(ins), total_count));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
ColumnUsageExtractor::ColumnToPrewherePredicateUsages ColumnUsageExtractor::extractUsageForPrewhere(const ColumnUsages & column_usages) const
|
||||
{
|
||||
/// if only interested in a specific table, do it here
|
||||
// std::erase_if(column_usages, [&](const auto & pair) { return pair.first.database != database || pair.first.table != table;});
|
||||
|
||||
ColumnToPrewherePredicateUsages res;
|
||||
for (const auto & [column, info] : column_usages)
|
||||
{
|
||||
if (MockEnvironment::isPrimaryKey(column, context))
|
||||
{
|
||||
LOG_DEBUG(getLogger("ColumnUsageExtractor"), "Column {} skipped because it is a primary key", column.getFullName());
|
||||
continue;
|
||||
}
|
||||
|
||||
auto equalities = info.getUsages(ColumnUsageType::EQUALITY_PREDICATE, /*only_source_table=*/false);
|
||||
auto ins = info.getUsages(ColumnUsageType::IN_PREDICATE, /*only_source_table=*/false);
|
||||
auto ranges = info.getUsages(ColumnUsageType::RANGE_PREDICATE, /*only_source_table=*/false);
|
||||
auto others = info.getUsages(ColumnUsageType::OTHER_PREDICATE, /*only_source_table=*/false);
|
||||
|
||||
size_t total_count = equalities.size() + ins.size() + ranges.size() + others.size();
|
||||
|
||||
if (total_count == 0)
|
||||
continue;
|
||||
|
||||
// Keep the set size threshold limit on
|
||||
// Remove in lists whose in set size is larger than IN_LIST_SIZE_UPPER_BOUND
|
||||
std::erase_if(ins, [](const ColumnUsage & usage) {
|
||||
if (auto func = dynamic_pointer_cast<const ASTFunction>(usage.expression); func && func->name == "in")
|
||||
if (auto expr_list = dynamic_pointer_cast<const ASTExpressionList>(func->arguments); expr_list && expr_list->children.size() == 2)
|
||||
if (auto tuple = dynamic_pointer_cast<const ASTFunction>(expr_list->children[1]); tuple && tuple->name == "tuple")
|
||||
if (auto tuple_expr = dynamic_pointer_cast<const ASTExpressionList>(tuple->arguments))
|
||||
return tuple_expr->children.size() > IN_LIST_SIZE_UPPER_BOUND;
|
||||
return true;
|
||||
});
|
||||
|
||||
res.emplace(column, std::make_tuple(std::move(equalities), std::move(ins), std::move(ranges), std::move(others), total_count));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
} // DB
|
|
@ -0,0 +1,45 @@
|
|||
#pragma once
|
||||
|
||||
#include <Analyzers/QualifiedColumnName.h>
|
||||
#include <Advisor/ColumnUsage.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class ColumnUsageExtractor
|
||||
{
|
||||
public:
|
||||
// EqualityAndInUsages:
|
||||
// for skip index: equality_usages, in_usages, total_predicates
|
||||
// for bitmap index: arraysetfunc_usages, {}, total_predicates
|
||||
using EqualityAndInUsages = std::tuple<std::vector<ColumnUsage>, std::vector<ColumnUsage>, size_t>;
|
||||
using ColumnToEqualityAndInUsages = std::unordered_map<QualifiedColumnName, EqualityAndInUsages, QualifiedColumnNameHash>;
|
||||
using ColumnToScannedUsages = std::unordered_map<QualifiedColumnName, size_t, QualifiedColumnNameHash>;
|
||||
|
||||
// for prewhere: equality_usages, in_usages, range_usages, other_usages, total_predicates
|
||||
using PrewherePredicateUsages = std::tuple<std::vector<ColumnUsage>, std::vector<ColumnUsage>, std::vector<ColumnUsage>, std::vector<ColumnUsage>, size_t>;
|
||||
using ColumnToPrewherePredicateUsages = std::unordered_map<QualifiedColumnName, PrewherePredicateUsages, QualifiedColumnNameHash>;
|
||||
|
||||
|
||||
explicit ColumnUsageExtractor(ContextMutablePtr _context, size_t _max_threads): context(_context), max_threads(_max_threads) {}
|
||||
|
||||
ColumnUsages extractColumnUsages(const std::vector<std::string> & queries) const;
|
||||
ColumnToEqualityAndInUsages extractUsageForSkipIndex(const ColumnUsages & column_usages) const;
|
||||
ColumnToPrewherePredicateUsages extractUsageForPrewhere(const ColumnUsages & column_usages) const;
|
||||
|
||||
ColumnToScannedUsages extractUsageForLowCardinality(const ColumnUsages & column_usages) const;
|
||||
|
||||
private:
|
||||
ContextMutablePtr context;
|
||||
size_t max_threads;
|
||||
// which "in" filters are considered interesting
|
||||
static constexpr size_t IN_LIST_SIZE_UPPER_BOUND = 10;
|
||||
static constexpr float EQUALITY_AND_IN_PREDICATE_THRESHOLD = 0.5;
|
||||
};
|
||||
|
||||
} // DB
|
|
@ -0,0 +1,54 @@
|
|||
#include <city.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <common/unaligned.h>
|
||||
#include <common/types.h>
|
||||
|
||||
#include "CompressedStatisticsCollectBuffer.h"
|
||||
#include <Compression/CompressionFactory.h>
|
||||
|
||||
#include <Common/MemorySanitizer.h>
|
||||
#include <Common/MemoryTracker.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
}
|
||||
|
||||
static constexpr auto CHECKSUM_SIZE{sizeof(CityHash_v1_0_2::uint128)};
|
||||
|
||||
void CompressedStatisticsCollectBuffer::nextImpl()
|
||||
{
|
||||
if (!offset())
|
||||
return;
|
||||
|
||||
size_t decompressed_size = offset();
|
||||
UInt32 compressed_reserve_size = codec->getCompressedReserveSize(decompressed_size);
|
||||
compressed_buffer.resize(compressed_reserve_size);
|
||||
UInt32 compressed_size = codec->compress(working_buffer.begin(), decompressed_size, compressed_buffer.data());
|
||||
|
||||
// FIXME remove this after fixing msan report in lz4.
|
||||
// Almost always reproduces on stateless tests, the exact test unknown.
|
||||
__msan_unpoison(compressed_buffer.data(), compressed_size);
|
||||
|
||||
total_compressed_size += CHECKSUM_SIZE + compressed_size;
|
||||
}
|
||||
|
||||
|
||||
CompressedStatisticsCollectBuffer::CompressedStatisticsCollectBuffer(
|
||||
CompressionCodecPtr codec_,
|
||||
size_t buf_size)
|
||||
: BufferWithOwnMemory<WriteBuffer>(buf_size), codec(std::move(codec_))
|
||||
{
|
||||
}
|
||||
|
||||
CompressedStatisticsCollectBuffer::~CompressedStatisticsCollectBuffer()
|
||||
{
|
||||
/// FIXME move final flush into the caller
|
||||
next();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
#pragma once
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include <Common/PODArray.h>
|
||||
|
||||
#include <IO/WriteBuffer.h>
|
||||
#include <IO/BufferWithOwnMemory.h>
|
||||
#include <Compression/ICompressionCodec.h>
|
||||
#include <Compression/CompressionFactory.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class CompressedStatisticsCollectBuffer : public BufferWithOwnMemory<WriteBuffer>
|
||||
{
|
||||
private:
|
||||
CompressionCodecPtr codec;
|
||||
PODArray<char> compressed_buffer;
|
||||
size_t total_compressed_size = 0;
|
||||
|
||||
void nextImpl() override;
|
||||
|
||||
public:
|
||||
CompressedStatisticsCollectBuffer(
|
||||
CompressionCodecPtr codec_ = CompressionCodecFactory::instance().getDefaultCodec(),
|
||||
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE);
|
||||
|
||||
/// The amount of compressed data
|
||||
size_t getCompressedBytes()
|
||||
{
|
||||
next();
|
||||
return total_compressed_size;
|
||||
}
|
||||
|
||||
/// How many uncompressed bytes were written to the buffer
|
||||
size_t getUncompressedBytes()
|
||||
{
|
||||
return count();
|
||||
}
|
||||
|
||||
/// How many bytes are in the buffer (not yet compressed)
|
||||
size_t getRemainingBytes()
|
||||
{
|
||||
nextIfAtEnd();
|
||||
return offset();
|
||||
}
|
||||
|
||||
~CompressedStatisticsCollectBuffer() override;
|
||||
};
|
||||
|
||||
}
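// A minimal usage sketch (assumption, not part of this change): estimate how well a blob of
// serialized column data compresses with the default codec. `raw` is a hypothetical byte buffer.
//
//     CompressedStatisticsCollectBuffer out(CompressionCodecFactory::instance().getDefaultCodec());
//     out.write(raw.data(), raw.size());
//     size_t compressed = out.getCompressedBytes();      // flushes and returns compressed size
//     size_t uncompressed = out.getUncompressedBytes();  // bytes written so far
//     double ratio = uncompressed ? static_cast<double>(compressed) / uncompressed : 0.0;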
|
|
@ -0,0 +1,274 @@
|
|||
#include "IndexAdvisor.h"
|
||||
#include "ColumnUsageExtractor.h"
|
||||
#include "IO/WriteIntText.h"
|
||||
#include "SampleColumnReader.h"
|
||||
#include "Statistics.h"
|
||||
#include "SchemaAdvisorHelpers.h"
|
||||
#include "PotentialColumn.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <boost/algorithm/string/classification.hpp>
|
||||
#include <boost/algorithm/string/split.hpp>
|
||||
|
||||
#include "Common/Exception.h"
|
||||
#include <Common/ThreadPool.h>
|
||||
#include <Core/NamesAndTypes.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <IO/copyData.h>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
#include <IO/ReadBufferFromFileDescriptor.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteBufferFromFile.h>
|
||||
#include <IO/WriteBufferFromFileDescriptor.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
static constexpr double ADVISOR_HIGH_CARDINALITY_NDV_THRESHOLD = 0.33;
|
||||
|
||||
IndexAdvisor::IndexAdvisor(
|
||||
MockEnvironment & env_,
|
||||
const po::variables_map & options_,
|
||||
size_t sample_row_number_,
|
||||
size_t max_threads_)
|
||||
: env(env_)
|
||||
, options(options_)
|
||||
, sample_row_number(sample_row_number_)
|
||||
, max_threads(max_threads_)
|
||||
{
|
||||
}
|
||||
|
||||
// High_Cardinality_Threshold:
|
||||
// 200,000 / 65,536 = sample_row_number / ndv => ndv ~ 1/3 * sample_row_number
|
||||
// For skip index, ndv > High_Cardinality_Threshold
|
||||
// For bitmap index, 10 < ndv <= High_Cardinality_Threshold
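// Worked example (illustrative, with hypothetical numbers): for sample_row_number = 200,000 the
// threshold is 0.33 * 200,000 = 66,000, so a scalar column with ndv = 150,000 passes the
// BLOOM_FILTER check, an array column with ndv = 5,000 is suggested as BITMAP_INDEX, and an
// array column with ndv > 66,000 is promoted to SEGMENT_BITMAP_INDEX.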
|
||||
bool checkColumnCardinality(String column_name, size_t ndv, size_t sample_row_number, PotentialIndexType & type)
|
||||
{
|
||||
bool basic_cardinality = ndv > 10;
|
||||
bool high_cardinality = ndv > ADVISOR_HIGH_CARDINALITY_NDV_THRESHOLD * sample_row_number;
|
||||
|
||||
auto get_ndv_check_msg = [&]() -> String
|
||||
{
|
||||
if (type == PotentialIndexType::BITMAP_INDEX)
|
||||
{
|
||||
if (!basic_cardinality)
|
||||
return fmt::format("Column {} skipped because the ndv ({}) is less than 10", column_name, ndv);
|
||||
if (high_cardinality)
|
||||
type = PotentialIndexType::SEGMENT_BITMAP_INDEX;
|
||||
}
|
||||
if (type == PotentialIndexType::BLOOM_FILTER)
|
||||
{
|
||||
if (!high_cardinality)
|
||||
return fmt::format("Column {} skipped because of the array ndv({}) / sample_rows({}) < threshold({})", column_name, ndv, sample_row_number, ADVISOR_HIGH_CARDINALITY_NDV_THRESHOLD);
|
||||
}
|
||||
return "";
|
||||
};
|
||||
|
||||
auto check_ndv_msg = get_ndv_check_msg();
|
||||
if (!check_ndv_msg.empty())
|
||||
{
|
||||
LOG_DEBUG(getLogger("ColumnUsageExtractor"), check_ndv_msg);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
|
||||
void IndexAdvisor::execute()
|
||||
{
|
||||
auto context = createContext(options, env);
|
||||
auto queries = loadQueries(options);
|
||||
|
||||
LOG_DEBUG(getLogger("ColumnUsageExtractor"), "++++++++++ begin to executor index advisor ++++++++++");
|
||||
|
||||
ColumnUsageExtractor extractor(context, max_threads);
|
||||
auto column_usages = extractor.extractColumnUsages(queries);
|
||||
auto skip_index_usages = extractor.extractUsageForSkipIndex(column_usages);
|
||||
|
||||
auto make_predicate_info = [&](PredicateInfos & predicate_infos, ColumnUsageType predicate_type, const std::vector<ColumnUsage> & column_usages_) {
|
||||
PredicateExpressions predicate_expressions;
|
||||
size_t total_predicate_expression = 0;
|
||||
for (const auto & equality_usage : column_usages_)
|
||||
{
|
||||
auto & count = predicate_expressions[equality_usage.expression];
|
||||
++count;
|
||||
total_predicate_expression++;
|
||||
}
|
||||
predicate_infos.insert({predicate_type, {predicate_expressions, total_predicate_expression}});
|
||||
};
|
||||
|
||||
UniExtract uniq_extract;
|
||||
for (const auto & index_usage : skip_index_usages)
|
||||
{
|
||||
auto column_info = index_usage.first;
|
||||
|
||||
auto storage = MockEnvironment::tryGetLocalTable(column_info.database, column_info.table, context);
|
||||
if (!storage)
|
||||
throw Exception(column_info.database + "(" + column_info.table + "): can not find local table.", ErrorCodes::NOT_FOUND_EXPECTED_DATA_PART);
|
||||
|
||||
auto metadata = storage->getInMemoryMetadataCopy();
|
||||
auto column_and_type = metadata.getColumns().tryGetColumn(GetColumnsOptions::Kind::AllPhysical, column_info.column);
|
||||
if (!column_and_type)
|
||||
continue;
|
||||
|
||||
auto column_type = column_and_type->type;
|
||||
|
||||
bool check_bitmap_index = false;
|
||||
bool already_bitmap_index = false;
|
||||
if (isArray(column_type))
|
||||
{
|
||||
if (column_type->isBitmapIndex() || column_type->isSegmentBitmapIndex())
|
||||
{
|
||||
LOG_DEBUG(getLogger("ColumnUsageExtractor"), "Column " + column_info.column + " skipped because has already been a bitmap index column");
|
||||
// continue;
|
||||
already_bitmap_index = true;
|
||||
}
|
||||
check_bitmap_index = true;
|
||||
}
|
||||
|
||||
std::vector<std::string> data_path_list;
|
||||
// if (options.count("path"))
|
||||
// {
|
||||
// std::string path = options["path"].as<std::string>();
|
||||
// if (!endsWith(path, "/"))
|
||||
// path.append("/");
|
||||
// data_path_list.emplace_back(path);
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
boost::split(data_path_list, options["data-path-list"].as<std::string>(), boost::is_any_of(" ,"));
|
||||
for (auto & i : data_path_list)
|
||||
{
|
||||
if (!endsWith(i, "/"))
|
||||
i = i.append("/");
|
||||
}
|
||||
// }
|
||||
|
||||
std::string absolute_part_path;
|
||||
try
|
||||
{
|
||||
absolute_part_path = selectPartPath(options, data_path_list, storage->getStorageID().getDatabaseName(), storage->getStorageID().getTableName(), sample_row_number);
|
||||
}
|
||||
catch (Exception & e)
|
||||
{
|
||||
if (e.code() == ErrorCodes::NOT_FOUND_EXPECTED_DATA_PART)
|
||||
{
|
||||
LOG_DEBUG(
|
||||
getLogger("ColumnUsageExtractor"),
|
||||
"Can't find suitable part for table " + column_info.database + "." + column_info.table
|
||||
+ ", maybe because of the total part rows < " + std::to_string(sample_row_number));
|
||||
continue;
|
||||
}
|
||||
else
|
||||
throw e;
|
||||
}
|
||||
|
||||
SampleColumnReader reader(absolute_part_path + "/", 0, sample_row_number);
|
||||
ColumnPtr column;
|
||||
try
|
||||
{
|
||||
column = reader.readColumn({index_usage.first.column, column_type});
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
// Just skip the column if it can't be read
|
||||
LOG_DEBUG(
|
||||
getLogger("ColumnUsageExtractor"),
|
||||
"Can't read column file " + index_usage.first.column + " from table " + column_info.database + "." + column_info.table
|
||||
+ ", error message: "
|
||||
+ getCurrentExceptionMessage(true));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (check_bitmap_index)
|
||||
{
|
||||
size_t ndv = uniq_extract.executeOnColumnArray(column, column_type).get<UInt64>();
|
||||
auto bitmap_index_type = already_bitmap_index ? PotentialIndexType::ALREADY_BITMAP_INDEX : PotentialIndexType::BITMAP_INDEX;
|
||||
if (!checkColumnCardinality(column_info.getFullName(), ndv, sample_row_number, bitmap_index_type))
|
||||
continue;
|
||||
|
||||
StatisticInfo statistic_info{ndv, sample_row_number, std::get<2>(index_usage.second)};
|
||||
|
||||
PredicateInfos predicate_infos;
|
||||
make_predicate_info(predicate_infos, ColumnUsageType::ARRAY_SET_FUNCTION, std::get<0>(index_usage.second));
|
||||
|
||||
PotentialColumnInfo potential_column{bitmap_index_type, statistic_info, predicate_infos};
|
||||
potential_columns.insert({std::move(column_info), std::move(potential_column)});
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// All following: check skip index
|
||||
size_t ndv = uniq_extract.executeOnColumn(column, column_type).get<UInt64>();
|
||||
auto skip_index_type = PotentialIndexType::BLOOM_FILTER;
|
||||
if (!checkColumnCardinality(column_info.getFullName(), ndv, sample_row_number, skip_index_type))
|
||||
continue;
|
||||
|
||||
StatisticInfo statistic_info{ndv, sample_row_number, std::get<2>(index_usage.second)};
|
||||
|
||||
PredicateInfos predicate_infos;
|
||||
make_predicate_info(predicate_infos, ColumnUsageType::EQUALITY_PREDICATE, std::get<0>(index_usage.second));
|
||||
make_predicate_info(predicate_infos, ColumnUsageType::IN_PREDICATE, std::get<1>(index_usage.second));
|
||||
|
||||
PotentialColumnInfo potential_column{skip_index_type, statistic_info, predicate_infos};
|
||||
potential_columns.insert({std::move(column_info), std::move(potential_column)});
|
||||
}
|
||||
|
||||
LOG_DEBUG(getLogger("ColumnUsageExtractor"), "Extracted {} column usages", potential_columns.size());
|
||||
for ([[maybe_unused]] auto & [column_info, potential_column] : potential_columns)
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << column_info.getFullName() << "\tindex_type:" << toString(potential_column.index_type)
|
||||
<< "\tsample_ndv:" << potential_column.statistic_info.sample_ndv
|
||||
<< "\tsample_row_num:" << potential_column.statistic_info.sample_row_num << "\ttarget_expression_cnt:"
|
||||
<< potential_column.predicate_infos[ColumnUsageType::EQUALITY_PREDICATE].total
|
||||
+ potential_column.predicate_infos[ColumnUsageType::IN_PREDICATE].total
|
||||
+ potential_column.predicate_infos[ColumnUsageType::ARRAY_SET_FUNCTION].total
|
||||
<< "\ttotal_expr count:" << potential_column.statistic_info.total_predicates;
|
||||
|
||||
LOG_DEBUG(getLogger("ColumnUsageExtractor"), ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
/// TODO: separate the two indices
|
||||
void IndexAdvisor::serializeJson(WriteBuffer & buf, bool /* verbose */)
|
||||
{
|
||||
bool first = true;
|
||||
writeString("\"index\":[", buf);
|
||||
for (auto & [column_info, potential_column] : potential_columns)
|
||||
{
|
||||
if (first)
|
||||
first = false;
|
||||
else
|
||||
writeString(",", buf);
|
||||
|
||||
writeString(R"({"db":")", buf);
|
||||
writeString(column_info.database, buf);
|
||||
writeString(R"(","table":")", buf);
|
||||
writeString(column_info.table, buf);
|
||||
writeString(R"(","column_name":")", buf);
|
||||
writeString(column_info.column, buf);
|
||||
|
||||
writeString(R"(","index_type":")", buf);
|
||||
writeString(toString(potential_column.index_type), buf);
|
||||
|
||||
writeString(R"(","sample_ndv":")", buf);
|
||||
writeIntText(potential_column.statistic_info.sample_ndv, buf);
|
||||
writeString(R"(","sample_row_num":")", buf);
|
||||
writeIntText(potential_column.statistic_info.sample_row_num, buf);
|
||||
|
||||
// The usage type (EQUALITY_PREDICATE + IN_PREDICATE) and (ARRAY_SET_FUNCTION)
|
||||
// will not appear at the same time, so we can simply add the cnt
|
||||
size_t target_expression_cnt = potential_column.predicate_infos[ColumnUsageType::EQUALITY_PREDICATE].total
|
||||
+ potential_column.predicate_infos[ColumnUsageType::IN_PREDICATE].total
|
||||
+ potential_column.predicate_infos[ColumnUsageType::ARRAY_SET_FUNCTION].total;
|
||||
writeString(R"(","target_expression_cnt":")", buf);
|
||||
writeIntText(target_expression_cnt, buf);
|
||||
writeString(R"(","total_expression_cnt":")", buf);
|
||||
writeIntText(potential_column.statistic_info.total_predicates, buf);
|
||||
writeString("\"}", buf);
|
||||
}
|
||||
writeString("]", buf);
|
||||
}
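// Illustrative output of serializeJson (hypothetical values, line breaks added for readability):
//   "index":[{"db":"test_db","table":"events","column_name":"user_id","index_type":"BLOOM_FILTER",
//             "sample_ndv":"150000","sample_row_num":"200000",
//             "target_expression_cnt":"12","total_expression_cnt":"40"}]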
|
||||
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
#pragma once
|
||||
|
||||
#include "MockEnvironment.h"
|
||||
#include "PotentialColumn.h"
|
||||
|
||||
#include <boost/program_options/variables_map.hpp>
|
||||
|
||||
#include <Compression/ICompressionCodec.h>
|
||||
#include <Storages/ColumnsDescription.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
class IndexAdvisor
|
||||
{
|
||||
private:
|
||||
MockEnvironment & env;
|
||||
po::variables_map options;
|
||||
const size_t sample_row_number;
|
||||
const size_t max_threads;
|
||||
|
||||
PotentialColumns potential_columns;
|
||||
|
||||
public:
|
||||
IndexAdvisor(
|
||||
MockEnvironment & env_,
|
||||
const po::variables_map & options_,
|
||||
size_t sample_row_number,
|
||||
size_t max_threads);
|
||||
|
||||
virtual ~IndexAdvisor() = default;
|
||||
|
||||
void execute();
|
||||
void serializeJson(WriteBuffer & buf, bool verbose = false);
|
||||
};
|
||||
|
||||
}
|
|
@ -0,0 +1,330 @@
|
|||
#include "MockEnvironment.h"
|
||||
#include "MockGlobalContext.h"
|
||||
#include "SchemaAdvisorHelpers.h"
|
||||
|
||||
#include <AggregateFunctions/registerAggregateFunctions.h>
|
||||
#include <Analyzers/QueryAnalyzer.h>
|
||||
#include <Analyzers/QueryRewriter.h>
|
||||
#include <common/logger_useful.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Core/UUID.h>
|
||||
#include <Databases/DatabaseMemory.h>
|
||||
#include <Disks/registerDisks.h>
|
||||
#include <Formats/registerFormats.h>
|
||||
#include <Functions/registerFunctions.h>
|
||||
#include <Interpreters/Context.h>
|
||||
#include <Interpreters/DatabaseCatalog.h>
|
||||
#include <Interpreters/executeQuery.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Parsers/ASTCreateQuery.h>
|
||||
#include <Parsers/ASTLiteral.h>
|
||||
#include <Parsers/ASTSetQuery.h>
|
||||
#include <Parsers/IAST_fwd.h>
|
||||
#include <Parsers/IParserBase.h>
|
||||
#include <Parsers/parseQuery.h>
|
||||
#include <Parsers/ParserQuery.h>
|
||||
#include <QueryPlan/Hints/registerHints.h>
|
||||
#include <QueryPlan/QueryPlan.h>
|
||||
#include <QueryPlan/QueryPlanner.h>
|
||||
#include <Statistics/CacheManager.h>
|
||||
#include <Storages/IStorage_fwd.h>
|
||||
#include <Storages/ColumnsDescription.h>
|
||||
#include <Storages/StorageDistributed.h>
|
||||
#include <Storages/StorageMergeTree.h>
|
||||
#include <Storages/System/attachSystemTables.h>
|
||||
#include <Storages/registerStorages.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <memory>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace
|
||||
{
|
||||
std::string readString(const std::string & file_path)
|
||||
{
|
||||
std::ifstream fin(file_path);
|
||||
std::stringstream buffer;
|
||||
buffer << fin.rdbuf();
|
||||
return buffer.str();
|
||||
}
|
||||
}
|
||||
|
||||
MockEnvironment::MockEnvironment(const std::string & path, size_t max_threads)
|
||||
: session_context(MockGlobalContext::instance().createSessionContext())
|
||||
, actual_folder(path)
|
||||
, mock_folder(std::filesystem::path{"/tmp"} / ("advisor_tool_" + toString(UUIDHelpers::generateV4())))
|
||||
{
|
||||
session_context->setPath(mock_folder.string() + '/');
|
||||
session_context->setMetastorePath((mock_folder / METASTORE).string() + '/');
|
||||
|
||||
SettingsChanges setting_changes;
|
||||
setting_changes.emplace_back("max_threads", max_threads);
|
||||
setting_changes.emplace_back("enable_memory_catalog", true);
|
||||
session_context->applySettingsChanges(setting_changes);
|
||||
std::filesystem::remove_all(mock_folder);
|
||||
std::filesystem::create_directories(mock_folder);
|
||||
std::filesystem::create_directories(mock_folder / METASTORE);
|
||||
std::filesystem::create_directories(mock_folder / METADATA);
|
||||
std::filesystem::create_directories(mock_folder / DATA);
|
||||
|
||||
registerFunctions();
|
||||
registerFormats();
|
||||
registerStorages();
|
||||
registerAggregateFunctions();
|
||||
registerHints();
|
||||
registerDisks();
|
||||
Statistics::CacheManager::initialize(session_context);
|
||||
|
||||
// make system database
|
||||
DatabasePtr system_database = DatabaseCatalog::instance().tryGetDatabase(DatabaseCatalog::SYSTEM_DATABASE, session_context);
|
||||
if (!system_database)
|
||||
{
|
||||
system_database = std::make_shared<DatabaseMemory>(DatabaseCatalog::SYSTEM_DATABASE, session_context);
|
||||
DatabaseCatalog::instance().attachDatabase(DatabaseCatalog::SYSTEM_DATABASE, system_database);
|
||||
attachSystemTablesLocal(*system_database);
|
||||
}
|
||||
}
|
||||
|
||||
MockEnvironment::~MockEnvironment()
|
||||
{
|
||||
for (const auto & [name, database] : DatabaseCatalog::instance().getDatabases(session_context))
|
||||
{
|
||||
for (auto it = database->getTablesIterator(session_context); it->isValid(); it->next())
|
||||
{
|
||||
database->dropTable(session_context, it->name(), /*no_delay=*/true);
|
||||
}
|
||||
database->drop(session_context);
|
||||
}
|
||||
|
||||
std::filesystem::remove_all(mock_folder);
|
||||
}
|
||||
|
||||
std::vector<std::string> MockEnvironment::listDatabases()
|
||||
{
|
||||
std::vector<std::string> res;
|
||||
auto meta_path = actual_folder / METADATA;
|
||||
if (!std::filesystem::exists(meta_path))
|
||||
throw Exception("cannot find metadata", ErrorCodes::CANNOT_OPEN_FILE);
|
||||
for (const auto & file : std::filesystem::directory_iterator{meta_path})
|
||||
{
|
||||
const std::filesystem::path & fullname = file.path();
|
||||
if (fullname.extension() == ".sql")
|
||||
res.emplace_back(fullname.stem().string());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
std::vector<std::string> MockEnvironment::listTables(const std::string & database)
|
||||
{
|
||||
std::vector<std::string> res;
|
||||
auto meta_path = actual_folder / METADATA / database;
|
||||
if (!std::filesystem::exists(meta_path))
|
||||
throw Exception("cannot find metadata", ErrorCodes::CANNOT_OPEN_FILE);
|
||||
for (const auto & file : std::filesystem::directory_iterator{meta_path})
|
||||
{
|
||||
const std::filesystem::path & fullname = file.path();
|
||||
if (fullname.extension() == ".sql")
|
||||
res.emplace_back(fullname.stem().string());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
bool MockEnvironment::containsDatabase(const std::string & database)
|
||||
{
|
||||
return std::filesystem::exists(actual_folder / METADATA / (database + ".sql"));
|
||||
}
|
||||
|
||||
bool MockEnvironment::containsTable(const std::string & database, const std::string & table)
|
||||
{
|
||||
return std::filesystem::exists(actual_folder / METADATA / database / (table + ".sql"));
|
||||
}
|
||||
|
||||
std::string MockEnvironment::getCreateDatabaseSql(const std::string & database)
|
||||
{
|
||||
if (!containsDatabase(database))
|
||||
throw Exception("cannot find database " + database, ErrorCodes::CANNOT_OPEN_FILE);
|
||||
return readString(actual_folder / METADATA / (database + ".sql"));
|
||||
}
|
||||
|
||||
std::string MockEnvironment::getCreateTableSql(const std::string & database, const std::string & table)
|
||||
{
|
||||
if (!containsTable(database, table))
|
||||
throw Exception("cannot find table " + database + "." + table, ErrorCodes::CANNOT_OPEN_FILE);
|
||||
return readString(actual_folder / METADATA / database / (table + ".sql"));
|
||||
}
|
||||
|
||||
ColumnsDescription MockEnvironment::getColumnsDescription(const std::string & database, const std::string & table)
|
||||
{
|
||||
std::string create_table = getCreateTableSql(database, table);
|
||||
ContextMutablePtr context = createQueryContext();
|
||||
auto ast = parse(create_table, context)->as<ASTCreateQuery &>();
|
||||
return InterpreterCreateQuery::getColumnsDescription(*ast.columns_list->columns, context, ast.attach, false);
|
||||
}
|
||||
|
||||
ContextMutablePtr MockEnvironment::createQueryContext()
|
||||
{
|
||||
ContextMutablePtr query_context = Context::createCopy(session_context);
|
||||
query_context->createPlanNodeIdAllocator();
|
||||
query_context->createSymbolAllocator();
|
||||
query_context->makeQueryContext();
|
||||
return query_context;
|
||||
}
|
||||
|
||||
ASTPtr MockEnvironment::parse(std::string_view sql, ContextPtr query_context)
|
||||
{
|
||||
const char * begin = sql.data();
|
||||
const char * end = begin + sql.size();
|
||||
ParserQuery parser(end, ParserSettings::valueOf(query_context->getSettingsRef()));
|
||||
return parseQuery(
|
||||
parser, begin, end, "",
|
||||
query_context->getSettingsRef().max_query_size,
|
||||
query_context->getSettingsRef().max_parser_depth);
|
||||
}
|
||||
|
||||
QueryPlanPtr MockEnvironment::plan(std::string_view sql, ContextMutablePtr query_context)
|
||||
{
|
||||
ASTPtr ast = parse(sql, query_context);
|
||||
ast = QueryRewriter().rewrite(ast, query_context);
|
||||
AnalysisPtr analysis = QueryAnalyzer::analyze(ast, query_context);
|
||||
QueryPlanPtr query_plan = QueryPlanner().plan(ast, *analysis, query_context);
|
||||
return query_plan;
|
||||
}
|
||||
|
||||
void MockEnvironment::execute(const std::string & sql, ContextMutablePtr query_context)
|
||||
{
|
||||
executeQuery(sql, query_context, /*internal=*/true);
|
||||
}
|
||||
|
||||
void MockEnvironment::createMockDatabase(const std::string & database)
|
||||
{
|
||||
if (DatabaseCatalog::instance().isDatabaseExist(database, session_context))
|
||||
return;
|
||||
ContextMutablePtr query_context = createQueryContext();
|
||||
std::string sql = getCreateDatabaseSql(database);
|
||||
// the sql is "attach _ ..." in metadata, we revert it
|
||||
auto ast = dynamic_pointer_cast<ASTCreateQuery>(parse(sql, query_context));
|
||||
if (!ast)
|
||||
throw Exception("failed to create database " + database + ", invalid sql: " + sql, ErrorCodes::BAD_ARGUMENTS);
|
||||
ast->attach = false;
|
||||
ast->database = database;
|
||||
ast->uuid = UUIDHelpers::Nil;
|
||||
// there are some problems with destructing an Atomic database, so we force the engine to Memory
|
||||
if (ast->storage && ast->storage->engine)
|
||||
ast->storage->engine->name = "Memory";
|
||||
ast->cluster = "";
|
||||
execute(serializeAST(*ast), query_context);
|
||||
}
|
||||
|
||||
void MockEnvironment::createMockTable(const std::string & database, const std::string & table)
|
||||
{
|
||||
createMockDatabase(database);
|
||||
if (DatabaseCatalog::instance().getDatabase(database, session_context)->isTableExist(table, session_context))
|
||||
return;
|
||||
ContextMutablePtr query_context = createQueryContext();
|
||||
SettingsChanges setting_changes;
|
||||
setting_changes.emplace_back("enable_constraint_check", false);
|
||||
setting_changes.emplace_back("allow_nullable_key", true);
|
||||
query_context->applySettingsChanges(setting_changes);
|
||||
|
||||
std::string sql = getCreateTableSql(database, table);
|
||||
// the sql is "attach _ ..." in metadata, we revert it
|
||||
auto ast = dynamic_pointer_cast<ASTCreateQuery>(parse(sql, query_context));
|
||||
if (!ast)
|
||||
throw Exception("failed to create table " + database + "." + table + ", invalid sql: " + sql, ErrorCodes::BAD_ARGUMENTS);
|
||||
ast->attach = false;
|
||||
ast->database = database;
|
||||
ast->table = table;
|
||||
ast->uuid = UUIDHelpers::Nil;
|
||||
ast->cluster = "";
|
||||
|
||||
if (ast->storage && ast->storage->engine)
|
||||
{
|
||||
auto engine_name = ast->storage->engine->name;
|
||||
if (engine_name == "Distributed")
|
||||
ast->storage->engine->arguments->children[0] = std::make_shared<ASTLiteral>(MockGlobalContext::ADVISOR_SHARD);
|
||||
else if (engine_name.starts_with("Ha"))
|
||||
{
|
||||
// HaUniqueMergeTree and HaMergeTree require ZooKeeper, so fall back to the corresponding non-Ha engine
|
||||
engine_name = engine_name.substr(2, engine_name.length());
|
||||
ASTPtr mock_engine = makeASTFunction(engine_name);
|
||||
ast->storage->set(ast->storage->engine, mock_engine);
|
||||
}
|
||||
|
||||
if (engine_name == "MergeTree")
|
||||
{
|
||||
ASTSetQuery * settings = ast->storage->settings;
|
||||
if (!settings)
|
||||
ast->storage->set(settings, std::make_shared<ASTSetQuery>());
|
||||
settings->is_standalone = false;
|
||||
settings->changes.emplace_back("enable_metastore", false);
|
||||
}
|
||||
|
||||
if (engine_name == "UniqueMergeTree")
|
||||
{
|
||||
ASTSetQuery * settings = ast->storage->settings;
|
||||
if (!settings)
|
||||
ast->storage->set(settings, std::make_shared<ASTSetQuery>());
|
||||
settings->is_standalone = false;
|
||||
settings->changes.emplace_back("part_writer_flag", true);
|
||||
settings->changes.emplace_back("enable_metastore", false);
|
||||
}
|
||||
}
|
||||
|
||||
std::string create_sql = serializeAST(*ast);
|
||||
try
|
||||
{
|
||||
execute(std::move(create_sql), query_context);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
LOG_ERROR(getLogger("MockEnvironment"), "Create table {} failed: {}", table, getCurrentExceptionMessage(true));
|
||||
}
|
||||
}
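// Illustrative effect of the engine rewriting above (hypothetical DDL, not part of this change):
//   ENGINE = Distributed(prod_cluster, db, local_table)  -> ENGINE = Distributed('advisor_shard', db, local_table)
//   ENGINE = HaMergeTree('/zk/path', 'r1') ORDER BY key  -> ENGINE = MergeTree ORDER BY key,
//   with enable_metastore = 0 (and part_writer_flag = 1 for UniqueMergeTree) applied via settings.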
|
||||
|
||||
bool MockEnvironment::isPrimaryKey(const QualifiedColumnName & column, ContextPtr context)
|
||||
{
|
||||
StoragePtr table = tryGetLocalTable(column.database, column.table, context);
|
||||
|
||||
if (!table)
|
||||
return false;
|
||||
|
||||
auto metadata = table->getInMemoryMetadataCopy();
|
||||
std::optional<KeyDescription> primary_key = std::nullopt;
|
||||
if (metadata.isPrimaryKeyDefined())
|
||||
primary_key = metadata.getPrimaryKey();
|
||||
// From CH: By default the primary key is the same as the sorting key (which is specified by the ORDER BY clause).
|
||||
// Thus in most cases it is unnecessary to specify a separate PRIMARY KEY clause.
|
||||
else if (auto merge_tree = dynamic_pointer_cast<StorageMergeTree>(table); merge_tree && metadata.isSortingKeyDefined())
|
||||
primary_key = metadata.getSortingKey();
|
||||
|
||||
if (!primary_key)
|
||||
return false;
|
||||
|
||||
const auto & primary_key_columns = primary_key.value().expression->getRequiredColumns();
|
||||
return std::find(primary_key_columns.begin(), primary_key_columns.end(), column.column) != primary_key_columns.end();
|
||||
}
|
||||
|
||||
StoragePtr MockEnvironment::tryGetLocalTable(const std::string & database_name, const std::string & table_name, ContextPtr context)
|
||||
{
|
||||
StoragePtr table;
|
||||
|
||||
if (DatabasePtr database = DatabaseCatalog::instance().tryGetDatabase(database_name, context))
|
||||
table = database->tryGetTable(table_name, context);
|
||||
|
||||
if (auto distributed = dynamic_pointer_cast<StorageDistributed>(table))
|
||||
if (auto remote_database = DatabaseCatalog::instance().tryGetDatabase(distributed->getRemoteDatabaseName(), context))
|
||||
if (auto remote_table = remote_database->tryGetTable(distributed->getRemoteTableName(), context))
|
||||
table = remote_table;
|
||||
|
||||
return table;
|
||||
}
|
||||
|
||||
|
||||
|
||||
} // DB
|
|
@ -0,0 +1,63 @@
|
|||
#pragma once
|
||||
|
||||
#include "Interpreters/StorageID.h"
|
||||
#include "MockGlobalContext.h"
|
||||
|
||||
#include <Analyzers/QualifiedColumnName.h>
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <Parsers/IAST_fwd.h>
|
||||
#include <QueryPlan/QueryPlan.h>
|
||||
#include <Storages/ColumnsDescription.h>
|
||||
#include <Storages/IStorage_fwd.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class MockEnvironment
|
||||
{
|
||||
public:
|
||||
explicit MockEnvironment(const std::string & path, size_t max_threads);
|
||||
MockEnvironment(const MockEnvironment &other) = delete;
|
||||
~MockEnvironment();
|
||||
|
||||
// list the available databases or tables under the given path
|
||||
std::vector<std::string> listDatabases();
|
||||
std::vector<std::string> listTables(const std::string & database);
|
||||
bool containsDatabase(const std::string & database);
|
||||
bool containsTable(const std::string & database, const std::string & table);
|
||||
|
||||
// get the create-table/database sql
|
||||
std::string getCreateDatabaseSql(const std::string & database);
|
||||
std::string getCreateTableSql(const std::string & database, const std::string & table);
|
||||
ColumnsDescription getColumnsDescription(const std::string & database, const std::string & table);
|
||||
|
||||
// mock the query execution environment
|
||||
ContextMutablePtr createQueryContext();
|
||||
ASTPtr parse(std::string_view sql, ContextPtr query_context);
|
||||
QueryPlanPtr plan(std::string_view sql, ContextMutablePtr query_context); // no optimize
|
||||
void execute(const std::string & sql, ContextMutablePtr query_context); // supposed to execute ddl only
|
||||
|
||||
void createMockDatabase(const std::string & database);
|
||||
void createMockTable(const std::string & database, const std::string & table);
|
||||
|
||||
static bool isPrimaryKey(const QualifiedColumnName & column, ContextPtr context);
|
||||
static StoragePtr tryGetLocalTable(const std::string & database_name, const std::string & table_name, ContextPtr context);
|
||||
|
||||
private:
|
||||
ContextMutablePtr session_context;
|
||||
const std::filesystem::path actual_folder;
|
||||
const std::filesystem::path mock_folder;
|
||||
static constexpr const char * METADATA = "metadata";
|
||||
static constexpr const char * METASTORE = "metastore";
|
||||
static constexpr const char * DATA = "data";
|
||||
};
|
||||
|
||||
|
||||
} // DB
|
|
@ -0,0 +1,68 @@
|
|||
#include "MockGlobalContext.h"
|
||||
|
||||
#include <Common/Config/ConfigProcessor.h>
|
||||
#include <Interpreters/Context.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
#include <Poco/AutoPtr.h>
|
||||
#include <Poco/DOM/Document.h>
|
||||
#include <Poco/DOM/Element.h>
|
||||
#include <Poco/DOM/Text.h>
|
||||
#include <Poco/Util/XMLConfiguration.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
ContextMutablePtr MockGlobalContext::createSessionContext()
|
||||
{
|
||||
ContextMutablePtr session_context = Context::createCopy(context);
|
||||
session_context->makeSessionContext();
|
||||
return session_context;
|
||||
}
|
||||
|
||||
MockGlobalContext::MockGlobalContext()
|
||||
{
|
||||
shared_context = Context::createShared();
|
||||
context = Context::createGlobal(shared_context.get());
|
||||
context->makeGlobalContext();
|
||||
ConfigurationPtr configuration(new Poco::Util::XMLConfiguration(MockGlobalContext::mockConfig()));
|
||||
context->setConfig(configuration);
|
||||
}
|
||||
|
||||
XMLDocumentPtr MockGlobalContext::mockConfig()
|
||||
{
|
||||
XMLDocumentPtr document = new Poco::XML::Document();
|
||||
Poco::AutoPtr<Poco::XML::Element> yandex = document->createElement("yandex");
|
||||
Poco::AutoPtr<Poco::XML::Element> remote_servers = document->createElement("remote_servers");
|
||||
Poco::AutoPtr<Poco::XML::Element> advisor_shard = document->createElement(ADVISOR_SHARD);
|
||||
Poco::AutoPtr<Poco::XML::Element> shard = document->createElement("shard");
|
||||
Poco::AutoPtr<Poco::XML::Element> replica = document->createElement("replica");
|
||||
|
||||
Poco::AutoPtr<Poco::XML::Element> host = document->createElement("host");
|
||||
Poco::AutoPtr<Poco::XML::Text> host_text = document->createTextNode("localhost");
|
||||
host->appendChild(host_text);
|
||||
replica->appendChild(host);
|
||||
|
||||
Poco::AutoPtr<Poco::XML::Element> port = document->createElement("port");
|
||||
Poco::AutoPtr<Poco::XML::Text> port_text = document->createTextNode("9000");
|
||||
port->appendChild(port_text);
|
||||
replica->appendChild(port);
|
||||
|
||||
Poco::AutoPtr<Poco::XML::Element> exchange_port = document->createElement("exchange_port");
|
||||
Poco::AutoPtr<Poco::XML::Text> exchange_port_text = document->createTextNode("9300");
|
||||
exchange_port->appendChild(exchange_port_text);
|
||||
replica->appendChild(exchange_port);
|
||||
|
||||
Poco::AutoPtr<Poco::XML::Element> exchange_status_port = document->createElement("exchange_status_port");
|
||||
Poco::AutoPtr<Poco::XML::Text> exchange_status_port_text = document->createTextNode("9400");
|
||||
exchange_status_port->appendChild(exchange_status_port_text);
|
||||
replica->appendChild(exchange_status_port);
|
||||
|
||||
shard->appendChild(replica);
|
||||
advisor_shard->appendChild(shard);
|
||||
remote_servers->appendChild(advisor_shard);
|
||||
yandex->appendChild(remote_servers);
|
||||
document->appendChild(yandex);
|
||||
|
||||
return document;
|
||||
}
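// For reference, the configuration built above is equivalent to the following XML:
//   <yandex>
//     <remote_servers>
//       <advisor_shard>
//         <shard>
//           <replica>
//             <host>localhost</host>
//             <port>9000</port>
//             <exchange_port>9300</exchange_port>
//             <exchange_status_port>9400</exchange_status_port>
//           </replica>
//         </shard>
//       </advisor_shard>
//     </remote_servers>
//   </yandex>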
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
#pragma once
|
||||
|
||||
#include <Common/Config/ConfigProcessor.h>
|
||||
#include <Interpreters/Context.h>
|
||||
#include <Interpreters/Context_fwd.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
class MockGlobalContext
|
||||
{
|
||||
public:
|
||||
static constexpr const char * ADVISOR_SHARD = "advisor_shard";
|
||||
|
||||
static MockGlobalContext & instance()
|
||||
{
|
||||
static MockGlobalContext mock_context;
|
||||
return mock_context;
|
||||
}
|
||||
|
||||
ContextMutablePtr createSessionContext();
|
||||
|
||||
private:
|
||||
explicit MockGlobalContext();
|
||||
static XMLDocumentPtr mockConfig();
|
||||
|
||||
SharedContextHolder shared_context;
|
||||
ContextMutablePtr context;
|
||||
};
|
||||
|
||||
}
|
|
@ -0,0 +1,101 @@
|
|||
#pragma once
|
||||
|
||||
#include <Analyzers/QualifiedColumnName.h>
|
||||
#include <Advisor/ColumnUsage.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/**
|
||||
* StatisticInfo:
|
||||
* - sample_ndv: the ndv for a column in the particular part
|
||||
* - sample_row_num: the number of sampled rows for the particular part
|
||||
* - total_predicates: the total number of predicates involving the specified column, including IN, equality and others
|
||||
*/
|
||||
struct StatisticInfo
|
||||
{
|
||||
size_t sample_ndv{};
|
||||
size_t sample_row_num{};
|
||||
size_t total_predicates{};
|
||||
};
|
||||
|
||||
/**
|
||||
* PredicateInfo:
|
||||
* - PredicateExpressions: predicate_ast_expression - count
|
||||
* - total: The total number of occurrences of a ColumnUsageType
|
||||
*/
|
||||
using PredicateExpressions = std::unordered_map<ConstASTPtr, size_t>;
|
||||
struct PredicateInfo
|
||||
{
|
||||
PredicateExpressions expressions;
|
||||
size_t total{};
|
||||
|
||||
PredicateInfo() = default;
|
||||
PredicateInfo(PredicateExpressions & expressions_, size_t total_): expressions(std::move(expressions_)), total(total_) {}
|
||||
};
|
||||
using PredicateInfos = std::unordered_map<ColumnUsageType, PredicateInfo>;
|
||||
|
||||
enum class PotentialIndexType
|
||||
{
|
||||
BLOOM_FILTER, // just support bloom_filter for skip index
|
||||
BITMAP_INDEX,
|
||||
SEGMENT_BITMAP_INDEX, // For high cardinality
|
||||
ALREADY_BITMAP_INDEX, // used in tests, to find columns that already have a bitmap index
|
||||
};
|
||||
|
||||
inline std::string toString(PotentialIndexType indexType)
|
||||
{
|
||||
switch (indexType)
|
||||
{
|
||||
case PotentialIndexType::BLOOM_FILTER:
|
||||
return "BLOOM_FILTER";
|
||||
case PotentialIndexType::BITMAP_INDEX:
|
||||
return "BITMAP_INDEX";
|
||||
case PotentialIndexType::SEGMENT_BITMAP_INDEX:
|
||||
return "SEGMENT_BITMAP_INDEX";
|
||||
case PotentialIndexType::ALREADY_BITMAP_INDEX:
|
||||
return "ALREADY_BITMAP_INDEX";
|
||||
default:
|
||||
return "Unknown";
|
||||
}
|
||||
}
|
||||
|
||||
struct PotentialColumnInfo
|
||||
{
|
||||
PotentialIndexType index_type;
|
||||
StatisticInfo statistic_info;
|
||||
PredicateInfos predicate_infos;
|
||||
};
|
||||
using PotentialColumns = std::unordered_map<QualifiedColumnName, PotentialColumnInfo, QualifiedColumnNameHash>;
|
||||
|
||||
using PotentialPrewhereColumns = std::unordered_map<QualifiedColumnName, std::pair<Float64, Float64>, QualifiedColumnNameHash>;
|
||||
|
||||
struct IndexOverhead
|
||||
{
|
||||
size_t hashs;
|
||||
size_t bits_per_rows;
|
||||
size_t uncompressed_index_size;
|
||||
};
|
||||
|
||||
using IndexSelectors = std::vector<Float64>;
|
||||
struct IndexEffect
|
||||
{
|
||||
IndexSelectors index_selectors;
|
||||
size_t total_expressions;
|
||||
};
|
||||
|
||||
struct PotentialIndex
|
||||
{
|
||||
QualifiedColumnName column;
|
||||
PotentialIndexType index_type;
|
||||
Float32 false_positive_rate;
|
||||
|
||||
IndexOverhead index_overhead;
|
||||
IndexEffect index_effect;
|
||||
|
||||
StatisticInfo statistic_info;
|
||||
PredicateInfos predicate_infos;
|
||||
};
|
||||
|
||||
}
|
|
@ -0,0 +1,238 @@
|
|||
#include "PrewhereAdvisor.h"
|
||||
#include "ColumnUsageExtractor.h"
|
||||
#include "Columns/IColumn.h"
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include "Core/Field.h"
|
||||
#include "IO/WriteIntText.h"
|
||||
#include "QueryPlan/PlanSerDerHelper.h"
|
||||
#include "SampleColumnReader.h"
|
||||
#include "Statistics.h"
|
||||
#include "SchemaAdvisorHelpers.h"
|
||||
#include "PotentialColumn.h"
|
||||
|
||||
#include <cstddef>
|
||||
#include <iostream>
|
||||
#include <utility>
|
||||
#include <boost/algorithm/string/classification.hpp>
|
||||
#include <boost/algorithm/string/split.hpp>
|
||||
|
||||
#include "Common/Exception.h"
|
||||
#include "common/types.h"
|
||||
#include <Common/ThreadPool.h>
|
||||
#include <Core/NamesAndTypes.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <IO/copyData.h>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
#include <IO/ReadBufferFromFileDescriptor.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteBufferFromFile.h>
|
||||
#include <IO/WriteBufferFromFileDescriptor.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
PrewhereAdvisor::PrewhereAdvisor(
|
||||
MockEnvironment & env_,
|
||||
const po::variables_map & options_,
|
||||
size_t sample_row_number_,
|
||||
size_t max_threads_,
|
||||
Float64 mark_filter_threshold_,
|
||||
Float64 top_3_mark_filter_threshold_)
|
||||
: env(env_)
|
||||
, options(options_)
|
||||
, sample_row_number(sample_row_number_)
|
||||
, max_threads(max_threads_)
|
||||
, mark_filter_threshold(mark_filter_threshold_)
|
||||
, top_3_mark_filter_threshold(top_3_mark_filter_threshold_)
|
||||
, sample_mark_number(static_cast<Float64>(sample_row_number_) / 8192)
|
||||
, column_size_threshold(128 * sample_row_number_)
|
||||
{
|
||||
}
|
||||
|
||||
Float64 PrewhereAdvisor::calcMarkFilterRatio(const Field & field) const
|
||||
{
|
||||
const auto & array_field = DB::safeGet<Array>(field);
|
||||
|
||||
size_t total_marks = 0;
|
||||
for (const auto & tuple_field : array_field)
|
||||
{
|
||||
total_marks += DB::safeGet<UInt64>(DB::safeGet<Tuple>(tuple_field)[1]);
|
||||
}
|
||||
|
||||
Float64 avg_mark_occurrence_rate = static_cast<Float64>(total_marks) / array_field.size();
|
||||
|
||||
return avg_mark_occurrence_rate / sample_mark_number;
|
||||
}
|
||||
|
||||
std::pair<Float64, Float64> PrewhereAdvisor::calcMarkFilterRatio(const ColumnPtr & column) const
|
||||
{
|
||||
const auto * array_column = typeid_cast<const ColumnArray *>(column.get());
|
||||
const auto * tuple_column = typeid_cast<const ColumnTuple *>((array_column->getDataPtr()).get());
|
||||
const auto * mark_count_column = typeid_cast<const ColumnUInt64 *>(tuple_column->getColumns()[1].get());
|
||||
|
||||
size_t total_marks = 0;
|
||||
std::priority_queue<UInt64, std::vector<UInt64>, std::greater<UInt64>> top_3_mark_pq;
|
||||
for (size_t i = 0; i < mark_count_column->size(); i++)
|
||||
{
|
||||
auto current_mark = mark_count_column->get64(i);
|
||||
total_marks += current_mark;
|
||||
|
||||
if (top_3_mark_pq.size() < 3)
|
||||
{
|
||||
top_3_mark_pq.push(current_mark);
|
||||
}
|
||||
else if (current_mark > top_3_mark_pq.top())
|
||||
{
|
||||
top_3_mark_pq.pop();
|
||||
top_3_mark_pq.push(current_mark);
|
||||
}
|
||||
}
|
||||
size_t queue_size = top_3_mark_pq.size();
|
||||
|
||||
size_t top_3_mark_sum = 0;
|
||||
while (!top_3_mark_pq.empty())
|
||||
{
|
||||
top_3_mark_sum += top_3_mark_pq.top();
|
||||
top_3_mark_pq.pop();
|
||||
}
|
||||
|
||||
Float64 avg_mark_occurrence_rate = static_cast<Float64>(total_marks) / mark_count_column->size();
|
||||
Float64 top_3_mark_occurrence_rate = static_cast<Float64>(top_3_mark_sum) / queue_size;
|
||||
|
||||
return std::make_pair(avg_mark_occurrence_rate / sample_mark_number, top_3_mark_occurrence_rate / sample_mark_number);
|
||||
}
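// Worked example (illustrative numbers): with sample_row_number = 819,200, sample_mark_number is
// 819,200 / 8192 = 100 marks. If the sampled column has 100 distinct values spread over 500 marks
// in total, the average mark occurrence is 5 and the mark filter ratio is 5 / 100 = 0.05; if the
// three most frequent values hit 80, 70 and 60 marks, the top-3 ratio is ((80+70+60)/3) / 100 = 0.7.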
|
||||
|
||||
void PrewhereAdvisor::execute()
|
||||
{
|
||||
auto context = createContext(options, env);
|
||||
auto queries = loadQueries(options);
|
||||
|
||||
LOG_DEBUG(getLogger("PrewhereAdvisor"), "++++++++++ begin to executor prewhere advisor ++++++++++");
|
||||
|
||||
ColumnUsageExtractor extractor(context, max_threads);
|
||||
auto column_usages = extractor.extractColumnUsages(queries);
|
||||
auto prewhere_usages = extractor.extractUsageForPrewhere(column_usages);
|
||||
|
||||
LOG_DEBUG(getLogger("PrewhereAdvisor"), "Extracted {} prewhere_usages usages for prewhere", prewhere_usages.size());
|
||||
|
||||
CountByGranularity count_by_granularity;
|
||||
|
||||
for (const auto & prewhere_usage : prewhere_usages)
|
||||
{
|
||||
auto column_info = prewhere_usage.first;
|
||||
|
||||
auto storage = MockEnvironment::tryGetLocalTable(column_info.database, column_info.table, context);
|
||||
if (!storage)
|
||||
throw Exception(column_info.database + "(" + column_info.table + "): can not find local table.", ErrorCodes::NOT_FOUND_EXPECTED_DATA_PART);
|
||||
|
||||
auto metadata = storage->getInMemoryMetadataCopy();
|
||||
auto column_and_type = metadata.getColumns().tryGetColumn(GetColumnsOptions::Kind::AllPhysical, column_info.column);
|
||||
if (!column_and_type)
|
||||
continue;
|
||||
|
||||
auto column_type = column_and_type->type;
|
||||
|
||||
if (isArray(column_type))
|
||||
continue;
|
||||
|
||||
std::vector<std::string> data_path_list;
|
||||
|
||||
boost::split(data_path_list, options["data-path-list"].as<std::string>(), boost::is_any_of(" ,"));
|
||||
for (auto & path : data_path_list)
|
||||
{
|
||||
if (!endsWith(path, "/"))
|
||||
path = path.append("/");
|
||||
}
|
||||
|
||||
std::string absolute_part_path;
|
||||
try
|
||||
{
|
||||
absolute_part_path = selectPartPath(options, data_path_list, storage->getStorageID().getDatabaseName(), storage->getStorageID().getTableName(), sample_row_number);
|
||||
}
|
||||
catch (Exception & e)
|
||||
{
|
||||
if (e.code() == ErrorCodes::NOT_FOUND_EXPECTED_DATA_PART)
|
||||
{
|
||||
LOG_DEBUG(
|
||||
getLogger("PrewhereAdvisor"),
|
||||
"Can't find suitable part for table " + column_info.database + "." + column_info.table
|
||||
+ ", maybe because of the total part rows < " + std::to_string(sample_row_number));
|
||||
continue;
|
||||
}
|
||||
else
|
||||
throw e;
|
||||
}
|
||||
|
||||
SampleColumnReader reader(absolute_part_path + "/", 0, sample_row_number);
|
||||
ColumnPtr column;
|
||||
try
|
||||
{
|
||||
column = reader.readColumn({prewhere_usage.first.column, column_type});
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
// Just skip the column if it can't be read
|
||||
LOG_DEBUG(
|
||||
getLogger("PrewhereAdvisor"),
|
||||
"Can't read column file " + prewhere_usage.first.column + " from table " + column_info.database + "." + column_info.table
|
||||
+ ", error message: "
|
||||
+ getCurrentExceptionMessage(true));
|
||||
continue;
|
||||
}
|
||||
|
||||
std::pair<Float64, Float64> mark_filter_pair;
|
||||
try
|
||||
{
|
||||
mark_filter_pair = calcMarkFilterRatio(count_by_granularity.executeOnColumn(column, column_type));
|
||||
}
|
||||
catch (Exception & e)
|
||||
{
|
||||
if (e.code() == ErrorCodes::BAD_ARGUMENTS)
|
||||
{
|
||||
LOG_DEBUG(
|
||||
getLogger("PrewhereAdvisor"), "Error while calculate mark filter ratio, error message: " + e.message());
|
||||
continue;
|
||||
}
|
||||
else
|
||||
throw e;
|
||||
}
|
||||
|
||||
LOG_DEBUG(getLogger("PrewhereAdvisor"), "Column {} mark filter ratio is {}, top-3 mark filter ratio is {}, column size is {} MB", column_info.column, mark_filter_pair.first, mark_filter_pair.second, column->byteSize()/1000000);
|
||||
|
||||
if (((mark_filter_pair.first <= mark_filter_threshold && mark_filter_pair.second <= top_3_mark_filter_threshold) || (mark_filter_pair.first < 0.1 && mark_filter_pair.second < 0.76)) && column->byteSize() < column_size_threshold)
|
||||
potential_columns.insert({std::move(column_info), mark_filter_pair});
|
||||
}
|
||||
}
|
||||
|
||||
/// TODO: separate the two indices
|
||||
void PrewhereAdvisor::serializeJson(WriteBuffer & buf, bool /* verbose */)
|
||||
{
|
||||
bool first = true;
|
||||
writeString("\"prewhere\":[", buf);
|
||||
for (auto & [column_info, mark_filter_ratio] : potential_columns)
|
||||
{
|
||||
if (first)
|
||||
first = false;
|
||||
else
|
||||
writeString(",", buf);
|
||||
|
||||
writeString(R"({"db":")", buf);
|
||||
writeString(column_info.database, buf);
|
||||
writeString(R"(","table":")", buf);
|
||||
writeString(column_info.table, buf);
|
||||
writeString(R"(","column_name":")", buf);
|
||||
writeString(column_info.column, buf);
|
||||
|
||||
writeString(R"(","mark_filter_ratio":")", buf);
|
||||
writeString(toString(mark_filter_ratio.first), buf);
|
||||
writeString(R"(","top_3_mark_filter_ratio":")", buf);
|
||||
writeString(toString(mark_filter_ratio.second), buf);
|
||||
|
||||
writeString("\"}", buf);
|
||||
}
|
||||
writeString("]", buf);
|
||||
}
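// Illustrative output of serializeJson (hypothetical values):
//   "prewhere":[{"db":"test_db","table":"events","column_name":"status",
//                "mark_filter_ratio":"0.05","top_3_mark_filter_ratio":"0.7"}]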
|
||||
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
#pragma once
|
||||
|
||||
#include "MockEnvironment.h"
|
||||
#include "PotentialColumn.h"
|
||||
|
||||
#include <boost/program_options/variables_map.hpp>
|
||||
|
||||
#include <Compression/ICompressionCodec.h>
|
||||
#include <Storages/ColumnsDescription.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
class PrewhereAdvisor
|
||||
{
|
||||
private:
|
||||
MockEnvironment & env;
|
||||
po::variables_map options;
|
||||
const size_t sample_row_number;
|
||||
const size_t max_threads;
|
||||
const Float64 mark_filter_threshold;
|
||||
const Float64 top_3_mark_filter_threshold;
|
||||
const Float64 sample_mark_number;
|
||||
// We do not push the column down if the average field size > 128 bytes
|
||||
const size_t column_size_threshold;
|
||||
|
||||
PotentialPrewhereColumns potential_columns;
|
||||
|
||||
Float64 calcMarkFilterRatio(const Field & field) const;
|
||||
std::pair<Float64, Float64> calcMarkFilterRatio(const ColumnPtr & column) const;
|
||||
public:
|
||||
PrewhereAdvisor(
|
||||
MockEnvironment & env_,
|
||||
const po::variables_map & options_,
|
||||
size_t sample_row_number_,
|
||||
size_t max_threads_,
|
||||
Float64 mark_filter_threshold_,
|
||||
Float64 top_3_mark_filter_threshold_);
|
||||
|
||||
virtual ~PrewhereAdvisor() = default;
|
||||
|
||||
void execute();
|
||||
void serializeJson(WriteBuffer & buf, bool verbose = false);
|
||||
};
|
||||
|
||||
}
|
|
@ -0,0 +1,341 @@
|
|||
#include "SampleColumnReader.h"
|
||||
|
||||
#include <filesystem>
|
||||
#include <Common/MemoryTrackerBlockerInThread.h>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
#include <Poco/DirectoryIterator.h>
|
||||
#include <Storages/MergeTree/MergeTreeIndexGranularityInfo.h>
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ARGUMENT_OUT_OF_BOUND;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int CANNOT_SEEK_THROUGH_FILE;
|
||||
extern const int CORRUPTED_DATA;
|
||||
extern const int NO_FILE_IN_DATA_PART;
|
||||
extern const int UNKNOWN_PART_TYPE;
|
||||
}
|
||||
|
||||
SampleColumnIndexGranularityInfo::SampleColumnIndexGranularityInfo(const String & path_to_part)
|
||||
{
|
||||
auto mrk_ext = getMarksExtensionFromFilesystem(path_to_part);
|
||||
if (*mrk_ext == getNonAdaptiveMrkExtension())
|
||||
{
|
||||
is_adaptive = false;
|
||||
part_type = MergeTreeDataPartType::WIDE;
|
||||
marks_file_extension = *mrk_ext;
|
||||
}
|
||||
else if (*mrk_ext == getAdaptiveMrkExtension(MergeTreeDataPartType::WIDE))
|
||||
{
|
||||
is_adaptive = true;
|
||||
part_type = MergeTreeDataPartType::WIDE;
|
||||
marks_file_extension = *mrk_ext;
|
||||
}
|
||||
else
|
||||
{
|
||||
throw Exception("Can't determine part type, because of unsupported mark extension " + *mrk_ext, ErrorCodes::UNKNOWN_PART_TYPE);
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<std::string> SampleColumnIndexGranularityInfo::getMarksExtensionFromFilesystem(const String & path_to_part)
|
||||
{
|
||||
if (std::filesystem::exists(path_to_part))
|
||||
{
|
||||
Poco::DirectoryIterator end;
|
||||
for (Poco::DirectoryIterator it(path_to_part); it != end; ++it)
|
||||
{
|
||||
const auto & ext = std::filesystem::path(it->path()).extension();
|
||||
if (ext == getNonAdaptiveMrkExtension()
|
||||
|| ext == getAdaptiveMrkExtension(MergeTreeDataPartType::WIDE)
|
||||
|| ext == getAdaptiveMrkExtension(MergeTreeDataPartType::COMPACT))
|
||||
return ext;
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
std::string SampleColumnIndexGranularityInfo::getAdaptiveMrkExtension(MergeTreeDataPartType part_type_)
|
||||
{
|
||||
if (part_type_ == MergeTreeDataPartType::WIDE)
|
||||
return ".mrk2";
|
||||
else if (part_type_ == MergeTreeDataPartType::COMPACT)
|
||||
return ".mrk3";
|
||||
else if (part_type_ == MergeTreeDataPartType::IN_MEMORY)
|
||||
return "";
|
||||
else
|
||||
throw Exception("Unknown part type", ErrorCodes::UNKNOWN_PART_TYPE);
|
||||
}
|
||||
|
||||
size_t SampleColumnIndexGranularityInfo::getMarkSizeInBytes() const
|
||||
{
|
||||
if (part_type == MergeTreeDataPartType::WIDE)
|
||||
return is_adaptive ? getAdaptiveMrkSizeWide() : getNonAdaptiveMrkSizeWide();
|
||||
else
|
||||
throw Exception("Unsupported type: " + part_type.toString(), ErrorCodes::UNKNOWN_PART_TYPE);
|
||||
}
|
||||
|
||||
size_t SampleColumnIndexGranularityInfo::getMarksCount(const String & path_prefix) const
|
||||
{
|
||||
std::string marks_file_path = getMarksFilePath(path_prefix);
|
||||
if (!std::filesystem::exists(marks_file_path))
|
||||
throw Exception("Marks file '" + marks_file_path + "' doesn't exist", ErrorCodes::NO_FILE_IN_DATA_PART);
|
||||
|
||||
size_t marks_file_size = std::filesystem::file_size(marks_file_path);
|
||||
return marks_file_size / getMarkSizeInBytes();
|
||||
}
|
||||
|
||||
size_t SampleColumnIndexGranularityInfo::getMarksTotalSizeInBytes(const String & path_prefix) const
|
||||
{
|
||||
std::string marks_file_path = getMarksFilePath(path_prefix);
|
||||
if (!std::filesystem::exists(marks_file_path))
|
||||
throw Exception("Marks file '" + marks_file_path + "' doesn't exist", ErrorCodes::NO_FILE_IN_DATA_PART);
|
||||
|
||||
return std::filesystem::file_size(marks_file_path);
|
||||
}
|
||||
|
||||
SampleColumnMarksLoader::SampleColumnMarksLoader(
|
||||
const String & path_prefix_,
|
||||
const String & stream_name_,
|
||||
size_t marks_count_,
|
||||
const SampleColumnIndexGranularityInfo & index_granularity_info_,
|
||||
off_t mark_file_offset_,
|
||||
size_t mark_file_size_)
|
||||
: mrk_path(index_granularity_info_.getMarksFilePath(path_prefix_))
|
||||
, stream_name(stream_name_)
|
||||
, marks_count(marks_count_)
|
||||
, mark_file_offset(mark_file_offset_)
|
||||
, mark_file_size(mark_file_size_)
|
||||
, index_granularity_info(index_granularity_info_) {}
|
||||
|
||||
const MarkInCompressedFile & SampleColumnMarksLoader::getMark(size_t row_index)
|
||||
{
|
||||
if (!marks)
|
||||
loadMarks();
|
||||
|
||||
return (*marks)[row_index];
|
||||
}
|
||||
|
||||
SampleColumnMarksLoader::MarksPtr SampleColumnMarksLoader::loadMarksImpl()
|
||||
{
|
||||
/// Memory for marks must not be accounted as memory usage for query, because they are stored in shared cache.
|
||||
MemoryTrackerBlockerInThread temporarily_disable_memory_tracker;
|
||||
|
||||
size_t mark_size = index_granularity_info.getMarkSizeInBytes();
|
||||
size_t expected_file_size = mark_size * marks_count;
|
||||
|
||||
if (expected_file_size != mark_file_size)
|
||||
throw Exception(
|
||||
"Bad size of marks file '" + mrk_path + "' for stream '" + stream_name + "': " + std::to_string(mark_file_size) + ", must be: " + std::to_string(expected_file_size),
|
||||
            ErrorCodes::CORRUPTED_DATA);

    auto res = std::make_shared<MarksInCompressedFile>(marks_count);

    if (!index_granularity_info.is_adaptive)
    {
        /// Read directly to marks.
        auto buffer = std::make_unique<ReadBufferFromFile>(mrk_path);
        if (buffer->seek(mark_file_offset, SEEK_SET) != mark_file_offset)
            throw Exception("Cannot seek to mark file " + mrk_path + " for stream " + stream_name, ErrorCodes::CANNOT_SEEK_THROUGH_FILE);

        if (buffer->eof() || buffer->buffer().size() != mark_file_size)
            throw Exception("Cannot read all marks from file " + mrk_path + ", eof: " + std::to_string(buffer->eof())
                + ", buffer size: " + std::to_string(buffer->buffer().size()) + ", file size: " + std::to_string(mark_file_size), ErrorCodes::CANNOT_READ_ALL_DATA);

        buffer->readStrict(reinterpret_cast<char *>(res->data()), mark_file_size);
    }
    else
    {
        auto buffer = std::make_unique<ReadBufferFromFile>(mrk_path);
        if (buffer->seek(mark_file_offset, SEEK_SET) != mark_file_offset)
            throw Exception("Cannot seek to mark file " + mrk_path + " for stream " + stream_name, ErrorCodes::CANNOT_SEEK_THROUGH_FILE);

        size_t i = 0;
        off_t limit_offset_in_file = mark_file_offset + mark_file_size;
        while (buffer->getPosition() < limit_offset_in_file)
        {
            res->read(*buffer, i, 1);
            buffer->seek(sizeof(size_t), SEEK_CUR);
            ++i;
        }

        if (i * mark_size != mark_file_size)
            throw Exception("Cannot read all marks from file " + mrk_path, ErrorCodes::CANNOT_READ_ALL_DATA);
    }
    res->protect();
    return res;
}

void SampleColumnMarksLoader::loadMarks()
{
    String mrk_name = index_granularity_info.getMarksFilePath(stream_name);
    marks = loadMarksImpl();

    if (!marks)
        throw Exception("Failed to load marks: " + mrk_name + " from path:" + mrk_path, ErrorCodes::LOGICAL_ERROR);
}

SampleColumnReaderStream::SampleColumnReaderStream(
    const String & path_prefix_, const String & stream_name_, const String & data_file_extension_,
    const SampleColumnIndexGranularityInfo * index_granularity_info_,
    size_t max_rows_to_read_)
    : path_prefix(path_prefix_)
    , max_rows_to_read(max_rows_to_read_)
    , marks_loader(
        path_prefix_
        , stream_name_
        , index_granularity_info_->getMarksCount(path_prefix_)
        , *index_granularity_info_
        , mark_file_offset
        , index_granularity_info_->getMarksTotalSizeInBytes(path_prefix_))
{
    std::string data_file_path = path_prefix_ + data_file_extension_;
    /// Initialize the objects that shall be used to perform read operations.
    auto buffer = std::make_unique<CompressedReadBufferFromFile>(
        std::make_unique<ReadBufferFromFile>(data_file_path),
        /* allow_different_codecs = */ true,
        data_file_offset,
        std::filesystem::file_size(data_file_path),
        /* is_limit = */ true);

    /* if (!settings.checksum_on_read) */
    buffer->disableChecksumming();

    non_cached_buffer = std::move(buffer);
    data_buffer = non_cached_buffer.get();
}

void SampleColumnReaderStream::seekToMark(size_t index)
{
    MarkInCompressedFile mark = marks_loader.getMark(index);

    try
    {
        non_cached_buffer->seek(mark.offset_in_compressed_file + data_file_offset, mark.offset_in_decompressed_block);
    }
    catch (Exception & e)
    {
        /// Better diagnostics.
        if (e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND)
            e.addMessage("(while seeking to mark " + toString(index)
                + " of column " + path_prefix + "; offsets are: "
                + toString(mark.offset_in_compressed_file + data_file_offset) + " "
                + toString(mark.offset_in_decompressed_block) + ")");

        throw;
    }
}

void SampleColumnReaderStream::seekToStart()
{
    try
    {
        non_cached_buffer->seek(data_file_offset, 0);
#ifdef ENABLE_QPL_COMPRESSION
        if (non_cached_async_buffer)
            non_cached_async_buffer->seek(data_file_offset, 0);
#endif
    }
    catch (Exception & e)
    {
        /// Better diagnostics.
        if (e.code() == ErrorCodes::ARGUMENT_OUT_OF_BOUND)
            e.addMessage("(while seeking to start of column " + path_prefix + ")");

        throw;
    }
}

SampleColumnReader::SampleColumnReader(
    std::string path_to_part_, size_t from_mark_, size_t max_rows_to_read_)
    : path_to_part(std::move(path_to_part_))
    , from_mark(from_mark_)
    , max_rows_to_read(max_rows_to_read_) {}

ReadBuffer * SampleColumnReader::getStream(
    [[maybe_unused]] bool stream_for_prefix,
    const ISerialization::SubstreamPath & substream_path,
    const NameAndTypePair & name_and_type,
    size_t from_mark_)
{
    String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path);

    auto it = streams.find(stream_name);
    if (it == streams.end())
        return nullptr;

    SampleColumnReaderStream & stream = *it->second;

    if (stream_for_prefix)
        stream.seekToStart();
    else
        stream.seekToMark(from_mark_);

    return stream.data_buffer;
}


ColumnPtr SampleColumnReader::readColumn(const NameAndTypePair & name_and_type)
{
    SampleColumnIndexGranularityInfo index_granularity_info(path_to_part);

    ISerialization::StreamCallback callback = [&](const ISerialization::SubstreamPath & substream_path) {
        String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path);

        if (streams.count(stream_name))
            return;
        /*
        auto check_validity
            = [&](String & stream_name_) -> bool { return data_part->getChecksums()->files.count(stream_name_ + DATA_FILE_EXTENSION); };

        // If data file is missing then we will not try to open it.
        // It is necessary since it allows to add new column to structure of the table without creating new files for old parts.
        //
        if ((!name_and_type.type->isKVMap() && !check_validity(stream_name))
            || (name_and_type.type->isKVMap() && !tryConvertToValidKVStreamName(stream_name, check_validity)))
            return;
        */
        std::string path_prefix = path_to_part + stream_name;
        streams.emplace(
            stream_name,
            std::make_unique<SampleColumnReaderStream>(
                path_prefix,
                stream_name,
                DATA_FILE_EXTENSION,
                &index_granularity_info,
                max_rows_to_read
            ));
    };

    auto serialization = name_and_type.type->getDefaultSerialization();
    serialization->enumerateStreams(callback);

    ColumnPtr column = name_and_type.type->createColumn();
    //double & avg_value_size_hint = avg_value_size_hints[name_and_type.name];
    ISerialization::DeserializeBinaryBulkSettings deserialize_settings;
    // deserialize_settings.avg_value_size_hint = avg_value_size_hint;

    const auto & name = name_and_type.name;

    if (deserialize_binary_bulk_state_map.count(name) == 0)
    {
        deserialize_settings.getter = [&](const ISerialization::SubstreamPath & substream_path)
        {
            return getStream(true, substream_path, name_and_type, from_mark);
        };
        serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name]);
    }

    deserialize_settings.getter = [&](const ISerialization::SubstreamPath & substream_path)
    {
        return getStream(false, substream_path, name_and_type, from_mark);
    };
    deserialize_settings.continuous_reading = 0;
    auto & deserialize_state = deserialize_binary_bulk_state_map[name];

    serialization->deserializeBinaryBulkWithMultipleStreams(column, max_rows_to_read, deserialize_settings, deserialize_state, nullptr);
    return column;
}

}
@ -0,0 +1,133 @@
#pragma once

#include <memory>
#include <string>

#include <Compression/CompressedReadBufferFromFile.h>
#include <Core/NamesAndTypes.h>
#include <DataStreams/MarkInCompressedFile.h>
#include <Storages/ColumnsDescription.h>
#include <Storages/MergeTree/MergeTreeDataPartType.h>

namespace DB
{

struct SampleColumnIndexGranularityInfo
{
public:
    /// Marks file extension '.mrk' or '.mrk2'
    String marks_file_extension;

    /// Is stride in rows between marks non fixed?
    bool is_adaptive = false;

    SampleColumnIndexGranularityInfo(const String & path_to_part);

    String getMarksFilePath(const String & path_prefix) const
    {
        return path_prefix + marks_file_extension;
    }

    size_t getMarkSizeInBytes() const;
    size_t getMarksCount(const String & path_prefix) const;
    size_t getMarksTotalSizeInBytes(const String & path_prefix) const;

private:
    MergeTreeDataPartType part_type;
    std::optional<std::string> getMarksExtensionFromFilesystem(const String & path_to_part);
    std::string getAdaptiveMrkExtension(MergeTreeDataPartType part_type);
};

class SampleColumnMarksLoader
{
public:
    using MarksPtr = std::shared_ptr<MarksInCompressedFile>;

    SampleColumnMarksLoader(
        const String & path_prefix_,
        const String & stream_name_,
        size_t marks_count_,
        const SampleColumnIndexGranularityInfo & index_granularity_info_,
        off_t mark_file_offset_,
        size_t mark_file_size_);

    const MarkInCompressedFile & getMark(size_t row_index);

    bool initialized() const { return marks != nullptr; }

private:
    String mrk_path;
    String stream_name; // for compacted map
    size_t marks_count;

    off_t mark_file_offset;
    size_t mark_file_size;

    SampleColumnIndexGranularityInfo index_granularity_info;

    MarksPtr marks;

    void loadMarks();
    MarksPtr loadMarksImpl();
};

class SampleColumnReaderStream
{
public:
    SampleColumnReaderStream(
        const String & path_prefix_, const String & stream_name_, const String & data_file_extension_,
        const SampleColumnIndexGranularityInfo * index_granularity_info_,
        size_t max_rows_to_read_);

    virtual ~SampleColumnReaderStream() = default;

    void seekToMark(size_t index);

    void seekToStart();

    ReadBuffer * data_buffer;

private:
    std::string path_prefix;
    off_t data_file_offset = 0;
    off_t mark_file_offset = 0;
    [[maybe_unused]] size_t max_rows_to_read;

    std::unique_ptr<CompressedReadBufferFromFile> non_cached_buffer;

    SampleColumnMarksLoader marks_loader;
};

using SampleFileStreams = std::map<std::string, std::unique_ptr<SampleColumnReaderStream>>;
using DeserializeBinaryBulkStateMap = std::map<std::string, ISerialization::DeserializeBinaryBulkStatePtr>;

class SampleColumnReader
{
private:
    const std::string path_to_part;
    size_t from_mark;
    size_t max_rows_to_read;

    SampleFileStreams streams;

    /// Stores states for IDataType::deserializeBinaryBulk
    DeserializeBinaryBulkStateMap deserialize_binary_bulk_state_map;

public:
    SampleColumnReader(
        std::string path_to_part_,
        size_t from_mark_,
        size_t max_rows_to_read_);

    virtual ~SampleColumnReader() = default;

    ReadBuffer * getStream(
        bool stream_for_prefix,
        const ISerialization::SubstreamPath & substream_path,
        const NameAndTypePair & name_and_type,
        size_t from_mark);

    ColumnPtr readColumn(const NameAndTypePair & name_and_type);
};

}
@ -0,0 +1,243 @@
#include "Advisor/Advisor.h"
#include "CodecAdvisor.h"
#include "ColumnUsageExtractor.h"
#include "IndexAdvisor.h"
#include "MockEnvironment.h"
#include "SchemaAdvisorHelpers.h"
#include "TypeAdvisor.h"
#include "PrewhereAdvisor.h"

#include <iostream>
#include <optional>
#include <string>
#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>

#include <Advisor/AdvisorContext.h>
#include <Core/Defines.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <Interpreters/InterpreterCreateQuery.h>
#include <Poco/Util/XMLConfiguration.h>
#include <common/types.h>
#include <Common/Exception.h>
#include <Common/TerminalSize.h>
#include <Common/escapeForFileName.h>
#include <Common/formatReadable.h>


namespace DB
{
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
}

static constexpr size_t DEFAULT_SAMPLE_ROW_NUMBER = 1000000;
static constexpr size_t DEFAULT_MAX_THREADS = 8;
static constexpr Float64 MARK_FILTER_THRESHOLD = 0.35;
static constexpr Float64 TOP_3_MARK_FILTER_THRESHOLD = 0.65;

}

int mainEntryClickHouseSchemaAdvisor(int argc, char ** argv)
{
    using namespace DB;
    namespace po = boost::program_options;

    po::options_description desc = createOptionsDescription("Allowed options", getTerminalWidth());
    desc.add_options()("help,h", "produce help message")
        /// mandatory
        ("db", po::value<std::string>()->value_name("DATABASE"), "db name")(
            "table", po::value<std::string>()->value_name("TABLE"), "table name")(
            "mode", po::value<std::string>()->value_name("MODE"),
            "mode: type, codec, type-codec, skip-index, projection, materialized-view, order-by-key, sharding-key")(
            "path", po::value<std::string>()->value_name("PATH"), "main path")(
            "meta-path", po::value<std::string>()->value_name("META PATH"), "meta path")(
            "data-path-list", po::value<std::string>()->value_name("DATA PATH LIST"), "data path list, format: path1,path2")(
            "settings", po::value<std::string>()->default_value(""), "set settings, format: key1=value,key2=value2")(
            "log-level", po::value<std::string>()->default_value(""),
            "log level: trace, debug, information, notice, warning, error. Disabled if empty.")
        /// optional
        ("part", po::value<std::string>()->value_name("PART"), "sample part name")(
            "max-threads", po::value<size_t>()->default_value(DEFAULT_MAX_THREADS), "max threads for schema advisor")(
            "sample-number", po::value<size_t>()->default_value(DEFAULT_SAMPLE_ROW_NUMBER), "sample row number")(
            "verbose", "print column compression gain ratio")
        /// codec
        ("fsst", "codec mode: use FSST instead of LZ4")("zstd", "codec mode: use ZSTD instead of LZ4")(
            "level", po::value<int>(), "codec mode: compression level for codecs specified via flags")(
            "codec", po::value<std::vector<std::string>>()->multitoken(), "codec mode: use codecs combination instead of LZ4")(
            "hc", "codec mode: use LZ4HC instead of LZ4")("none", "codec mode: use no compression instead of LZ4")(
            "block-size,b",
            po::value<unsigned>()->default_value(DBMS_DEFAULT_BUFFER_SIZE),
            "codec mode: compress in blocks of specified size")
        /// skip-index
        ("query-file", po::value<std::string>()->value_name("QUERIES"), "absolute path to the query file, queries separated by newline")(
            "query-file-delimiter", po::value<std::string>()->value_name("DELIMITER"), "query file delimiter, default is new line.")
        /// tos
        ("tos-ak", po::value<std::string>()->value_name("TOS AK"), "tos access key")(
            "vetos-endpoint", po::value<std::string>()->value_name("VETOS ENDPOINT"), "ve tos endpoint")(
            "vetos-region", po::value<std::string>()->value_name("VETOS REGION"), "ve tos region")(
            "vetos-ak", po::value<std::string>()->value_name("VETOS AK"), "ve tos access key")(
            "vetos-sk", po::value<std::string>()->value_name("VETOS SK"), "ve tos secret key")
        /// prewhere
        ("mark_filter_threshold", po::value<Float64>()->default_value(MARK_FILTER_THRESHOLD), "threshold for mark filter ratio")(
            "top_3_mark_filter_threshold", po::value<Float64>()->default_value(TOP_3_MARK_FILTER_THRESHOLD), "threshold for top-3 mark filter ratio")(
            "low-cardinality", "recommend low-cardinality only in type advisor")(
            "scanned_count_threshold_for_lc", po::value<Float64>()->default_value(0.035), "recommend low-cardinality only if scanned count > scanned_count_threshold_for_lc")(
            "cardinality_ratio_threshold_for_lc", po::value<Float64>()->default_value(0.05), "recommend low-cardinality only if cardinality < sample_row_number * cardinality_ratio_threshold_for_lc");

    WriteBufferFromOwnString str_buf;
    std::unique_ptr<WriteBufferFromFileBase> stdout_buf = std::make_unique<WriteBufferFromFileDescriptor>(STDOUT_FILENO);
    bool verbose = false;

    try
    {
        po::variables_map options;
        po::store(po::command_line_parser(argc, argv).options(desc).run(), options);

        if (options.count("help"))
        {
            std::cout << "Usage: " << argv[0] << " [options] < INPUT > OUTPUT" << std::endl;
            std::cout << "Usage: " << argv[0] << " [options] INPUT OUTPUT" << std::endl;
            std::cout << desc << std::endl;
            return 0;
        }

        if (!options.count("db") || !options.count("mode"))
            throw Exception("Missing option, 'db' or 'mode' is missing", ErrorCodes::BAD_ARGUMENTS);

        // if (options.count("path") == options.count("meta-path") || options.count("meta-path") != options.count("data-path-list"))
        //     throw Exception("Missing option, either single 'path' argument or both meta path and data path list arguments are allowed", ErrorCodes::BAD_ARGUMENTS);

        std::string db_name = options["db"].as<std::string>();
        std::string advisor_mode = options["mode"].as<std::string>();

        std::string meta_path;
        std::vector<std::string> data_path_list;
        // if (options.count("path"))
        // {
        //     std::string path = options["path"].as<std::string>();
        //     if (!endsWith(path, "/"))
        //         path.append("/");
        //     meta_path = path;
        //     data_path_list.emplace_back(path);
        // }
        // else
        // {
        meta_path = options["meta-path"].as<std::string>();
        if (!endsWith(meta_path, "/"))
            meta_path.append("/");
        boost::split(data_path_list, options["data-path-list"].as<std::string>(), boost::is_any_of(" ,"));
        for (auto & path : data_path_list)
        {
            if (!endsWith(path, "/"))
                path = path.append("/");
        }
        // }

        size_t sample_row_number = options["sample-number"].as<size_t>();
        size_t max_threads = options["max-threads"].as<size_t>();
        Float64 mark_filter_threshold = options["mark_filter_threshold"].as<Float64>();
        Float64 top_3_mark_filter_threshold = options["top_3_mark_filter_threshold"].as<Float64>();
        verbose = options.count("verbose");

        if (auto log_level = options["log-level"].as<std::string>(); !log_level.empty())
            setupLogging(log_level);

        // prepare mock env
        MockEnvironment env(meta_path, max_threads);

        if (advisor_mode == "codec")
        {
            if (!options.count("table"))
                throw Exception("Missing option, 'table' is missing", ErrorCodes::BAD_ARGUMENTS);

            std::string table_name = options["table"].as<std::string>();
            std::string absolute_part_path = selectPartPath(options, data_path_list, db_name, table_name, sample_row_number);
            serializeJsonPrefix(str_buf, db_name, table_name, absolute_part_path, verbose);
            ColumnsDescription columns = env.getColumnsDescription(db_name, table_name);

            CodecAdvisor codec_advisor(options, columns, absolute_part_path, sample_row_number, max_threads);
            codec_advisor.execute();
            codec_advisor.serializeJson(str_buf, verbose);
            serializeJsonSuffix(str_buf);
        }
        else if (advisor_mode == "type")
        {
            if (!options.count("table"))
                throw Exception("Missing option, 'table' is missing", ErrorCodes::BAD_ARGUMENTS);

            auto lc_only = options.count("low-cardinality");
            auto scanned_count_threshold_for_lc = options["scanned_count_threshold_for_lc"].as<Float64>();
            auto cardinality_ratio_threshold_for_lc = options["cardinality_ratio_threshold_for_lc"].as<Float64>();

            std::string table_name = options["table"].as<std::string>();
            std::string absolute_part_path = selectPartPath(options, data_path_list, db_name, table_name, sample_row_number);
            serializeJsonPrefix(str_buf, db_name, table_name, absolute_part_path, verbose);
            ColumnsDescription columns = env.getColumnsDescription(db_name, table_name);

            TypeAdvisor type_advisor(env, options, columns, absolute_part_path, sample_row_number, max_threads, lc_only, scanned_count_threshold_for_lc, cardinality_ratio_threshold_for_lc);
            type_advisor.execute();
            type_advisor.serializeJson(str_buf, verbose);
            serializeJsonSuffix(str_buf);
        }
        else if (advisor_mode == "skip-index") // currently extracts all usages for the database
        {
            serializeJsonPrefixWithDB(str_buf, db_name);
            IndexAdvisor index_advisor(env, options, sample_row_number, max_threads);
            index_advisor.execute();
            index_advisor.serializeJson(str_buf, verbose);
            serializeJsonSuffix(str_buf);
        }
        else if (advisor_mode == "prewhere") // currently extracts all usages for the database
        {
            serializeJsonPrefixWithDB(str_buf, db_name);
            PrewhereAdvisor prewhere_advisor(env, options, sample_row_number, max_threads, mark_filter_threshold, top_3_mark_filter_threshold);
            prewhere_advisor.execute();
            prewhere_advisor.serializeJson(str_buf, verbose);
            serializeJsonSuffix(str_buf);
        }
        else if (advisor_mode == "materialized-view")
        {
            Advisor advisor{ASTAdviseQuery::AdvisorType::MATERIALIZED_VIEW};
            WorkloadAdvises advises = advisor.analyze(loadQueries(options), createContext(options, env));
            serializeJson("materialized-view", "ddl", db_name, advises, str_buf, verbose);
        }
        else if (advisor_mode == "projection")
        {
            Advisor advisor{ASTAdviseQuery::AdvisorType::PROJECTION};
            WorkloadAdvises advises = advisor.analyze(loadQueries(options), createContext(options, env));
            serializeJson("projection", "ddl", db_name, advises, str_buf, verbose);
        }
        else if (advisor_mode == "order-by-key")
        {
            Advisor advisor{ASTAdviseQuery::AdvisorType::ORDER_BY};
            WorkloadAdvises advises = advisor.analyze(loadQueries(options), createContext(options, env));
            serializeJson(advisor_mode, "candidate", db_name, advises, str_buf, verbose);
        }
        else if (advisor_mode == "cluster-key")
        {
            Advisor advisor{ASTAdviseQuery::AdvisorType::CLUSTER_BY};
            WorkloadAdvises advises = advisor.analyze(loadQueries(options), createContext(options, env));
            serializeJson(advisor_mode, "candidate", db_name, advises, str_buf, verbose);
        }
        else if (advisor_mode == "column-usage")
        {
            Advisor advisor{ASTAdviseQuery::AdvisorType::COLUMN_USAGE};
            WorkloadAdvises advises = advisor.analyze(loadQueries(options), createContext(options, env));
            serializeJson(advisor_mode, "usage", db_name, advises, str_buf, verbose);
        }
        else
        {
            throw Exception("Unsupported advisor mode: " + advisor_mode, ErrorCodes::BAD_ARGUMENTS);
        }
    }
    catch (...)
    {
        serializeException(*stdout_buf, getCurrentExceptionMessage(verbose));
        return getCurrentExceptionCode();
    }
    writeString(str_buf.str(), *stdout_buf);

    return 0;
}
@ -0,0 +1,418 @@
#pragma once

#include <algorithm>
#include <fstream>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <vector>
#include <Advisor/Rules/WorkloadAdvisor.h>
#include <IO/ConnectionTimeouts.h>
#include <IO/HTTPCommon.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context_fwd.h>
#include <Optimizer/Dump/PlanReproducer.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ParserCreateQuery.h>
#include <Parsers/parseQuery.h>
#include <boost/program_options/variables_map.hpp>
#include <Poco/DirectoryIterator.h>
#include <Poco/FormattingChannel.h>
#include <Poco/JSON/Array.h>
#include <Poco/JSON/Object.h>
#include <Poco/JSON/Stringifier.h>
#include <Poco/PatternFormatter.h>
#include <Poco/StreamCopier.h>
#include <Poco/String.h>
#include <Common/Logger.h>
#include <Common/escapeForFileName.h>
#include <Common/formatIPv6.h>
#include "MockEnvironment.h"

namespace DB
{
namespace ErrorCodes
{
    extern const int NOT_FOUND_EXPECTED_DATA_PART;
    extern const int NO_FILE_IN_DATA_PART;
}

namespace po = boost::program_options;

[[maybe_unused]] static void setupLogging(const std::string & log_level)
{
    Poco::AutoPtr<Poco::ConsoleChannel> channel(new Poco::ConsoleChannel);
    Poco::AutoPtr<Poco::PatternFormatter> formatter(new Poco::PatternFormatter);
    formatter->setProperty("pattern", "%L%Y-%m-%d %H:%M:%S.%i <%p> %s: %t");
    Poco::AutoPtr<Poco::FormattingChannel> formatting_channel(new Poco::FormattingChannel(formatter, channel));
    Poco::Logger::root().setChannel(formatting_channel);
    Poco::Logger::root().setLevel(log_level);
}

namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int NOT_FOUND_EXPECTED_DATA_PART;
    extern const int NETWORK_ERROR;
}

namespace po = boost::program_options;

static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768;
static constexpr size_t DEFAULT_MAX_SAMPLE_CANDIDATE_NUM = 20;
static constexpr auto DEFAULT_TOS_PSM = "toutiao.tos.tosapi";

struct SamplingColumnFile
{
    SamplingColumnFile(std::string file_path_, std::string column_name_)
        : file_path(std::move(file_path_)), column_name(std::move(column_name_))
    {
    }

    std::string file_path;
    std::string column_name;
    size_t origin_file_size = 0;
    size_t optimized_file_size = 0;
};

using SamplingColumnFilePtr = std::shared_ptr<SamplingColumnFile>;
using SamplingColumnFiles = std::vector<SamplingColumnFilePtr>;

// a thread-safe implementation
class MessageCollector
{
public:
    void collect(std::string && msg)
    {
        std::lock_guard lock(mutex);
        messages.emplace_back(std::move(msg));
    }

    void logCollectedError()
    {
        for (const auto & msg : messages)
            LOG_ERROR(getLogger("MessageCollector"), "{}", msg);
        messages.clear();
    }

private:
    std::vector<std::string> messages;
    bthread::Mutex mutex;
};

static std::string readSqlFile(String source_uri, [[maybe_unused]] const po::variables_map & options)
{
    // std::string uri_prefix = source_uri.substr(0, source_uri.find_last_of('/'));
    // Poco::URI uri(uri_prefix);
    // const String& scheme = uri.getScheme();

    // if (scheme == "tos") // tos on cloud, url like "tos://bucket/key"
    // {
    //     if (!options.count("tos-ak"))
    //         throw Exception("Option tos-ak is missing for tos uri", ErrorCodes::BAD_ARGUMENTS);
    //     std::string tos_ak = options["tos-ak"].as<std::string>();

    //     Poco::URI tos_uri(source_uri);
    //     auto host = tos_uri.getHost();
    //     auto port = tos_uri.getPort();
    //     std::string tos_psm = DEFAULT_TOS_PSM;
    //     std::string tos_server;

    //     if (host.empty() || port == 0)
    //     {
    //         auto tos_servers = ServiceDiscovery::lookup(DEFAULT_TOS_PSM, std::pair<std::string, std::string>("cluster", "default"));
    //         if (tos_servers.empty())
    //             throw Exception("Can not find tos servers with PSM: " + tos_psm, ErrorCodes::NETWORK_ERROR);
    //         auto generator = std::mt19937(std::random_device{}()); // mt19937 engine
    //         std::uniform_int_distribution<int> distribution(0, tos_servers.size() - 1);
    //         tos_server = tos_servers.at(distribution(generator));
    //     }
    //     else
    //     {
    //         tos_server = normalizeHost(host) + ":" + toString(port);
    //     }

    //     ConnectionTimeouts timeouts(
    //         {DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, 0},
    //         {DEFAULT_HTTP_READ_BUFFER_TIMEOUT, 0},
    //         {DEFAULT_HTTP_READ_BUFFER_TIMEOUT, 0});

    //     std::string tos_http_uri_str = fmt::format(
    //         "http://{}{}?timeout={}s", tos_server, tos_uri.getPath(), DBMS_DEFAULT_CONNECT_TIMEOUT_SEC);
    //     Poco::URI tos_http_uri = Poco::URI(tos_http_uri_str);
    //     HTTPSessionPtr session = makeHTTPSession(tos_http_uri, timeouts);

    //     Poco::Net::HTTPRequest request{Poco::Net::HTTPRequest::HTTP_GET, tos_http_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1};
    //     request.set("X-Tos-Access", tos_ak);
    //     request.setHost(tos_http_uri.getHost());
    //     request.setChunkedTransferEncoding(false);

    //     session->sendRequest(request);
    //     Poco::Net::HTTPResponse response;
    //     std::istream * response_body = receiveResponse(*session, request, response, false);
    //     Poco::StreamCopier::copyToString(*response_body, res);
    // }
    // #if USE_VE_TOS
    // else if (scheme == "vetos") // tos on volcano engine, url like "vetos://bucket/key"
    // {
    //     Poco::URI vetos_uri(source_uri);
    //     vetos_uri.getPath();
    //     if (vetos_uri.getPath().empty() || vetos_uri.getHost().empty())
    //     {
    //         throw Exception("Invalid ve-tos path.", ErrorCodes::LOGICAL_ERROR);
    //     }
    //     const String & bucket = vetos_uri.getHost();
    //     size_t size = vetos_uri.getPath().size();
    //     String key = vetos_uri.getPath().substr(1, size - 1);
    //     if (!options.count("vetos-endpoint"))
    //         throw Exception("Option vetos-endpoint is missing for ve tos uri", ErrorCodes::BAD_ARGUMENTS);
    //     if (!options.count("vetos-region"))
    //         throw Exception("Option vetos-region is missing for ve tos uri", ErrorCodes::BAD_ARGUMENTS);
    //     if (!options.count("vetos-ak"))
    //         throw Exception("Option vetos-ak is missing for ve tos uri", ErrorCodes::BAD_ARGUMENTS);
    //     if (!options.count("vetos-sk"))
    //         throw Exception("Option vetos-sk is missing for ve tos uri", ErrorCodes::BAD_ARGUMENTS);
    //     std::string ve_tos_endpoint = options["vetos-endpoint"].as<std::string>();
    //     std::string ve_tos_region = options["vetos-region"].as<std::string>();
    //     std::string ve_tos_ak = options["vetos-ak"].as<std::string>();
    //     std::string ve_tos_sk = options["vetos-sk"].as<std::string>();

    //     std::unique_ptr<ReadBuffer> read_buf =
    //         std::make_unique<ReadBufferFromVETos>(ve_tos_endpoint, ve_tos_region, ve_tos_ak, ve_tos_sk, bucket, key);

    //     readStringUntilEOF(res, *read_buf);
    // }
    // #endif // USE_VE_TOS
    // else // absolute file path on local file system
    // {

    std::string res;

    std::ifstream fin(source_uri);
    std::stringstream buffer;
    buffer << fin.rdbuf();
    res = buffer.str();
    // }

    return res;
}


/// Select the target part according to specific rule if 'part' option is not specified
[[maybe_unused]] static std::string selectPartPath(const po::variables_map & options, const std::vector<std::string> & data_path_list, const std::string & db_name, const std::string & table_name, size_t sample_row_number)
{
    for (const auto & path : data_path_list)
    {
        if (options.count("part"))
        {
            std::string part = options["part"].as<std::string>();
            if (endsWith(part, "/"))
                part.pop_back();
            return path + "metadata/" + escapeForFileName(db_name) + "/" + escapeForFileName(table_name) + "/" + part;
        }

        std::string table_data_path = path + "data/" + escapeForFileName(db_name) + "/" + escapeForFileName(table_name) + "/";
        if (!std::filesystem::exists(table_data_path))
            continue;

        std::multimap<std::time_t, std::string> parts_by_timestamp;
        Poco::DirectoryIterator end;
        for (Poco::DirectoryIterator it(table_data_path); it != end; ++it)
        {
            if (it->isDirectory()
                && it.name() != "detached"
                && it.name() != "log"
                && it.name() != "catalog.db"
                && it.name() != "manifest"
                && !startsWith(it.name(), "tmp-fetch")
                && !startsWith(it.name(), "tmp_")
                && !startsWith(it.name(), "delete_tmp"))
            {
                size_t part_row_count;
                std::string part_count_path = it->path() + "/count.txt";
                {
                    ReadBufferFromFile in(part_count_path, METADATA_FILE_BUFFER_SIZE);
                    readIntText(part_row_count, in);
                    assertEOF(in);
                }
                if (part_row_count >= sample_row_number)
                {
                    parts_by_timestamp.emplace(it->getLastModified().epochTime(), it->path());
                    if (parts_by_timestamp.size() > DEFAULT_MAX_SAMPLE_CANDIDATE_NUM)
                        break;
                }
            }
        }
        if (!parts_by_timestamp.empty())
            return parts_by_timestamp.begin()->second;
    }

    throw Exception(db_name + "(" + table_name + "): failed to find qualified sample part.", ErrorCodes::NOT_FOUND_EXPECTED_DATA_PART);
}

/// Generate output prefix in JSON format
[[maybe_unused]] static void serializeJsonPrefix(WriteBuffer & buf, std::string db_name, std::string table_name, std::string absolute_part_path, bool verbose)
{
    writeString(R"({"recommendation":{"db":")", buf);
    writeString(db_name, buf);
    writeString("\",", buf);
    writeString(R"("table":")", buf);
    writeString(table_name, buf);
    writeString("\",", buf);
    if (verbose)
    {
        writeString(R"("part selected":")", buf);
        writeString(absolute_part_path, buf);
        writeString("\",", buf);
    }
}

[[maybe_unused]] static void serializeJsonPrefixWithDB(WriteBuffer & buf, std::string db_name)
{
    writeString(R"({"recommendation":{"db":")", buf);
    writeString(db_name, buf);
    writeString("\",", buf);
}

/// Generate output suffix in JSON format
[[maybe_unused]] static void serializeJsonSuffix(WriteBuffer & buf)
{
    writeString("}}", buf);
}

/// Generate exception in JSON format
[[maybe_unused]] static void serializeException(WriteBuffer & buf, std::string error_msg)
{
    writeString(R"({"exception":")", buf);
    writeString(error_msg, buf);
    writeString("\"}", buf);
}

[[maybe_unused]] static std::vector<String> loadQueries(po::variables_map & options)
{
    std::string query_file = options["query-file"].as<std::string>();

    std::vector<std::string> splits;
    if (Poco::toLower(query_file).ends_with(".json"))
    {
        PlanReproducer reproducer{query_file, nullptr};
        for (const auto & name : reproducer.getQueries()->getNames())
            splits.emplace_back(reproducer.getQuery(name).query);
        return splits;
    }

    std::string query_content = readSqlFile(query_file, options);
    std::string delimiter = "\n";
    if (options.count("query-file-delimiter"))
        delimiter = options["query-file-delimiter"].as<std::string>();

    size_t last = 0;
    size_t next;
    while ((next = query_content.find(delimiter, last)) != std::string::npos)
    {
        auto query = query_content.substr(last, next - last);
        boost::replace_all(query, "\\r", "\r");
        boost::replace_all(query, "\\n", "\n");
        boost::replace_all(query, "\\t", "\t");
        boost::replace_all(query, "\\\"", "\"");
        boost::replace_all(query, "\\'", "'");
        splits.push_back(query);
        last = next + 1;
    }
    if (splits.empty())
        throw Poco::Exception("'" + query_file + "' is empty?");
    return splits;
}

[[maybe_unused]] static ContextMutablePtr createContext(po::variables_map & options, MockEnvironment & env)
{
    if (options["db"].empty())
        throw Exception("argument db is required", ErrorCodes::BAD_ARGUMENTS);

    std::string db_name = options["db"].as<std::string>();
    std::vector<std::string> db_list;
    boost::algorithm::split(db_list, db_name, boost::is_any_of(","), boost::token_compress_on);

    if (db_list.empty())
        throw Exception("argument db is required", ErrorCodes::BAD_ARGUMENTS);

    for (const auto & db : db_list)
    {
        env.createMockDatabase(db);
        // todo: currently we create all tables in the db
        for (const auto & table : env.listTables(db))
            env.createMockTable(db, table);
    }

    auto context = env.createQueryContext();
    context->setCurrentDatabase(db_list[0]);

    std::string settings = options["settings"].as<std::string>();
    if (!settings.empty())
    {
        ParserSetQuery parser{true};
        ASTPtr ast = parseQuery(parser, settings, 0, 0);
        context->applySettingsChanges(ast->as<ASTSetQuery>()->changes);
    }

    return context;
}

[[maybe_unused]] static void serializeJson(const std::string & advise_type, const String & advise_name, const String & db, const WorkloadAdvises & advises, WriteBuffer & buf, bool)
{
    Poco::JSON::Array advises_array;
    for (const auto & advise : advises)
    {
        Poco::JSON::Object advise_object;
        advise_object.set("db", advise->getTable().database);
        advise_object.set("table", advise->getTable().table);
        if (advise->getColumnName().has_value())
        {
            advise_object.set("column", advise->getColumnName().value());
        }

        if (!advise->getCandidates().empty())
        {
            Poco::JSON::Array candidates;
            for (const auto & item : advise->getCandidates())
            {
                Poco::JSON::Object candidate_object;
                candidate_object.set(advise_name, item.first);
                candidate_object.set("benefit", item.second);
                candidates.add(candidate_object);
            }
            advise_object.set("candidates", candidates);
        }
        else
        {
            advise_object.set(advise_name, advise->getOptimizedValue());
            advise_object.set("benefit", advise->getBenefit());
        }

        if (!advise->getRelatedQueries().empty())
        {
            Poco::JSON::Array related_queries;
            for (const auto & query : advise->getRelatedQueries())
                related_queries.add(query);
            advise_object.set("relatedQueries", related_queries);
        }

        advises_array.add(advise_object);
    }

    Poco::JSON::Object advises_object;
    advises_object.set(advise_type, advises_array);

    Poco::JSON::Object recommendation_object;
    recommendation_object.set("db", db);
    recommendation_object.set(advise_type, advises_object);

    Poco::JSON::Object res;
    res.set("recommendation", recommendation_object);
    std::ostringstream oss;
    Poco::JSON::Stringifier::condense(res, oss);
    writeString(oss.str(), buf);
}
}
@ -0,0 +1,453 @@
|
|||
#include "Statistics.h"
|
||||
|
||||
#include <AggregateFunctions/AggregateFunctionNull.h>
|
||||
#include <AggregateFunctions/AggregateFunctionUniq.h>
|
||||
#include <AggregateFunctions/AggregateFunctionCombinatorFactory.h>
|
||||
#include <AggregateFunctions/FactoryHelpers.h>
|
||||
#include <AggregateFunctions/Helpers.h>
|
||||
#include <AggregateFunctions/HelpersMinMaxAny.h>
|
||||
#include <AggregateFunctions/IAggregateFunction.h>
|
||||
#include <AggregateFunctions/AggregateFunctionCountByGranularity.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeDate.h>
|
||||
#include <DataTypes/DataTypeDate32.h>
|
||||
#include <DataTypes/DataTypeDateTime.h>
|
||||
#include <DataTypes/DataTypeUUID.h>
|
||||
#include <DataTypes/DataTypeLowCardinality.h>
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <Core/Field.h>
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
#include "Columns/IColumn.h"
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int CANNOT_ALLOCATE_MEMORY;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
}
|
||||
|
||||
template <bool is_exact, template <typename, bool> typename Data, template <bool, bool, bool> typename DataForVariadic, bool is_able_to_parallelize_merge>
|
||||
AggregateFunctionPtr
|
||||
createAggregateFunctionUniq(const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
|
||||
{
|
||||
assertNoParameters(name, params);
|
||||
|
||||
if (argument_types.empty())
|
||||
throw Exception("Incorrect number of arguments for aggregate function " + name,
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
/// We use exact hash function if the user wants it;
|
||||
/// or if the arguments are not contiguous in memory, because only exact hash function have support for this case.
|
||||
/// bool use_exact_hash_function = is_exact || !isAllArgumentsContiguousInMemory(argument_types);
|
||||
|
||||
if (argument_types.size() == 1)
|
||||
{
|
||||
const IDataType & argument_type = *argument_types[0];
|
||||
|
||||
AggregateFunctionPtr res(createWithNumericType<AggregateFunctionUniq, Data, is_able_to_parallelize_merge>(*argument_types[0], argument_types));
|
||||
|
||||
WhichDataType which(argument_type);
|
||||
if (res)
|
||||
return res;
|
||||
else if (which.isDate())
|
||||
return std::make_shared<AggregateFunctionUniq<DataTypeDate::FieldType, Data<DataTypeDate::FieldType, is_able_to_parallelize_merge>>>(argument_types);
|
||||
else if (which.isDate32())
|
||||
return std::make_shared<AggregateFunctionUniq<DataTypeDate32::FieldType, Data<DataTypeDate32::FieldType, is_able_to_parallelize_merge>>>(argument_types);
|
||||
else if (which.isDateTime())
|
||||
return std::make_shared<AggregateFunctionUniq<DataTypeDateTime::FieldType, Data<DataTypeDateTime::FieldType, is_able_to_parallelize_merge>>>(argument_types);
|
||||
else if (which.isStringOrFixedString())
|
||||
return std::make_shared<AggregateFunctionUniq<String, Data<String, is_able_to_parallelize_merge>>>(argument_types);
|
||||
else if (which.isUUID())
|
||||
return std::make_shared<AggregateFunctionUniq<DataTypeUUID::FieldType, Data<DataTypeUUID::FieldType, is_able_to_parallelize_merge>>>(argument_types);
|
||||
else if (which.isTuple())
|
||||
{
|
||||
/*
|
||||
if (use_exact_hash_function)
|
||||
return std::make_shared<AggregateFunctionUniqVariadic<DataForVariadic<true, true, is_able_to_parallelize_merge>>>(argument_types);
|
||||
else
|
||||
return std::make_shared<AggregateFunctionUniqVariadic<DataForVariadic<false, true, is_able_to_parallelize_merge>>>(argument_types);
|
||||
*/
|
||||
throw Exception("Unsupported tuple data type for uniqExtract", ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
}
|
||||
|
||||
/* "Variadic" method also works as a fallback generic case for single argument.
|
||||
if (use_exact_hash_function)
|
||||
return std::make_shared<AggregateFunctionUniqVariadic<DataForVariadic<true, false, is_able_to_parallelize_merge>>>(argument_types);
|
||||
else
|
||||
return std::make_shared<AggregateFunctionUniqVariadic<DataForVariadic<false, false, is_able_to_parallelize_merge>>>(argument_types);
|
||||
*/
|
||||
throw Exception("Unsupported arguments size " + std::to_string(argument_types.size()), ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
|
||||
DataTypes transformArguments(const DataTypes & arguments)
|
||||
{
|
||||
size_t size = arguments.size();
|
||||
DataTypes res(size);
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
res[i] = removeNullable(arguments[i]);
|
||||
return res;
|
||||
}
|
||||
|
||||
Field UniExtract::executeOnColumn(const ColumnPtr & column, const DataTypePtr & type)
|
||||
{
|
||||
String name = "uniqExact";
|
||||
DataTypes argument_types(1);
|
||||
argument_types[0] = type;
|
||||
Array parameters;
|
||||
|
||||
AggregateFunctionPtr nested_function =
|
||||
createAggregateFunctionUniq<true, AggregateFunctionUniqExactData, AggregateFunctionUniqExactDataForVariadic, false /* is_able_to_parallelize_merge */>
|
||||
(name, transformArguments(argument_types), parameters, nullptr);
|
||||
AggregateFunctionPtr aggregate_function = type->isNullable()
|
||||
? std::make_shared<AggregateFunctionNullUnary<false, true>>(nested_function, argument_types, parameters)
|
||||
: nested_function;
|
||||
|
||||
size_t total_size_of_aggregate_states = 0; /// The total size of the row from the aggregate functions.
|
||||
// add info to track alignment requirement
|
||||
// If there are states whose alignment are v1, ..vn, align_aggregate_states will be max(v1, ... vn)
|
||||
size_t align_aggregate_states = 1;
|
||||
total_size_of_aggregate_states = aggregate_function->sizeOfData();
|
||||
align_aggregate_states = std::max(align_aggregate_states, aggregate_function->alignOfData());
|
||||
|
||||
std::shared_ptr<Arena> aggregates_pool = std::make_shared<Arena>(); /// The pool that is currently used for allocation.
|
||||
AggregateDataPtr place = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
|
||||
try
|
||||
{
|
||||
/** An exception may occur if there is a shortage of memory.
|
||||
* In order that then everything is properly destroyed, we "roll back" some of the created states.
|
||||
* The code is not very convenient.
|
||||
*/
|
||||
aggregate_function->create(place);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
aggregate_function->destroy(place);
|
||||
throw Exception("Cannot allocate memory", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||
}
|
||||
size_t rows = column->size();
|
||||
ColumnRawPtrs column_ptrs;
|
||||
column_ptrs.emplace_back(column.get());
|
||||
const IColumn ** batch_arguments = column_ptrs.data();
|
||||
|
||||
aggregate_function->addBatchSinglePlace(rows, place, batch_arguments, nullptr);
|
||||
|
||||
DataTypePtr result_type = std::make_shared<DataTypeUInt64>();
|
||||
ColumnPtr result_column = result_type->createColumn();
|
||||
MutableColumnPtr mutable_column = result_column->assumeMutable();
|
||||
aggregate_function->insertResultInto(place, *mutable_column, nullptr);
|
||||
return (*result_column)[0];
|
||||
}
|
||||
|
||||
Field UniExtract::executeOnColumnArray(const ColumnPtr & column, const DataTypePtr & type)
|
||||
{
|
||||
if (!isArray(type))
|
||||
return 0;
|
||||
|
||||
const auto * array_type = checkAndGetDataType<DataTypeArray>(type.get());
|
||||
const auto& nested_type = array_type->getNestedType();
|
||||
|
||||
String inner_func_name = "uniqExact";
|
||||
String combinator_suffix = "Array";
|
||||
|
||||
DataTypes nested_argument_types{nested_type};
|
||||
DataTypes argument_types{type};
|
||||
Array parameters;
|
||||
|
||||
// For inner func uniqExact
|
||||
AggregateFunctionPtr nested_function =
|
||||
createAggregateFunctionUniq<true, AggregateFunctionUniqExactData, AggregateFunctionUniqExactDataForVariadic, false /* is_able_to_parallelize_merge */>
|
||||
(inner_func_name, transformArguments(nested_argument_types), parameters, nullptr);
|
||||
AggregateFunctionPtr uniq_exact_function = type->isNullable()
|
||||
? std::make_shared<AggregateFunctionNullUnary<false, true>>(nested_function, nested_argument_types, parameters)
|
||||
: nested_function;
|
||||
|
||||
// For combinator -Array
|
||||
AggregateFunctionCombinatorPtr array_combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix(combinator_suffix);
|
||||
AggregateFunctionPtr uniq_exact_array_function = array_combinator->transformAggregateFunction(
|
||||
uniq_exact_function, {}, argument_types, parameters);
|
||||
|
||||
size_t total_size_of_aggregate_states = 0;
|
||||
size_t align_aggregate_states = 1;
|
||||
total_size_of_aggregate_states = uniq_exact_array_function->sizeOfData();
|
||||
align_aggregate_states = std::max(align_aggregate_states, uniq_exact_array_function->alignOfData());
|
||||
|
||||
std::shared_ptr<Arena> aggregates_pool = std::make_shared<Arena>();
|
||||
AggregateDataPtr place = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
|
||||
try
|
||||
{
|
||||
uniq_exact_array_function->create(place);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
uniq_exact_array_function->destroy(place);
|
||||
throw Exception("Cannot allocate memory", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||
}
|
||||
size_t rows = column->size();
|
||||
ColumnRawPtrs column_ptrs;
|
||||
column_ptrs.emplace_back(column.get());
|
||||
const IColumn ** batch_arguments = column_ptrs.data();
|
||||
|
||||
uniq_exact_array_function->addBatchSinglePlace(rows, place, batch_arguments, nullptr);
|
||||
|
||||
DataTypePtr result_type = std::make_shared<DataTypeUInt64>();
|
||||
ColumnPtr result_column = result_type->createColumn();
|
||||
MutableColumnPtr mutable_column = result_column->assumeMutable();
|
||||
uniq_exact_array_function->insertResultInto(place, *mutable_column, nullptr);
|
||||
|
||||
return (*result_column)[0];
|
||||
}
|
||||
|
||||
AggregateFunctionPtr createAggregateFunctionMin(
|
||||
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
|
||||
{
|
||||
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionMinData>(name, argument_types, parameters, settings));
|
||||
}
|
||||
|
||||
Field Min::executeOnColumn(const ColumnPtr & column, const DataTypePtr & type)
|
||||
{
|
||||
String name = "min";
|
||||
DataTypes argument_types(1);
|
||||
argument_types[0] = type;
|
||||
Array parameters;
|
||||
|
||||
AggregateFunctionPtr nested_function = createAggregateFunctionMin(name, transformArguments(argument_types), parameters, nullptr);
|
||||
AggregateFunctionPtr aggregate_function = type->isNullable()
|
||||
? std::make_shared<AggregateFunctionNullUnary<false, true>>(nested_function, argument_types, parameters)
|
||||
: nested_function;
|
||||
|
||||
size_t total_size_of_aggregate_states = 0; /// The total size of the row from the aggregate functions.
|
||||
// add info to track alignment requirement
|
||||
// If there are states whose alignment are v1, ..vn, align_aggregate_states will be max(v1, ... vn)
|
||||
size_t align_aggregate_states = 1;
|
||||
total_size_of_aggregate_states = aggregate_function->sizeOfData();
|
||||
align_aggregate_states = std::max(align_aggregate_states, aggregate_function->alignOfData());
|
||||
|
||||
std::shared_ptr<Arena> aggregates_pool = std::make_shared<Arena>(); /// The pool that is currently used for allocation.
|
||||
AggregateDataPtr place = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
|
||||
try
|
||||
{
|
||||
/** An exception may occur if there is a shortage of memory.
|
||||
* In order that then everything is properly destroyed, we "roll back" some of the created states.
|
||||
* The code is not very convenient.
|
||||
*/
|
||||
aggregate_function->create(place);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
aggregate_function->destroy(place);
|
||||
throw Exception("Cannot allocate memory", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||
}
|
||||
size_t rows = column->size();
|
||||
ColumnRawPtrs column_ptrs;
|
||||
column_ptrs.emplace_back(column.get());
|
||||
const IColumn ** batch_arguments = column_ptrs.data();
|
||||
|
||||
aggregate_function->addBatchSinglePlace(rows, place, batch_arguments, nullptr);
|
||||
|
||||
ColumnPtr result_column = type->createColumn();
|
||||
MutableColumnPtr mutable_column = result_column->assumeMutable();
|
||||
aggregate_function->insertResultInto(place, *mutable_column, nullptr);
|
||||
return (*result_column)[0];
|
||||
}
|
||||
|
||||
AggregateFunctionPtr createAggregateFunctionMax(
|
||||
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings * settings)
|
||||
{
|
||||
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionMaxData>(name, argument_types, parameters, settings));
|
||||
}
|
||||
|
||||
Field Max::executeOnColumn(const ColumnPtr & column, const DataTypePtr & type)
|
||||
{
|
||||
String name = "max";
|
||||
DataTypes argument_types(1);
|
||||
argument_types[0] = type;
|
||||
Array parameters;
|
||||
|
||||
AggregateFunctionPtr nested_function = createAggregateFunctionMax(name, transformArguments(argument_types), parameters, nullptr);
|
||||
AggregateFunctionPtr aggregate_function = type->isNullable()
|
||||
? std::make_shared<AggregateFunctionNullUnary<false, true>>(nested_function, argument_types, parameters)
|
||||
: nested_function;
|
||||
|
||||
size_t total_size_of_aggregate_states = 0; /// The total size of the row from the aggregate functions.
|
||||
// add info to track alignment requirement
|
||||
// If there are states whose alignment are v1, ..vn, align_aggregate_states will be max(v1, ... vn)
|
||||
size_t align_aggregate_states = 1;
|
||||
total_size_of_aggregate_states = aggregate_function->sizeOfData();
|
||||
align_aggregate_states = std::max(align_aggregate_states, aggregate_function->alignOfData());
|
||||
|
||||
std::shared_ptr<Arena> aggregates_pool = std::make_shared<Arena>(); /// The pool that is currently used for allocation.
|
||||
AggregateDataPtr place = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
|
||||
try
|
||||
{
|
||||
/** An exception may occur if there is a shortage of memory.
|
||||
* In order that then everything is properly destroyed, we "roll back" some of the created states.
|
||||
* The code is not very convenient.
|
||||
*/
|
||||
aggregate_function->create(place);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
aggregate_function->destroy(place);
|
||||
throw Exception("Cannot allocate memory", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||
}
|
||||
size_t rows = column->size();
|
||||
ColumnRawPtrs column_ptrs;
|
||||
column_ptrs.emplace_back(column.get());
|
||||
const IColumn ** batch_arguments = column_ptrs.data();
|
||||
|
||||
aggregate_function->addBatchSinglePlace(rows, place, batch_arguments, nullptr);
|
||||
|
||||
ColumnPtr result_column = type->createColumn();
|
||||
MutableColumnPtr mutable_column = result_column->assumeMutable();
|
||||
aggregate_function->insertResultInto(place, *mutable_column, nullptr);
|
||||
return (*result_column)[0];
|
||||
}
|
||||
|
||||
AggregateFunctionPtr createAggregateFunctionCountByGranularity(
|
||||
const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
|
||||
{
|
||||
if (argument_types.size() != 1)
|
||||
throw Exception("Incorrect number of arguments for aggregate function " + name, ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
const IDataType & argument_type = *argument_types[0];
|
||||
WhichDataType which(argument_type);
|
||||
|
||||
if (which.isNothing() || which.isArray() || which.isFunction() || which.isAggregateFunction() || which.isMap() || which.isBitmap64()
|
||||
|| which.isSet() || which.isTuple() || which.isInterval() || which.isDecimal() || which.isInt128() || which.isUInt128() || which.isDateOrDateTime())
|
||||
{
|
||||
throw Exception(
|
||||
"argument of " + name
|
||||
+ " can not be "
|
||||
"(Nothing,Array,Function,"
|
||||
"AggregateFunction,Map,Bitmap64,"
|
||||
"Set,Tuple,Interval,"
|
||||
"Decimal,Int128,UInt128, DateOrDateTime)",
|
||||
ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
else if (which.isStringOrFixedString())
|
||||
{
|
||||
//auto a =AggregateFunctionCountByGranularity<String>(argument_types, params);
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<String>>(argument_types, params);
|
||||
}
|
||||
else if (which.isInt8())
|
||||
{
|
||||
auto a = AggregateFunctionCountByGranularity<Int8>(argument_types, params);
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Int8>>(argument_types, params);
|
||||
}
|
||||
else if (which.isUInt8() || which.isEnum8())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<UInt8>>(argument_types, params);
|
||||
}
|
||||
else if (which.isInt16())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Int16>>(argument_types, params);
|
||||
}
|
||||
else if (which.isUInt16() || which.isEnum16())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<UInt16>>(argument_types, params);
|
||||
}
|
||||
else if (which.isInt32())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Int32>>(argument_types, params);
|
||||
}
|
||||
else if (which.isUInt32() || which.isDateTime())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<UInt32>>(argument_types, params);
|
||||
}
|
||||
else if (which.isInt64())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Int64>>(argument_types, params);
|
||||
}
|
||||
else if (which.isUInt64())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<UInt64>>(argument_types, params);
|
||||
}
|
||||
// TODO can't support Int128 for now
|
||||
// else if (which.isInt128())
|
||||
// {
|
||||
// return std::make_shared<AggregateFunctionCountByGranularity<Int128>>(argument_types, params);
|
||||
// }
|
||||
else if (which.isUInt128())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<UInt128>>(argument_types, params);
|
||||
}
|
||||
else if (which.isFloat32())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Float32>>(argument_types, params);
|
||||
}
|
||||
else if (which.isFloat64())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Float64>>(argument_types, params);
|
||||
}
|
||||
// TODO can't support Decimal for now
|
||||
// else if (which.isDecimal32())
|
||||
// {
|
||||
// return std::make_shared<AggregateFunctionCountByGranularity<Decimal32>>(argument_types, params);
|
||||
// }
|
||||
// else if (which.isDecimal64() || which.isDateTime64())
|
||||
// {
|
||||
// return std::make_shared<AggregateFunctionCountByGranularity<Decimal64>>(argument_types, params);
|
||||
// }
|
||||
// else if (which.isDecimal128())
|
||||
// {
|
||||
// return std::make_shared<AggregateFunctionCountByGranularity<Decimal128>>(argument_types, params);
|
||||
// }
|
||||
else
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<String>>(argument_types, params);
|
||||
}
|
||||
|
||||
__builtin_unreachable();
|
||||
}
|
||||
|
||||
ColumnPtr CountByGranularity::executeOnColumn(const ColumnPtr & column, const DataTypePtr & type)
|
||||
{
|
||||
String name = "countByGranularity";
|
||||
DataTypes argument_types(1);
|
||||
argument_types[0] = recursiveRemoveLowCardinality(type);
|
||||
Array parameters;
|
||||
|
||||
AggregateFunctionPtr nested_function = createAggregateFunctionCountByGranularity(name, transformArguments(argument_types), parameters, nullptr);
|
||||
AggregateFunctionPtr aggregate_function = argument_types[0]->isNullable() ? std::make_shared<AggregateFunctionNullUnary<false, true>>(nested_function, argument_types, parameters)
|
||||
: nested_function;
|
||||
|
||||
size_t total_size_of_aggregate_states = 0; /// The total size of the row from the aggregate functions.
|
||||
// add info to track alignment requirement
|
||||
// If there are states whose alignment are v1, ..vn, align_aggregate_states will be max(v1, ... vn)
|
||||
size_t align_aggregate_states = 1;
|
||||
total_size_of_aggregate_states = aggregate_function->sizeOfData();
|
||||
align_aggregate_states = std::max(align_aggregate_states, aggregate_function->alignOfData());
|
||||
|
||||
std::shared_ptr<Arena> aggregates_pool = std::make_shared<Arena>(); /// The pool that is currently used for allocation.
|
||||
AggregateDataPtr place = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
|
||||
try
|
||||
{
|
||||
/** An exception may occur if there is a shortage of memory.
|
||||
* In order that then everything is properly destroyed, we "roll back" some of the created states.
|
||||
* The code is not very convenient.
|
||||
*/
|
||||
aggregate_function->create(place);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
aggregate_function->destroy(place);
|
||||
throw Exception("Cannot allocate memory", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
|
||||
}
|
||||
size_t rows = column->size();
|
||||
ColumnRawPtrs column_ptrs;
|
||||
column_ptrs.emplace_back(recursiveRemoveLowCardinality(column).get());
|
||||
const IColumn ** batch_arguments = column_ptrs.data();
|
||||
|
||||
aggregate_function->addBatchSinglePlace(rows, place, batch_arguments, nullptr);
|
||||
|
||||
ColumnPtr result_column = nested_function->getReturnType()->createColumn();
|
||||
MutableColumnPtr mutable_column = result_column->assumeMutable();
|
||||
aggregate_function->insertResultInto(place, *mutable_column, nullptr);
|
||||
|
||||
return result_column;
|
||||
}
|
||||
|
||||
}
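The factory above falls back to the String specialization for any type it does not handle explicitly, and CountByGranularity::executeOnColumn() drives a single aggregate state over an entire column and hands back the result column. A minimal driver sketch, not part of the diff: it assumes the ByConity tree, that the CountByGranularity wrapper declared in the header hunk below is visible, and uses made-up sample values.

#include <Columns/ColumnString.h>
#include <DataTypes/DataTypeString.h>

using namespace DB;

ColumnPtr countGranularityOfSample()
{
    /// Three rows, two distinct values; insertData appends raw string payloads.
    auto column = ColumnString::create();
    column->insertData("alpha", 5);
    column->insertData("alpha", 5);
    column->insertData("beta", 4);

    DataTypePtr type = std::make_shared<DataTypeString>();

    /// Wraps createAggregateFunctionCountByGranularity + addBatchSinglePlace
    /// as shown above and returns the column produced by insertResultInto().
    CountByGranularity counter;
    return counter.executeOnColumn(ColumnPtr(std::move(column)), type);
}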
@@ -0,0 +1,33 @@
#pragma once

#include <Columns/IColumn.h>
#include <DataTypes/IDataType.h>

namespace DB
{
class UniExtract
{
public:
    Field executeOnColumn(const ColumnPtr & column, const DataTypePtr & type);
    Field executeOnColumnArray(const ColumnPtr & column, const DataTypePtr & type);
};

class Min
{
public:
    Field executeOnColumn(const ColumnPtr & column, const DataTypePtr & type);
};

class Max
{
public:
    Field executeOnColumn(const ColumnPtr & column, const DataTypePtr & type);
};

class CountByGranularity
{
public:
    ColumnPtr executeOnColumn(const ColumnPtr & column, const DataTypePtr & type);
};

}
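Min and Max behave like UniExtract: each call aggregates a whole column and returns the result as a Field, which is how TypeAdvisor::execute() later derives value ranges. A hedged usage sketch with made-up sample data (assumes the ByConity headers and that the wrapper classes above are visible):

#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypesNumber.h>

using namespace DB;

void minMaxOfSample()
{
    auto data = ColumnUInt64::create();
    data->insertValue(7);
    data->insertValue(42);
    data->insertValue(3);

    ColumnPtr column(std::move(data));
    DataTypePtr type = std::make_shared<DataTypeUInt64>();

    Min min_func;
    Max max_func;
    /// Each call aggregates the full column and returns the value as a Field.
    UInt64 lo = min_func.executeOnColumn(column, type).get<UInt64>();
    UInt64 hi = max_func.executeOnColumn(column, type).get<UInt64>();
    /// lo == 3, hi == 42 for the sample above.
    (void)lo;
    (void)hi;
}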
@ -0,0 +1,237 @@
|
|||
#include "TypeAdvisor.h"
|
||||
#include "ColumnUsageExtractor.h"
|
||||
#include "Core/Types.h"
|
||||
#include "SampleColumnReader.h"
|
||||
#include "Statistics.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <boost/algorithm/string/join.hpp>
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include <Common/ThreadPool.h>
|
||||
#include <Core/NamesAndTypes.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/MapHelpers.h>
|
||||
#include <IO/copyData.h>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
#include <IO/ReadBufferFromFileDescriptor.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteBufferFromFile.h>
|
||||
#include <IO/WriteBufferFromFileDescriptor.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Parsers/ASTCreateQuery.h>
|
||||
#include <Parsers/ParserCreateQuery.h>
|
||||
#include <Parsers/parseQuery.h>
|
||||
#include <Parsers/queryToString.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
TypeAdvisor::TypeAdvisor(
|
||||
MockEnvironment & env_,
|
||||
const po::variables_map & options_,
|
||||
const ColumnsDescription & column_descs_,
|
||||
std::string absolute_part_path_,
|
||||
size_t sample_row_number_,
|
||||
size_t max_threads_,
|
||||
bool lc_only_,
|
||||
Float64 scanned_count_threshold_for_lc_,
|
||||
Float64 cardinality_ratio_threshold_for_lc_)
|
||||
: env(env_)
|
||||
, options(options_)
|
||||
, column_descs(column_descs_)
|
||||
, absolute_part_path(absolute_part_path_ + "/")
|
||||
, sample_row_number(sample_row_number_)
|
||||
, max_threads(max_threads_)
|
||||
, lc_only(lc_only_)
|
||||
, scanned_count_threshold_for_lc(scanned_count_threshold_for_lc_)
|
||||
, cardinality_ratio_threshold_for_lc(cardinality_ratio_threshold_for_lc_)
|
||||
{
|
||||
parseCodecCandidates();
|
||||
}
|
||||
|
||||
void TypeAdvisor::parseCodecCandidates()
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
DataTypePtr decayDataType(DataTypePtr type)
|
||||
{
|
||||
if (type->isNullable())
|
||||
return dynamic_cast<const DataTypeNullable *>(type.get())->getNestedType();
|
||||
return type;
|
||||
}
|
||||
|
||||
TypeAdvisor::TypeRecommendation buildTypeRecommendation(std::string column_name, std::string origin_type, bool is_type_nullable, std::string optimized_type)
|
||||
{
|
||||
return {column_name, is_type_nullable ? "Nullable(" + origin_type + ")" : origin_type, is_type_nullable ? "Nullable(" + optimized_type + ")" : optimized_type};
|
||||
}
|
||||
|
||||
void TypeAdvisor::adviseLowCardinality()
|
||||
{
|
||||
auto context = createContext(options, env);
|
||||
auto queries = loadQueries(options);
|
||||
|
||||
ColumnUsageExtractor extractor(context, max_threads);
|
||||
auto column_usages = extractor.extractColumnUsages(queries);
|
||||
auto type_usages = extractor.extractUsageForLowCardinality(column_usages);
|
||||
|
||||
LOG_DEBUG(getLogger("TypeAdvisor"), "Extract {} candidate columns, {}, {}", type_usages.size(), scanned_count_threshold_for_lc, cardinality_ratio_threshold_for_lc);
|
||||
|
||||
UniExtract uniq_extract;
|
||||
for (const auto & type_usage : type_usages)
|
||||
{
|
||||
if (type_usage.second < queries.size() * scanned_count_threshold_for_lc)
|
||||
{
|
||||
LOG_DEBUG(getLogger("TypeAdvisor"), "Do not Recommend lowcardinality column {}, scanned count is {}", type_usage.first.column, type_usage.second);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto column_info = type_usage.first;
|
||||
if (isMapImplicitKey(column_info.column))
|
||||
continue;
|
||||
|
||||
auto storage = MockEnvironment::tryGetLocalTable(column_info.database, column_info.table, context);
|
||||
if (!storage)
|
||||
throw Exception(column_info.database + "(" + column_info.table + "): can not find local table.", ErrorCodes::NOT_FOUND_EXPECTED_DATA_PART);
|
||||
|
||||
auto metadata = storage->getInMemoryMetadataCopy();
|
||||
auto column_and_type = metadata.getColumns().tryGetColumn(GetColumnsOptions::Kind::AllPhysical, column_info.column);
|
||||
if (!column_and_type)
|
||||
continue;
|
||||
|
||||
auto column_type = column_and_type->type;
|
||||
if (column_type->getTypeId() == TypeIndex::LowCardinality || !isString(decayDataType(column_type)))
|
||||
continue;
|
||||
|
||||
SampleColumnReader reader(absolute_part_path + "/", 0, sample_row_number);
|
||||
ColumnPtr column;
|
||||
try
|
||||
{
|
||||
column = reader.readColumn({type_usage.first.column, column_type});
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
// Just skip the column if it can't be read
|
||||
LOG_DEBUG(
|
||||
getLogger("TypeAdvisor"),
|
||||
"Can't read column file " + type_usage.first.column + " from table " + column_info.database + "." + column_info.table
|
||||
+ ", error message: "
|
||||
+ getCurrentExceptionMessage(true));
|
||||
continue;
|
||||
}
|
||||
|
||||
// All following: check skip index
|
||||
size_t ndv = uniq_extract.executeOnColumn(column, column_type).get<UInt64>();
|
||||
|
||||
if (ndv > sample_row_number * cardinality_ratio_threshold_for_lc)
|
||||
{
|
||||
LOG_DEBUG(getLogger("TypeAdvisor"), "Do not Recommend lowcardinality column {}, scanned count is {}, ndv is {}", type_usage.first.column, type_usage.second, ndv);
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG_DEBUG(getLogger("TypeAdvisor"), "Recommend lowcardinality column {}, scanned count is {}, ndv is {}", type_usage.first.column, type_usage.second, ndv);
|
||||
|
||||
        type_recommendations.push_back({column_and_type->name
            , column_and_type->type->isNullable() ? "Nullable(String)" : "String"
            , column_and_type->type->isNullable() ? "LowCardinality(Nullable(String))" : "LowCardinality(String)"});
    }
|
||||
|
||||
}
|
||||
|
||||
void TypeAdvisor::execute()
|
||||
{
|
||||
if (lc_only)
|
||||
return adviseLowCardinality();
|
||||
|
||||
UniExtract uniqExtractFunc;
|
||||
Max maxFunc;
|
||||
Min minFunc;
|
||||
SampleColumnReader reader(absolute_part_path, 0, sample_row_number);
|
||||
for (const NameAndTypePair & name_and_type : column_descs.getOrdinary())
|
||||
{
|
||||
auto decayed_type = decayDataType(name_and_type.type);
|
||||
|
||||
bool is_string = decayed_type->getTypeId() == TypeIndex::String;
|
||||
bool is_float_64 = decayed_type->getTypeId() == TypeIndex::Float64;
|
||||
bool is_unsigned_integer = decayed_type->isValueRepresentedByUnsignedInteger() && decayed_type->isSummable();
|
||||
bool is_integer = decayed_type->isValueRepresentedByInteger() && decayed_type->isSummable();
|
||||
|
||||
if (is_string)
|
||||
{
|
||||
ColumnPtr column = reader.readColumn(name_and_type);
|
||||
auto ndv = uniqExtractFunc.executeOnColumn(column, name_and_type.type).get<UInt64>();
|
||||
if (ndv < ADVISOR_LOW_CARDINALITY_NDV_THRESHOLD)
|
||||
type_recommendations.push_back({name_and_type.name
|
||||
, name_and_type.type->isNullable() ? "Nullable(String)" : "String"
|
||||
, name_and_type.type->isNullable() ? "LowCardinality(Nullable(String))" : "LowCardinality(String)"});
|
||||
}
|
||||
else if (is_float_64)
|
||||
{
|
||||
ColumnPtr column = reader.readColumn(name_and_type);
|
||||
auto max = maxFunc.executeOnColumn(column, name_and_type.type).get<Float64>();
|
||||
auto min = minFunc.executeOnColumn(column, name_and_type.type).get<Float64>();
|
||||
if (min >= std::numeric_limits<Float32>::min() && max <= std::numeric_limits<Float32>::max())
|
||||
type_recommendations.push_back({name_and_type.name
|
||||
, name_and_type.type->isNullable() ? "Nullable(Float64)" : "Float64"
|
||||
, name_and_type.type->isNullable() ? "Nullable(Float32)" : "Float32"});
|
||||
}
|
||||
else if (is_unsigned_integer)
|
||||
{
|
||||
if (decayed_type->getTypeId() == TypeIndex::UInt8) /// skip UInt8
|
||||
continue;
|
||||
|
||||
ColumnPtr column = reader.readColumn(name_and_type);
|
||||
auto max = maxFunc.executeOnColumn(column, name_and_type.type).get<UInt64>();
|
||||
if (max <= std::numeric_limits<UInt8>::max())
|
||||
type_recommendations.push_back(buildTypeRecommendation(name_and_type.name, decayed_type->getName(), name_and_type.type->isNullable(), "UInt8"));
|
||||
else if (max <= std::numeric_limits<UInt16>::max())
|
||||
type_recommendations.push_back(buildTypeRecommendation(name_and_type.name, decayed_type->getName(), name_and_type.type->isNullable(), "UInt16"));
|
||||
else if (max <= std::numeric_limits<UInt32>::max())
|
||||
type_recommendations.push_back(buildTypeRecommendation(name_and_type.name, decayed_type->getName(), name_and_type.type->isNullable(), "UInt32"));
|
||||
}
|
||||
else if (is_integer)
|
||||
{
|
||||
if (decayed_type->getTypeId() == TypeIndex::Int8) /// skip Int8
|
||||
continue;
|
||||
|
||||
ColumnPtr column = reader.readColumn(name_and_type);
|
||||
auto max = maxFunc.executeOnColumn(column, name_and_type.type).get<Int64>();
|
||||
auto min = minFunc.executeOnColumn(column, name_and_type.type).get<Int64>();
|
||||
if (min >= std::numeric_limits<Int8>::min() && max <= std::numeric_limits<Int8>::max())
|
||||
type_recommendations.push_back(buildTypeRecommendation(name_and_type.name, decayed_type->getName(), name_and_type.type->isNullable(), "Int8"));
|
||||
else if (min >= std::numeric_limits<Int16>::min() && max <= std::numeric_limits<Int16>::max())
|
||||
type_recommendations.push_back(buildTypeRecommendation(name_and_type.name, decayed_type->getName(), name_and_type.type->isNullable(), "Int16"));
|
||||
else if (min >= std::numeric_limits<Int32>::min() && max <= std::numeric_limits<Int32>::max())
|
||||
type_recommendations.push_back(buildTypeRecommendation(name_and_type.name, decayed_type->getName(), name_and_type.type->isNullable(), "Int32"));
|
||||
}
|
||||
/// TODO(weiping.qw): add more rules
|
||||
}
|
||||
}
|
||||
|
||||
void TypeAdvisor::serializeJson(WriteBuffer & buf, [[maybe_unused]] bool verbose)
|
||||
{
|
||||
bool first = true;
|
||||
writeString("\"type\":[", buf);
|
||||
for (const auto & entry : type_recommendations)
|
||||
{
|
||||
if (first)
|
||||
first = false;
|
||||
else
|
||||
writeString(",", buf);
|
||||
std::string column_name = entry.column_name;
|
||||
writeString("{\"name\":\"", buf);
|
||||
writeString(column_name, buf);
|
||||
writeString("\",", buf);
|
||||
std::string column_origin_type = entry.origin_type;
|
||||
std::string column_optimized_type = entry.optimized_type;
|
||||
writeString("\"origin\":\"", buf);
|
||||
writeString(column_origin_type, buf);
|
||||
writeString("\",", buf);
|
||||
writeString("\"optimized\":\"", buf);
|
||||
writeString(column_optimized_type, buf);
|
||||
writeString("\"}", buf);
|
||||
}
|
||||
writeString("]", buf);
|
||||
}
|
||||
|
||||
}
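serializeJson() above emits a bare "type":[...] fragment, presumably embedded by the surrounding schema-advisor output. A small capture sketch, illustrative only: it assumes an advisor already populated by execute(), and the column name in the comment is a placeholder.

#include <IO/WriteBufferFromString.h>
#include <iostream>

void dumpTypeAdvice(DB::TypeAdvisor & advisor)
{
    DB::String fragment;
    {
        /// WriteBufferFromString flushes into `fragment` when it goes out of scope.
        DB::WriteBufferFromString buf(fragment);
        advisor.serializeJson(buf);
    }
    /// With a single recommendation the fragment looks like:
    ///   "type":[{"name":"user_tag","origin":"String","optimized":"LowCardinality(String)"}]
    std::cerr << fragment << std::endl;
}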
@ -0,0 +1,71 @@
|
|||
#pragma once
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
|
||||
#include "SchemaAdvisorHelpers.h"
|
||||
|
||||
#include <Compression/ICompressionCodec.h>
|
||||
#include <IO/WriteBuffer.h>
|
||||
#include <Storages/ColumnsDescription.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
class TypeAdvisor
|
||||
{
|
||||
public:
|
||||
struct TypeRecommendation
|
||||
{
|
||||
TypeRecommendation(
|
||||
std::string column_name_,
|
||||
std::string origin_type_,
|
||||
std::string optimized_type_
|
||||
) : column_name(column_name_)
|
||||
, origin_type(origin_type_)
|
||||
, optimized_type(optimized_type_) {}
|
||||
|
||||
std::string column_name;
|
||||
std::string origin_type;
|
||||
std::string optimized_type;
|
||||
};
|
||||
|
||||
private:
|
||||
static constexpr const size_t ADVISOR_LOW_CARDINALITY_NDV_THRESHOLD = 65535;
|
||||
|
||||
MockEnvironment & env;
|
||||
po::variables_map options;
|
||||
const ColumnsDescription column_descs;
|
||||
Codecs codecs_to_compare;
|
||||
std::string absolute_part_path;
|
||||
const size_t sample_row_number;
|
||||
[[maybe_unused]] const size_t max_threads;
|
||||
std::vector<TypeRecommendation> type_recommendations;
|
||||
const bool lc_only;
|
||||
const Float64 scanned_count_threshold_for_lc;
|
||||
const Float64 cardinality_ratio_threshold_for_lc;
|
||||
|
||||
void parseCodecCandidates();
|
||||
|
||||
void adviseLowCardinality();
|
||||
|
||||
public:
|
||||
TypeAdvisor(
|
||||
MockEnvironment & env_,
|
||||
const po::variables_map & options_,
|
||||
const ColumnsDescription & column_descs_,
|
||||
std::string absolute_part_path_,
|
||||
size_t sample_row_number_,
|
||||
size_t max_threads_,
|
||||
bool lc_only_,
|
||||
Float64 scanned_count_threshold_for_lc_,
|
||||
Float64 cardinality_ratio_threshold_for_lc_);
|
||||
|
||||
virtual ~TypeAdvisor() = default;
|
||||
|
||||
void execute();
|
||||
void serializeJson(WriteBuffer & buf, bool verbose = false);
|
||||
};
|
||||
|
||||
}
|
|
@@ -0,0 +1,6 @@
int mainEntryClickHouseSchemaAdvisor(int argc, char ** argv);

int main(int argc_, char ** argv_)
{
    return mainEntryClickHouseSchemaAdvisor(argc_, argv_);
}
|
|
@@ -1183,6 +1183,17 @@ int Server::main(const std::vector<std::string> & /*args*/)
            global_context->setVWCustomizedSettings(std::make_shared<VWCustomizedSettings>(config));
        }

        if (global_context->getIsRestrictSettingsToWhitelist())
        {
            auto setting_names = getMultipleValuesFromConfig(*config, "tenant_whitelist_settings", "name");
            std::unordered_set<String> setting_names_set;
            for (auto& setting : setting_names)
            {
                setting_names_set.emplace(setting);
            }
            global_context->setExtraRestrictSettingsToWhitelist(std::move(setting_names_set));
        }

        if (auto catalog = global_context->tryGetCnchCatalog())
            catalog->loadFromConfig("catalog_service", *config);
    },
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
#include <Advisor/Advisor.h>
|
||||
|
||||
#include <Advisor/AdvisorContext.h>
|
||||
#include <Advisor/Rules/ClusterKeyAdvise.h>
|
||||
#include <Advisor/Rules/ColumnUsageAdvise.h>
|
||||
#include <Advisor/Rules/DataTypeAdvise.h>
|
||||
#include <Advisor/Rules/MaterializedViewAdvise.h>
|
||||
#include <Advisor/Rules/OrderByKeyAdvise.h>
|
||||
#include <Advisor/Rules/PartitionKeyAdvise.h>
|
||||
#include <Advisor/Rules/WorkloadAdvisor.h>
|
||||
#include <Advisor/WorkloadQuery.h>
|
||||
|
@ -26,19 +28,25 @@ WorkloadAdvisors Advisor::getAdvisors(ASTAdviseQuery::AdvisorType type)
|
|||
{
|
||||
case ASTAdviseQuery::AdvisorType::ALL:
|
||||
return {
|
||||
std::make_shared<ClusterKeyAdvisor>(),
|
||||
std::make_shared<OrderByKeyAdvisor>(),
|
||||
std::make_shared<PartitionKeyAdvisor>(),
|
||||
std::make_shared<DataTypeAdvisor>(),
|
||||
std::make_shared<MaterializedViewAdvisor>(MaterializedViewAdvisor::OutputType::PROJECTION, true, true),
|
||||
std::make_shared<MaterializedViewAdvisor>(MaterializedViewAdvisor::OutputType::MATERIALIZED_VIEW, true, false)};
|
||||
std::make_shared<MaterializedViewAdvisor>(MaterializedViewAdvisor::OutputType::MATERIALIZED_VIEW, true, true)};
|
||||
|
||||
case ASTAdviseQuery::AdvisorType::ORDER_BY:
|
||||
return {std::make_shared<ClusterKeyAdvisor>()};
|
||||
case ASTAdviseQuery::AdvisorType::DISTRIBUTED_BY:
|
||||
return {std::make_shared<OrderByKeyAdvisor>()};
|
||||
case ASTAdviseQuery::AdvisorType::CLUSTER_BY:
|
||||
return {std::make_shared<PartitionKeyAdvisor>()};
|
||||
case ASTAdviseQuery::AdvisorType::DATA_TYPE:
|
||||
return {std::make_shared<DataTypeAdvisor>()};
|
||||
case ASTAdviseQuery::AdvisorType::MATERIALIZED_VIEW:
|
||||
return {std::make_shared<MaterializedViewAdvisor>(MaterializedViewAdvisor::OutputType::MATERIALIZED_VIEW, true, false)};
|
||||
return {std::make_shared<MaterializedViewAdvisor>(MaterializedViewAdvisor::OutputType::MATERIALIZED_VIEW, true, true)};
|
||||
case ASTAdviseQuery::AdvisorType::PROJECTION:
|
||||
return {std::make_shared<MaterializedViewAdvisor>(MaterializedViewAdvisor::OutputType::PROJECTION, true, true)};
|
||||
}
|
||||
case ASTAdviseQuery::AdvisorType::COLUMN_USAGE:
|
||||
return {std::make_shared<ColumnUsageAdvisor>()};
|
||||
}
|
||||
}
|
||||
|
||||
WorkloadAdvises Advisor::analyze(const std::vector<String> & queries_, ContextPtr context_)
|
||||
|
|
|
@ -21,7 +21,7 @@ public:
|
|||
}
|
||||
WorkloadAdvises analyze(const std::vector<String> & queries, ContextPtr context);
|
||||
|
||||
private:
|
||||
private:
|
||||
static WorkloadAdvisors getAdvisors(ASTAdviseQuery::AdvisorType type);
|
||||
|
||||
ASTAdviseQuery::AdvisorType type;
|
||||
|
|
|
@ -19,7 +19,7 @@ AdvisorContext AdvisorContext::buildFrom(ContextMutablePtr session_context, Work
|
|||
ColumnUsages column_usages = buildColumnUsages(queries);
|
||||
SignatureUsages signature_usages = buildSignatureUsages(queries, session_context);
|
||||
|
||||
std::unordered_map<String, WorkloadQueryPtr> query_id_to_query;
|
||||
std::unordered_map<String, WorkloadQueryPtr> query_id_to_query;
|
||||
for (const auto & query : queries)
|
||||
query_id_to_query[query->getQueryId()] = query;
|
||||
|
||||
|
@ -27,7 +27,7 @@ std::unordered_map<String, WorkloadQueryPtr> query_id_to_query;
|
|||
session_context,
|
||||
tables,
|
||||
queries,
|
||||
std::move(query_id_to_query),
|
||||
std::move(query_id_to_query),
|
||||
query_thread_pool,
|
||||
std::move(column_usages),
|
||||
std::move(signature_usages));
|
||||
|
|
|
@ -19,7 +19,7 @@ public:
|
|||
ContextMutablePtr session_context;
|
||||
WorkloadTables & tables;
|
||||
WorkloadQueries & queries;
|
||||
std::unordered_map<String, WorkloadQueryPtr> query_id_to_query;
|
||||
std::unordered_map<String, WorkloadQueryPtr> query_id_to_query;
|
||||
ThreadPool & query_thread_pool;
|
||||
const ColumnUsages column_usages;
|
||||
const SignatureUsages signature_usages;
|
||||
|
@ -34,16 +34,16 @@ std::unordered_map<String, WorkloadQueryPtr> query_id_to_query;
|
|||
private:
|
||||
AdvisorContext(
|
||||
ContextMutablePtr _session_context,
|
||||
WorkloadTables & _tables,
|
||||
WorkloadQueries & _queries,
|
||||
std::unordered_map<String, WorkloadQueryPtr> _query_id_to_query,
|
||||
ThreadPool & _query_thread_pool,
|
||||
ColumnUsages _column_usages,
|
||||
SignatureUsages _signature_usages)
|
||||
WorkloadTables & _tables,
|
||||
WorkloadQueries & _queries,
|
||||
std::unordered_map<String, WorkloadQueryPtr> _query_id_to_query,
|
||||
ThreadPool & _query_thread_pool,
|
||||
ColumnUsages _column_usages,
|
||||
SignatureUsages _signature_usages)
|
||||
: session_context(_session_context)
|
||||
, tables(_tables)
|
||||
, queries(_queries)
|
||||
, query_id_to_query(std::move(_query_id_to_query))
|
||||
, query_id_to_query(std::move(_query_id_to_query))
|
||||
, query_thread_pool(_query_thread_pool)
|
||||
, column_usages(std::move(_column_usages))
|
||||
, signature_usages(std::move(_signature_usages))
|
||||
|
|
|
@ -3,10 +3,12 @@
|
|||
#include <Advisor/WorkloadQuery.h>
|
||||
#include <Analyzers/QualifiedColumnName.h>
|
||||
#include <Core/Types.h>
|
||||
#include <Storages/MergeTree/Index/BitmapIndexHelper.h>
|
||||
#include <Functions/FunctionsComparison.h>
|
||||
#include <Interpreters/StorageID.h>
|
||||
#include <Optimizer/CostModel/CostCalculator.h>
|
||||
#include <Optimizer/PredicateUtils.h>
|
||||
#include <Optimizer/SymbolsExtractor.h>
|
||||
#include <Parsers/IAST_fwd.h>
|
||||
#include <Parsers/ASTFunction.h>
|
||||
#include <Parsers/ASTIdentifier.h>
|
||||
|
@ -52,13 +54,23 @@ namespace
|
|||
return function.name == "in" && function.arguments && function.arguments->children.size() == 2;
|
||||
}
|
||||
|
||||
std::optional<std::pair<std::string, ColumnUsageType>> extractPredicateUsage(ConstASTPtr expression)
|
||||
ASTPtr unwarpMonotonicFunction(ASTPtr expr)
|
||||
{
|
||||
auto fun = dynamic_pointer_cast<const ASTFunction>(expression);
|
||||
if (auto * function = expr->as<ASTFunction>())
|
||||
{
|
||||
if (function->arguments->children.size() == 1)
|
||||
return unwarpMonotonicFunction(function->arguments->children[0]);
|
||||
}
|
||||
return expr;
|
||||
};
|
||||
|
||||
std::optional<std::pair<std::string, ColumnUsageType>> extractPredicateUsage(ConstASTPtr predicate)
|
||||
{
|
||||
auto fun = dynamic_pointer_cast<const ASTFunction>(predicate);
|
||||
if (!fun || !fun->arguments || fun->arguments->children.size() != 2)
|
||||
return std::nullopt;
|
||||
auto identifier = dynamic_pointer_cast<const ASTIdentifier>(fun->arguments->children[0]);
|
||||
if (!identifier)
|
||||
auto left = unwarpMonotonicFunction(fun->arguments->children[0]);
|
||||
        auto identifier = dynamic_pointer_cast<const ASTIdentifier>(left);
        if (!identifier)
|
||||
return std::nullopt;
|
||||
const std::string & symbol = identifier->name();
|
||||
|
||||
|
@ -119,6 +131,9 @@ protected:
|
|||
void visitAggregatingNode(AggregatingNode & node, ColumnUsages & column_usages) override;
|
||||
void visitCTERefNode(CTERefNode & node, ColumnUsages & column_usages) override;
|
||||
|
||||
void extractFilterUsages(ConstASTPtr expr, PlanNodePtr, ColumnUsages & column_usages);
|
||||
void extractArraySetFunctions(ConstASTPtr expression, const PlanNodePtr & node, ColumnUsages & column_usages);
|
||||
|
||||
private:
|
||||
std::unordered_map<std::string, ColumnNameWithSourceTableFlag> symbol_to_table_column_map;
|
||||
std::unordered_set<CTEId> visited_ctes;
|
||||
|
@ -154,6 +169,19 @@ size_t ColumnUsageInfo::getFrequency(ColumnUsageType type, bool only_source_tabl
|
|||
return freq;
|
||||
}
|
||||
|
||||
std::unordered_map<ColumnUsageType, size_t> ColumnUsageInfo::getFrequencies(bool only_source_table) const
{
    std::unordered_map<ColumnUsageType, size_t> res;
    for (const auto & item : usages_only_source_table)
    {
        res[item.first] += 1;
    }
    if (!only_source_table)
    {
        for (const auto & item : usages_non_source_table)
        {
            res[item.first] += 1;
        }
    }
    return res;
}
|
||||
|
||||
std::vector<ColumnUsage> ColumnUsageInfo::getUsages(ColumnUsageType type, bool only_source_table) const
|
||||
{
|
||||
std::vector<ColumnUsage> res{};
|
||||
|
@ -195,11 +223,34 @@ void ColumnUsageVisitor::visitTableScanNode(TableScanNode & node, ColumnUsages &
|
|||
auto table_step = dynamic_pointer_cast<TableScanStep>(node.getStep());
|
||||
const StorageID & storage_id = table_step->getStorageID();
|
||||
|
||||
std::unordered_map<std::string, ColumnNameWithSourceTableFlag> table_columns;
|
||||
|
||||
for (const auto & column_name : table_step->getRequiredColumns())
|
||||
{
|
||||
QualifiedColumnName column{storage_id.getDatabaseName(), storage_id.getTableName(), column_name};
|
||||
table_columns.insert_or_assign(column_name, ColumnNameWithSourceTableFlag{column, true});
|
||||
}
|
||||
|
||||
// extract usages
|
||||
symbol_to_table_column_map.swap(table_columns);
|
||||
for (const auto & column_name : table_step->getRequiredColumns())
|
||||
addUsage(column_usages, column_name, ColumnUsageType::SCANNED, node.shared_from_this());
|
||||
|
||||
if (table_step->getPrewhere())
|
||||
extractFilterUsages(table_step->getPrewhere(), node.shared_from_this(), column_usages);
|
||||
|
||||
// for (auto [output, expr] : table_step->getIndexExpressions())
|
||||
// extractFilterUsages(expr, node.shared_from_this(), column_usages);
|
||||
|
||||
for (auto [output, expr] : table_step->getInlineExpressions())
|
||||
extractFilterUsages(expr, node.shared_from_this(), column_usages);
|
||||
|
||||
symbol_to_table_column_map.swap(table_columns);
|
||||
|
||||
for (const auto & [column_name, alias] : table_step->getColumnAlias())
|
||||
{
|
||||
QualifiedColumnName column{storage_id.getDatabaseName(), storage_id.getTableName(), column_name};
|
||||
symbol_to_table_column_map.emplace(alias, ColumnNameWithSourceTableFlag{column, true});
|
||||
addUsage(column_usages, alias, ColumnUsageType::SCANNED, node.shared_from_this());
|
||||
symbol_to_table_column_map.insert_or_assign(alias, ColumnNameWithSourceTableFlag{column, true});
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -207,12 +258,44 @@ void ColumnUsageVisitor::visitFilterNode(FilterNode & node, ColumnUsages & colum
|
|||
{
|
||||
processChildren(node, column_usages);
|
||||
auto filter_step = dynamic_pointer_cast<FilterStep>(node.getStep());
|
||||
for (const ConstASTPtr & expression : PredicateUtils::extractConjuncts(filter_step->getFilter()))
|
||||
extractFilterUsages(filter_step->getFilter(), node.shared_from_this(), column_usages);
|
||||
}
|
||||
|
||||
void ColumnUsageVisitor::extractFilterUsages(ConstASTPtr expr, PlanNodePtr node, ColumnUsages & column_usages)
|
||||
{
|
||||
for (const auto & expression : PredicateUtils::extractConjuncts(expr))
|
||||
{
|
||||
auto usage_opt = extractPredicateUsage(expression);
|
||||
if (usage_opt.has_value())
|
||||
addUsage(column_usages, usage_opt.value().first, usage_opt.value().second, node.shared_from_this(), expression);
|
||||
addUsage(column_usages, usage_opt.value().first, usage_opt.value().second, node, expression);
|
||||
else
|
||||
{
|
||||
auto names = SymbolsExtractor::extract(expression);
|
||||
for (const auto & name : names)
|
||||
{
|
||||
addUsage(column_usages, name, ColumnUsageType::OTHER_PREDICATE, node, expression);
|
||||
}
|
||||
}
|
||||
}
|
||||
extractArraySetFunctions(expr, node, column_usages);
|
||||
}
|
||||
|
||||
void ColumnUsageVisitor::extractArraySetFunctions(ConstASTPtr expression, const PlanNodePtr & node, ColumnUsages & column_usages)
|
||||
{
|
||||
auto function = dynamic_pointer_cast<const ASTFunction>(expression);
|
||||
if (const auto * func = expression->as<ASTFunction>())
|
||||
{
|
||||
if (!func->arguments || func->arguments->children.empty()) return;
|
||||
auto * ident = func->arguments->children[0]->as<ASTIdentifier>();
|
||||
if (ident && BitmapIndexHelper::isArraySetFunctions(func->name))
|
||||
{
|
||||
addUsage(column_usages, ident->name(), ColumnUsageType::ARRAY_SET_FUNCTION, node, expression);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & child : expression->children)
|
||||
extractArraySetFunctions(child, node, column_usages);
|
||||
}
|
||||
|
||||
void ColumnUsageVisitor::visitJoinNode(JoinNode & node, ColumnUsages & column_usages)
|
||||
|
@ -256,8 +339,10 @@ void ColumnUsageVisitor::visitProjectionNode(ProjectionNode & node, ColumnUsages
|
|||
{
|
||||
auto it = symbol_to_table_column_map.find(identifier->name());
|
||||
if (it != symbol_to_table_column_map.end())
|
||||
symbol_to_table_column_map.emplace(out_symbol, it->second);
|
||||
symbol_to_table_column_map.insert_or_assign(out_symbol, it->second);
|
||||
}
|
||||
|
||||
extractArraySetFunctions(in_ast, node.shared_from_this(), column_usages);
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -286,4 +371,27 @@ void ColumnUsageVisitor::visitCTERefNode(CTERefNode & node, ColumnUsages & colum
    VisitorUtil::accept(cte_info.getCTEs().at(cte_id), *this, column_usages);
}

String toString(ColumnUsageType type)
{
    switch (type)
    {
        case ColumnUsageType::SCANNED:
            return "Scanned";
        case ColumnUsageType::EQUI_JOIN:
            return "EquiJoin";
        case ColumnUsageType::NON_EQUI_JOIN:
            return "NonEquiJoin";
        case ColumnUsageType::GROUP_BY:
            return "GroupBy";
        case ColumnUsageType::EQUALITY_PREDICATE:
            return "EqualityPredicate";
        case ColumnUsageType::IN_PREDICATE:
            return "InPredicate";
        case ColumnUsageType::RANGE_PREDICATE:
            return "RangePredicate";
        case ColumnUsageType::ARRAY_SET_FUNCTION:
            return "ArraySetFunction";
        case ColumnUsageType::OTHER_PREDICATE:
            return "OtherPredicate";
    }
}

}
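Together, the new getFrequencies() accessor and toString(ColumnUsageType) make it straightforward to print a per-type usage summary for a column. A hypothetical helper, not part of the diff, built only on those two additions:

#include <Advisor/ColumnUsage.h>
#include <iostream>

void printUsageSummary(const DB::ColumnUsageInfo & info)
{
    /// only_source_table = true counts usages of columns read directly from the
    /// scanned table, which is how the advisors above consume the frequencies.
    for (const auto & [type, count] : info.getFrequencies(/*only_source_table=*/ true))
        std::cout << DB::toString(type) << ": " << count << '\n';
}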
|
|
|
@@ -23,9 +23,12 @@ enum class ColumnUsageType
    EQUALITY_PREDICATE, // columns in "= literal" filters
    IN_PREDICATE, // columns in "in list" filters
    RANGE_PREDICATE, // columns in "> literal" or "< literal" filters
    ARRAY_SET_FUNCTION, // columns in "has" or "arraySetCheck"
    OTHER_PREDICATE, // columns in "column ???" filters
};

String toString(ColumnUsageType type);

struct ColumnUsage
{
    ColumnUsageType type;

@@ -42,6 +45,7 @@ public:
    void update(ColumnUsage usage, bool is_source_table);

    size_t getFrequency(ColumnUsageType type, bool only_source_table = false) const;
    std::unordered_map<ColumnUsageType, size_t> getFrequencies(bool only_source_table = false) const;
    std::vector<ColumnUsage> getUsages(ColumnUsageType type, bool only_source_table = false) const;

private:
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
#include <Advisor/Rules/ColumnUsageAdvise.h>
|
||||
|
||||
#include <Advisor/AdvisorContext.h>
|
||||
#include <Advisor/ColumnUsage.h>
|
||||
#include <Advisor/Rules/WorkloadAdvisor.h>
|
||||
#include <Advisor/WorkloadTable.h>
|
||||
#include <Analyzers/QualifiedColumnName.h>
|
||||
#include <Core/QualifiedTableName.h>
|
||||
#include <Core/Types.h>
|
||||
#include <Parsers/IAST_fwd.h>
|
||||
#include <Poco/Logger.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class ColumnUsageAdvise : public IWorkloadAdvise
|
||||
{
|
||||
public:
|
||||
ColumnUsageAdvise(QualifiedTableName table_, String column_, std::vector<std::pair<String, double>> candidates_)
|
||||
: table(std::move(table_)), column(std::move(column_)), candidates(std::move(candidates_))
|
||||
{
|
||||
}
|
||||
|
||||
String apply(WorkloadTables &) override { return "not implement"; }
|
||||
|
||||
QualifiedTableName getTable() override { return table; }
|
||||
std::optional<String> getColumnName() override { return column; }
|
||||
String getAdviseType() override { return "Column Usage"; }
|
||||
String getOriginalValue() override { return ""; }
|
||||
String getOptimizedValue() override { return ""; }
|
||||
double getBenefit() override { return 0; }
|
||||
std::vector<std::pair<String, double>> getCandidates() override { return candidates; }
|
||||
|
||||
private:
|
||||
QualifiedTableName table;
|
||||
String column;
|
||||
std::vector<std::pair<String, double>> candidates;
|
||||
};
|
||||
|
||||
WorkloadAdvises ColumnUsageAdvisor::analyze(AdvisorContext & context) const
|
||||
{
|
||||
std::map<QualifiedColumnName, std::unordered_map<String, double>> column_usage_by_table;
|
||||
for (const auto & [qualified_column, metrics] : context.column_usages)
|
||||
{
|
||||
for (const auto & [type, count] : metrics.getFrequencies(true))
|
||||
{
|
||||
column_usage_by_table[qualified_column][toString(type)] += count;
|
||||
}
|
||||
}
|
||||
|
||||
WorkloadAdvises res;
|
||||
for (const auto & [table, column_freq] : column_usage_by_table)
|
||||
{
|
||||
std::vector<std::pair<String, double>> sorted_freq{column_freq.begin(), column_freq.end()};
|
||||
std::sort(sorted_freq.begin(), sorted_freq.end(), [](const auto & p1, const auto & p2) {
|
||||
// enforce unique ordering
|
||||
if (p1.second == p2.second)
|
||||
return p1.first < p2.first;
|
||||
return p1.second < p2.second;
|
||||
});
|
||||
|
||||
res.emplace_back(std::make_shared<ColumnUsageAdvise>(table.getQualifiedTable(), table.column, sorted_freq));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
}
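ColumnUsageAdvisor::analyze() above sorts each column's usage counts with a comparator that falls back to the name on ties, so the advise output is deterministic across runs. The same idea reduced to a self-contained snippet:

#include <algorithm>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main()
{
    std::vector<std::pair<std::string, double>> freq = {
        {"Scanned", 4.0}, {"EquiJoin", 2.0}, {"GroupBy", 2.0}};

    std::sort(freq.begin(), freq.end(), [](const auto & p1, const auto & p2) {
        if (p1.second == p2.second)
            return p1.first < p2.first;   // enforce unique ordering on ties
        return p1.second < p2.second;
    });

    for (const auto & [type, count] : freq)
        std::cout << type << ' ' << count << '\n';
    // Prints EquiJoin 2, GroupBy 2, Scanned 4: ascending counts, ties broken by name.
}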
@@ -0,0 +1,22 @@
#pragma once

#include <Advisor/AdvisorContext.h>
#include <Advisor/Rules/WorkloadAdvisor.h>
#include <Analyzers/QualifiedColumnName.h>
#include <Core/Types.h>
#include <Poco/Logger.h>

namespace DB
{

class ColumnUsageAdvisor : public IWorkloadAdvisor
{
public:
    String getName() const override { return "ColumnUsageAdvisor"; }
    WorkloadAdvises analyze(AdvisorContext & context) const override;

private:
    // Poco::Logger * log = getLogger("OrderByKeyAdvisor");
};

}
|
|
@ -0,0 +1,174 @@
|
|||
#include <Advisor/Rules/DataTypeAdvise.h>
|
||||
|
||||
#include <Advisor/AdvisorContext.h>
|
||||
#include <Advisor/Rules/WorkloadAdvisor.h>
|
||||
#include <Advisor/WorkloadTableStats.h>
|
||||
#include <Core/Types.h>
|
||||
#include <Core/Field.h>
|
||||
#include <Core/QualifiedTableName.h>
|
||||
#include <DataTypes/IDataType.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Optimizer/CardinalityEstimate/SymbolStatistics.h>
|
||||
#include <Optimizer/CardinalityEstimate/PlanNodeStatistics.h>
|
||||
#include <Poco/Logger.h>
|
||||
#include "Interpreters/StorageID.h"
|
||||
#include <Storages/IStorage_fwd.h>
|
||||
#include <Storages/StorageCnchMergeTree.h>
|
||||
#include <Statistics/TypeUtils.h>
|
||||
|
||||
#include <limits>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int UNKNOWN_TABLE;
|
||||
}
|
||||
|
||||
WorkloadAdvises DataTypeAdvisor::analyze(AdvisorContext & context) const
|
||||
{
|
||||
WorkloadAdvises res;
|
||||
for (auto & [table_name, workload_table] : context.tables.getTables())
|
||||
{
|
||||
auto basic_stats = workload_table->getStats().getBasicStats();
|
||||
if (!basic_stats.get())
|
||||
throw Exception("Empty statistics when analyzing data types for table " + table_name.getFullName(), ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
auto storage = DatabaseCatalog::instance().getTable(StorageID{table_name.database, table_name.table}, context.session_context);
|
||||
auto columns = storage->getInMemoryMetadataPtr()->getColumns().getAll();
|
||||
auto extended_stats
|
||||
= workload_table->getStats().collectExtendedStats(context.session_context, table_name.database, table_name.table, columns);
|
||||
|
||||
if (!extended_stats.get())
|
||||
throw Exception("Empty extended statistics when analyzing data types for table " + table_name.getFullName(), ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
const auto local_table = workload_table->getTablePtr();
|
||||
if (!dynamic_cast<const StorageCnchMergeTree *>(local_table.get()))
|
||||
throw Exception("Table " + table_name.getFullName() + " is not merge tree table", ErrorCodes::UNKNOWN_TABLE);
|
||||
|
||||
UInt64 row_count = basic_stats->getRowCount();
|
||||
auto & table_stats = basic_stats->getSymbolStatistics();
|
||||
|
||||
for (auto & [column_name, symbol_stats] : table_stats)
|
||||
{
|
||||
if (symbol_stats->getNullsCount() == row_count) /// all nulls
|
||||
continue;
|
||||
|
||||
const auto & column_type = local_table->getInMemoryMetadataPtr()->getColumns().getPhysical(column_name).type;
|
||||
auto decayed_type = Statistics::decayDataType(column_type);
|
||||
|
||||
bool is_string = decayed_type->getTypeId() == TypeIndex::String || decayed_type->getTypeId() == TypeIndex::FixedString;
|
||||
bool is_unsigned_integer = decayed_type->isValueRepresentedByUnsignedInteger() && decayed_type->isSummable();
|
||||
bool is_integer = decayed_type->isValueRepresentedByInteger() && decayed_type->isSummable();
|
||||
|
||||
String optimized_type;
|
||||
if ((is_string && string_type_advisor->checkAndApply(local_table, symbol_stats, extended_stats->at(column_name), row_count, optimized_type))
|
||||
|| (is_unsigned_integer && integer_type_advisor->checkAndApply(local_table, symbol_stats, decayed_type, true, optimized_type))
|
||||
|| (is_integer && integer_type_advisor->checkAndApply(local_table, symbol_stats, decayed_type, false, optimized_type)))
|
||||
{
|
||||
res.emplace_back(std::make_shared<DataTypeAdvise>(table_name, column_name, column_type->getName(), optimized_type));
|
||||
}
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
bool DataTypeAdvisor::StringTypeAdvisor::checkAndApply(const StoragePtr & local_table, const SymbolStatisticsPtr & symbol_stats, WorkloadExtendedStat & extended_symbol_stats, UInt64 row_count, String & optimized_type)
|
||||
{
|
||||
const auto & nulls_count = symbol_stats->getNullsCount();
|
||||
|
||||
/// check date
|
||||
const Field & count_to_date = extended_symbol_stats[WorkloadExtendedStatsType::COUNT_TO_DATE_OR_NULL];
|
||||
bool all_date = !count_to_date.isNull() ? count_to_date.get<UInt64>() + nulls_count == row_count : false;
|
||||
if (all_date)
|
||||
{
|
||||
optimized_type = nulls_count > 0 ? "Nullable(Date)" : "Date";
|
||||
return true;
|
||||
}
|
||||
|
||||
/// check date time
|
||||
const Field & count_to_date_time = extended_symbol_stats[WorkloadExtendedStatsType::COUNT_TO_DATE_TIME_OR_NULL];
|
||||
bool all_date_time = !count_to_date_time.isNull() ? count_to_date_time.get<UInt64>() + nulls_count == row_count : false;
|
||||
if (all_date_time)
|
||||
{
|
||||
optimized_type = nulls_count > 0 ? "Nullable(DateTime)" : "DateTime";
|
||||
return true;
|
||||
}
|
||||
|
||||
/// check uint32
|
||||
const Field & count_to_uint32 = extended_symbol_stats[WorkloadExtendedStatsType::COUNT_TO_UINT32_OR_NULL];
|
||||
bool all_unsigned_integer = !count_to_uint32.isNull() ? count_to_uint32.get<UInt64>() + nulls_count == row_count : false;
|
||||
if (all_unsigned_integer)
|
||||
{
|
||||
optimized_type = nulls_count > 0 ? "Nullable(UInt32)" : "UInt32";
|
||||
return true;
|
||||
}
|
||||
|
||||
/// check float32
|
||||
const Field & count_to_float32 = extended_symbol_stats[WorkloadExtendedStatsType::COUNT_TO_FLOAT32_OR_NULL];
|
||||
bool all_float32 = !count_to_float32.isNull() ? count_to_float32.get<UInt64>() + nulls_count == row_count : false;
|
||||
if (all_float32)
|
||||
{
|
||||
optimized_type = nulls_count > 0 ? "Nullable(Float32)" : "Float32";
|
||||
return true;
|
||||
}
|
||||
|
||||
/// check (global) low cardinality
|
||||
const auto & ndv = symbol_stats->getNdv();
|
||||
const auto * merge_tree_storage = dynamic_cast<const StorageCnchMergeTree *>(local_table.get());
|
||||
bool can_be_inside_low_cardinality = ndv < merge_tree_storage->getSettings()->low_cardinality_ndv_threshold && ndv + nulls_count != row_count;
|
||||
if (can_be_inside_low_cardinality)
|
||||
{
|
||||
String nested_type = nulls_count > 0 ? "Nullable(String)" : "String";
|
||||
optimized_type = "LowCardinality(" + nested_type + ")";
|
||||
return true;
|
||||
}
|
||||
|
||||
/// check fixed string
|
||||
const auto & avg_len = symbol_stats->getAvg();
|
||||
bool is_fixed_size = false; /// TODO
|
||||
if (is_fixed_size)
|
||||
{
|
||||
optimized_type = nulls_count > 0 ? "Nullable(FixedString("+ toString(avg_len) +"))" : "FixedString(" + toString(avg_len) + ")";
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool DataTypeAdvisor::IntegerTypeAdvisor::checkAndApply(
|
||||
[[maybe_unused]] const StoragePtr & local_table,
|
||||
const SymbolStatisticsPtr & symbol_stats,
|
||||
const DataTypePtr & decayed_original_type,
|
||||
bool is_unsigned_type,
|
||||
String & optimized_type)
|
||||
{
|
||||
const auto & nulls_count = symbol_stats->getNullsCount();
|
||||
const auto & max = symbol_stats->getMax();
|
||||
|
||||
DataTypePtr new_type = nullptr;
|
||||
if (is_unsigned_type)
|
||||
{
|
||||
if (max <= std::numeric_limits<UInt8>::max()) new_type = std::make_shared<DataTypeUInt8>();
|
||||
else if (max <= std::numeric_limits<UInt16>::max()) new_type = std::make_shared<DataTypeUInt16>();
|
||||
else if (max <= std::numeric_limits<UInt32>::max()) new_type = std::make_shared<DataTypeUInt32>();
|
||||
}
|
||||
else
|
||||
{
|
||||
if (max <= std::numeric_limits<Int8>::max()) new_type = std::make_shared<DataTypeInt8>();
|
||||
else if (max <= std::numeric_limits<Int16>::max()) new_type = std::make_shared<DataTypeInt16>();
|
||||
else if (max <= std::numeric_limits<Int32>::max()) new_type = std::make_shared<DataTypeInt32>();
|
||||
}
|
||||
|
||||
if (new_type && new_type->getTypeId() < decayed_original_type->getTypeId())
|
||||
{
|
||||
optimized_type = nulls_count > 0 ? "Nullable(" + new_type->getName() + ")" : new_type->getName();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
}
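IntegerTypeAdvisor above recommends the narrowest integer type whose range still covers the observed extremes. The unsigned half of that rule as a standalone sketch (thresholds follow std::numeric_limits, as in the code above):

#include <cstdint>
#include <iostream>
#include <limits>
#include <string>

std::string narrowUnsigned(uint64_t observed_max)
{
    if (observed_max <= std::numeric_limits<uint8_t>::max())
        return "UInt8";
    if (observed_max <= std::numeric_limits<uint16_t>::max())
        return "UInt16";
    if (observed_max <= std::numeric_limits<uint32_t>::max())
        return "UInt32";
    return "UInt64"; /// nothing narrower fits, keep the original width
}

int main()
{
    std::cout << narrowUnsigned(200) << '\n';                   /// UInt8
    std::cout << narrowUnsigned(70000) << '\n';                 /// UInt32
    std::cout << narrowUnsigned(1'000'000'000'000ULL) << '\n';  /// UInt64
}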
@ -0,0 +1,77 @@
|
|||
#pragma once
|
||||
|
||||
#include <Common/Logger.h>
|
||||
#include <Advisor/AdvisorContext.h>
|
||||
#include <Advisor/Rules/WorkloadAdvisor.h>
|
||||
#include <Advisor/WorkloadTableStats.h>
|
||||
#include <Core/Types.h>
|
||||
#include <Core/QualifiedTableName.h>
|
||||
#include <Optimizer/CardinalityEstimate/SymbolStatistics.h>
|
||||
#include <Poco/Logger.h>
|
||||
#include <Storages/IStorage_fwd.h>
|
||||
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class DataTypeAdvisor : public IWorkloadAdvisor
|
||||
{
|
||||
public:
|
||||
DataTypeAdvisor()
|
||||
{
|
||||
        /// Note: std::unique_ptr<T>() default-constructs a null pointer, so the nested
        /// advisors would never exist; make_unique is required before checkAndApply() is called.
        string_type_advisor = std::make_unique<StringTypeAdvisor>();
        integer_type_advisor = std::make_unique<IntegerTypeAdvisor>();
|
||||
}
|
||||
String getName() const override { return "DataTypeAdvisor"; }
|
||||
WorkloadAdvises analyze(AdvisorContext & context) const override;
|
||||
|
||||
private:
|
||||
class StringTypeAdvisor
|
||||
{
|
||||
public:
|
||||
bool checkAndApply(const StoragePtr & local_table, const SymbolStatisticsPtr & symbol_stats, WorkloadExtendedStat & extended_symbol_stats, UInt64 row_count, String & optimized_type);
|
||||
};
|
||||
|
||||
class IntegerTypeAdvisor
|
||||
{
|
||||
public:
|
||||
bool checkAndApply(const StoragePtr & local_table, const SymbolStatisticsPtr & symbol_stats, const DataTypePtr & decayed_original_type, bool is_unsigned_type, String & optimized_type);
|
||||
};
|
||||
|
||||
std::unique_ptr<StringTypeAdvisor> string_type_advisor;
|
||||
std::unique_ptr<IntegerTypeAdvisor> integer_type_advisor;
|
||||
};
|
||||
|
||||
class DataTypeAdvise : public IWorkloadAdvise
|
||||
{
|
||||
public:
|
||||
DataTypeAdvise(
|
||||
const QualifiedTableName & table_, const String & column_name_, const String & original_type_, const String & new_type_)
|
||||
: table(table_), column_name(column_name_), original_type(original_type_), new_type(new_type_)
|
||||
{
|
||||
}
|
||||
|
||||
String apply([[maybe_unused]] WorkloadTables & tables) override
|
||||
{
|
||||
/// TODO: modify ddl
|
||||
return "";
|
||||
}
|
||||
|
||||
QualifiedTableName getTable() override { return table; }
|
||||
std::optional<String> getColumnName() override { return {column_name}; }
|
||||
String getAdviseType() override { return "Data Type"; }
|
||||
String getOriginalValue() override { return original_type; }
|
||||
String getOptimizedValue() override { return new_type; }
|
||||
|
||||
private:
|
||||
QualifiedTableName table;
|
||||
String column_name;
|
||||
String original_type;
|
||||
String new_type;
|
||||
|
||||
const LoggerPtr log = getLogger("DataTypeAdvise");
|
||||
};
|
||||
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
#include <Advisor/Rules/ClusterKeyAdvise.h>
|
||||
#include <Advisor/Rules/OrderByKeyAdvise.h>
|
||||
|
||||
#include <Advisor/AdvisorContext.h>
|
||||
#include <Advisor/ColumnUsage.h>
|
||||
|
@ -13,20 +13,28 @@
|
|||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class ClusterKeyAdvise : public IWorkloadAdvise
|
||||
class OrderByKeyAdvise : public IWorkloadAdvise
|
||||
{
|
||||
public:
|
||||
ClusterKeyAdvise(QualifiedTableName table_,
|
||||
ASTPtr original_order_by_,
|
||||
String original_column_,
|
||||
String new_column_,
|
||||
double benefit_)
|
||||
: table(std::move(table_)), original_order_by(original_order_by_)
|
||||
, original_column(std::move(original_column_)), new_column(std::move(new_column_)), benefit(benefit_)
|
||||
OrderByKeyAdvise(
|
||||
QualifiedTableName table_,
|
||||
ASTPtr original_order_by_,
|
||||
String original_column_,
|
||||
String new_column_,
|
||||
double benefit_,
|
||||
std::vector<std::pair<String, double>> candidates_)
|
||||
: table(std::move(table_))
|
||||
, original_order_by(original_order_by_)
|
||||
, original_column(std::move(original_column_))
|
||||
, new_column(std::move(new_column_))
|
||||
, benefit(benefit_)
|
||||
, candidates(std::move(candidates_))
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -52,6 +60,7 @@ public:
|
|||
String getOriginalValue() override { return original_column; }
|
||||
String getOptimizedValue() override { return new_column; }
|
||||
double getBenefit() override { return benefit; }
|
||||
std::vector<std::pair<String, double>> getCandidates() override { return candidates; }
|
||||
|
||||
private:
|
||||
QualifiedTableName table;
|
||||
|
@ -59,15 +68,18 @@ private:
|
|||
String original_column;
|
||||
String new_column;
|
||||
double benefit;
|
||||
std::vector<std::pair<String, double>> candidates;
|
||||
};
|
||||
|
||||
WorkloadAdvises ClusterKeyAdvisor::analyze(AdvisorContext & context) const
|
||||
WorkloadAdvises OrderByKeyAdvisor::analyze(AdvisorContext & context) const
|
||||
{
|
||||
std::unordered_map<QualifiedTableName, std::unordered_map<String, size_t>> column_usage_by_table;
|
||||
std::unordered_map<QualifiedTableName, std::unordered_map<String, double>> column_usage_by_table;
|
||||
for (const auto & [qualified_column, metrics] : context.column_usages)
|
||||
{
|
||||
auto predicate_freq = metrics.getFrequency(ColumnUsageType::EQUALITY_PREDICATE, /*only_source_table=*/true)
|
||||
+ metrics.getFrequency(ColumnUsageType::RANGE_PREDICATE, /*only_source_table=*/true);
|
||||
+ metrics.getFrequency(ColumnUsageType::IN_PREDICATE, /*only_source_table=*/true)
|
||||
+ metrics.getFrequency(ColumnUsageType::RANGE_PREDICATE, /*only_source_table=*/true)
|
||||
+ metrics.getFrequency(ColumnUsageType::EQUI_JOIN, /*only_source_table=*/true) /* runtime_filter*/;
|
||||
if (predicate_freq > 0 && isValidColumn(qualified_column, context))
|
||||
column_usage_by_table[qualified_column.getQualifiedTable()][qualified_column.column] += predicate_freq;
|
||||
}
|
||||
|
@ -75,31 +87,33 @@ WorkloadAdvises ClusterKeyAdvisor::analyze(AdvisorContext & context) const
|
|||
WorkloadAdvises res{};
|
||||
for (const auto & [table, column_freq] : column_usage_by_table)
|
||||
{
|
||||
auto max_column_freq = *std::max_element(column_freq.begin(), column_freq.end(),
|
||||
[](const auto & p1, const auto & p2) {
|
||||
// enforce unique ordering
|
||||
if (p1.second == p2.second)
|
||||
return p1.first < p2.first;
|
||||
return p1.second < p2.second;
|
||||
});
|
||||
std::vector<std::pair<String, double>> sorted_freq{column_freq.begin(), column_freq.end()};
|
||||
std::sort(sorted_freq.begin(), sorted_freq.end(), [](const auto & p1, const auto & p2) {
|
||||
// enforce unique ordering
|
||||
if (p1.second == p2.second)
|
||||
return p1.first > p2.first;
|
||||
return p1.second > p2.second;
|
||||
});
|
||||
if (sorted_freq.size() > 3)
|
||||
sorted_freq.resize(3);
|
||||
|
||||
auto optimized_table = context.tables.tryGetTable(table);
|
||||
auto order_by = optimized_table ? optimized_table->getOrderBy() : nullptr;
|
||||
auto original_column = (order_by) ? serializeAST(*order_by) : String{};
|
||||
res.emplace_back(std::make_shared<ClusterKeyAdvise>(table, order_by, original_column, max_column_freq.first, max_column_freq.second));
|
||||
res.emplace_back(
|
||||
std::make_shared<OrderByKeyAdvise>(table, order_by, original_column, sorted_freq[0].first, sorted_freq[0].second, sorted_freq));
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
bool ClusterKeyAdvisor::isValidColumn(const QualifiedColumnName & column, AdvisorContext & context) const
|
||||
bool OrderByKeyAdvisor::isValidColumn(const QualifiedColumnName & /*column*/, AdvisorContext & /*context*/) const
|
||||
{
|
||||
auto column_type = context.getColumnType(column);
|
||||
if (!column_type || !column_type->isComparable()) // sharding key only accepts integers
|
||||
{
|
||||
LOG_DEBUG(log, "Column {}.{}.{} is not a valid order by key, because it is not comparable",
|
||||
column.database, column.table, column.column);
|
||||
return false;
|
||||
}
|
||||
// auto column_type = context.getColumnType(column);
|
||||
// if (!column_type || !column_type->isValueRepresentedByInteger()) // sharding key only accepts integers
|
||||
// {
|
||||
// LOG_DEBUG(log, "Column {}.{}.{} is not a valid sharding key, because it is not an integer type", column.database, column.table, column.column);
|
||||
// return false;
|
||||
// }
|
||||
return true;
|
||||
}
|
||||
|
|
@ -10,10 +10,10 @@
|
|||
namespace DB
|
||||
{
|
||||
|
||||
class ClusterKeyAdvisor : public IWorkloadAdvisor
|
||||
class OrderByKeyAdvisor : public IWorkloadAdvisor
|
||||
{
|
||||
public:
|
||||
String getName() const override { return "ClusterKeyAdvisor"; }
|
||||
String getName() const override { return "OrderByKeyAdvisor"; }
|
||||
WorkloadAdvises analyze(AdvisorContext & context) const override;
|
||||
|
||||
private:
|
|
@ -78,7 +78,11 @@ public:
|
|||
virtual String getOriginalValue() = 0;
|
||||
virtual String getOptimizedValue() = 0;
|
||||
virtual double getBenefit() { return 0.0; }
|
||||
virtual std::vector<String> getRelatedQueries() { return {}; }
|
||||
virtual std::vector<std::pair<String, double>> getCandidates() { return {}; }
|
||||
virtual std::vector<String> getRelatedQueries()
|
||||
{
|
||||
return {};
|
||||
}
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -17,7 +17,7 @@ SignatureUsages buildSignatureUsages(const WorkloadQueries & queries, ContextPtr
|
|||
SignatureUsages signature_usages;
|
||||
for (const auto & query : queries)
|
||||
{
|
||||
const auto & plan = query->getPlan();
|
||||
const auto & plan = query->getPlanBeforeCascades();
|
||||
PlanSignatureProvider provider(plan->getCTEInfo(), context);
|
||||
auto plan_signatures = provider.computeSignatures(plan->getPlanNode());
|
||||
for (const auto & [plan_node, signature] : plan_signatures)
|
||||
|
|
|
@ -92,12 +92,23 @@ WorkloadQueryPtr WorkloadQuery::build(const std::string & query_id, const std::s
|
|||
context->applySettingsChanges(
|
||||
{DB::SettingChange("enable_sharding_optimize", "true"), // for colocated join
|
||||
DB::SettingChange("enable_runtime_filter", "false"), // for calculating signature
|
||||
DB::SettingChange("enable_optimzier", "true")});
|
||||
DB::SettingChange("enable_optimzier", "true"),
|
||||
DB::SettingChange("cte_mode", "INLINED")}); // for materialized view
|
||||
context->createPlanNodeIdAllocator();
|
||||
context->createSymbolAllocator();
|
||||
context->createOptimizerMetrics();
|
||||
context->makeQueryContext();
|
||||
|
||||
if (context->getSettingsRef().print_graphviz)
|
||||
{
|
||||
std::stringstream path;
|
||||
path << context->getSettingsRef().graphviz_path.toString();
|
||||
path << "/" << query_id << ".sql";
|
||||
std::ofstream out(path.str());
|
||||
out << query;
|
||||
out.close();
|
||||
}
|
||||
|
||||
// parse and plan
|
||||
const char * begin = query.data();
|
||||
const char * end = begin + query.size();
|
||||
|
@ -120,6 +131,7 @@ WorkloadQueryPtr WorkloadQuery::build(const std::string & query_id, const std::s
|
|||
|
||||
CardinalityEstimator::estimate(*query_plan, context);
|
||||
PlanCostMap costs = calculateCost(*query_plan, *context);
|
||||
|
||||
return std::make_unique<WorkloadQuery>(
|
||||
context, query_id, query, std::move(query_plan), std::move(plan_before_cascades), std::move(query_tables), std::move(costs));
|
||||
}
|
||||
|
@ -140,11 +152,10 @@ WorkloadQueries WorkloadQuery::build(const std::vector<std::string> & queries, c
|
|||
{
|
||||
WorkloadQueryPtr workload_query = build("q" + std::to_string(i), query, from_context);
|
||||
res[i] = std::move(workload_query);
|
||||
} catch (Exception & e)
|
||||
} catch (...)
|
||||
{
|
||||
LOG_WARNING(getLogger("WorkloadQuery"),
|
||||
"failed to build query, reason: {}, sql: {}",
|
||||
e.message(), query);
|
||||
LOG_WARNING(getLogger("WorkloadQuery"),"failed to build query, reason: {}, sql: {}",
|
||||
getCurrentExceptionMessage(true), query);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -156,7 +167,7 @@ WorkloadQueries WorkloadQuery::build(const std::vector<std::string> & queries, c
|
|||
|
||||
double WorkloadQuery::getOptimalCost(const TableLayout & table_layout)
|
||||
{
|
||||
if (!root_group)
|
||||
if (!root_group)
|
||||
{
|
||||
cascades_context = std::make_shared<CascadesContext>(
|
||||
query_context,
|
||||
|
@ -180,6 +191,7 @@ if (!root_group)
|
|||
GroupId root_group_id = root_group->getGroupId();
|
||||
CascadesOptimizer::optimize(root_group_id, *cascades_context, required_property);
|
||||
auto res = cascades_context->getMemo().getGroupById(root_group_id)->getBestExpression(required_property)->getCost();
|
||||
|
||||
GraphvizPrinter::printMemo(cascades_context->getMemo(), root_group_id, query_context, "CascadesOptimizer-Memo-Graph");
|
||||
return res;
|
||||
}
|
||||
|
|
|
@ -49,6 +49,7 @@ public:
|
|||
}
|
||||
const std::string & getSQL() const { return sql; }
|
||||
const QueryPlanPtr & getPlan() const { return plan; }
|
||||
const QueryPlanPtr & getPlanBeforeCascades() const { return plan_before_cascades; }
|
||||
const PlanCostMap & getCosts() const { return costs; }
|
||||
|
||||
/*
|
||||
|
|
|
@ -48,6 +48,8 @@ public:
|
|||
{
|
||||
}
|
||||
|
||||
StoragePtr getTablePtr() const { return storage; }
|
||||
|
||||
ASTPtr getDDL() const { return create_table_ddl; }
|
||||
bool isOptimized() const { return optimized; }
|
||||
|
||||
|
|
|
@ -37,8 +37,7 @@ public:
|
|||
const String & table_name,
|
||||
const NamesAndTypesList & columns);
|
||||
|
||||
PlanNodeStatisticsPtr basic_stats;
|
||||
WorkloadExtendedStatsPtr extended_stats;
|
||||
PlanNodeStatisticsPtr getBasicStats() { return basic_stats; }
|
||||
|
||||
private:
|
||||
explicit WorkloadTableStats(PlanNodeStatisticsPtr basic_stats_)
|
||||
|
@ -47,6 +46,9 @@ private:
|
|||
{
|
||||
}
|
||||
|
||||
PlanNodeStatisticsPtr basic_stats;
|
||||
WorkloadExtendedStatsPtr extended_stats;
|
||||
|
||||
static const char * getStatsAggregation(const WorkloadExtendedStatsType & type)
|
||||
{
|
||||
switch (type)
|
||||
|
|
|
@ -19,14 +19,16 @@ public:
|
|||
" deptno UInt32 not null,"
|
||||
" name Nullable(String),"
|
||||
" salary Nullable(Float64),"
|
||||
" commission Nullable(UInt32)"
|
||||
") ENGINE=CnchMergeTree() order by empid;");
|
||||
" commission Nullable(UInt32),"
|
||||
" history Array(UInt32)"
|
||||
") ENGINE=Memory();");
|
||||
tester->execute("CREATE TABLE IF NOT EXISTS depts("
|
||||
" deptno UInt32 not null,"
|
||||
" name Nullable(String)"
|
||||
") ENGINE=CnchMergeTree() order by deptno;");
|
||||
") ENGINE=Memory();");
|
||||
}
|
||||
|
||||
static void TearDownTestCase() { tester.reset(); }
|
||||
|
||||
ColumnUsages buildColumnUsagesFromSQL(std::initializer_list<std::string> sql_list)
|
||||
{
|
||||
|
@ -72,7 +74,7 @@ TEST_F(ColumnUsageTest, testSelect)
|
|||
"select empid from emps"});
|
||||
auto select_usages = getColumnFrequencies(column_usages, ColumnUsageType::SCANNED, true);
|
||||
auto empid_column = QualifiedColumnName{tester->getDatabaseName(), "emps", "empid"};
|
||||
EXPECT_EQ(select_usages.size(), 5);
|
||||
EXPECT_EQ(select_usages.size(), 6);
|
||||
ASSERT_TRUE(select_usages.contains(empid_column));
|
||||
EXPECT_EQ(select_usages[empid_column], 2);
|
||||
}
|
||||
|
@ -106,9 +108,9 @@ TEST_F(ColumnUsageTest, testNestedJoin)
|
|||
|
||||
TEST_F(ColumnUsageTest, testNestedJoinCountAll)
|
||||
{
|
||||
tester->execute("CREATE TABLE IF NOT EXISTS A(a UInt32 not null, b UInt32 not null) ENGINE=CnchMergeTree() order by tuple();");
|
||||
tester->execute("CREATE TABLE IF NOT EXISTS B(b UInt32 not null, c UInt32 not null) ENGINE=CnchMergeTree() order by tuple();");
|
||||
tester->execute("CREATE TABLE IF NOT EXISTS C(c UInt32 not null, d UInt32 not null) ENGINE=CnchMergeTree() order by tuple();");
|
||||
tester->execute("CREATE TABLE IF NOT EXISTS A(a UInt32 not null, b UInt32 not null) ENGINE=Memory();");
|
||||
tester->execute("CREATE TABLE IF NOT EXISTS B(b UInt32 not null, c UInt32 not null) ENGINE=Memory();");
|
||||
tester->execute("CREATE TABLE IF NOT EXISTS C(c UInt32 not null, d UInt32 not null) ENGINE=Memory();");
|
||||
|
||||
auto column_usages = buildColumnUsagesFromSQL({"select * from A, B, C where A.b = B.b and B.c = C.c"});
|
||||
|
||||
|
@ -161,4 +163,27 @@ TEST_F(ColumnUsageTest, testInFilter)
|
|||
EXPECT_EQ(in_usages[empid_column], 1);
|
||||
}
|
||||
|
||||
TEST_F(ColumnUsageTest, testArraySetFunction)
|
||||
{
|
||||
auto column_usages = buildColumnUsagesFromSQL({"select if(arraySetCheck(history, (9000)), 'hint', 'miss') from emps "
|
||||
"where has(history, 9000) and arraySetCheck(history, (9000)) = 1"});
|
||||
auto history_column = QualifiedColumnName{tester->getDatabaseName(), "emps", "history"};
|
||||
|
||||
auto array_set_usages = getColumnFrequencies(column_usages, ColumnUsageType::ARRAY_SET_FUNCTION, true);
|
||||
ASSERT_TRUE(array_set_usages.contains(history_column));
|
||||
EXPECT_GE(array_set_usages[history_column], 2);
|
||||
}
|
||||
|
||||
TEST_F(ColumnUsageTest, testPrewhere)
|
||||
{
|
||||
auto column_usages = buildColumnUsagesFromSQL({"select empid from emps "
|
||||
"prewhere arraySetCheck(history, (9000)) = 1 where empid in (1,2,3)"});
|
||||
auto history_column = QualifiedColumnName{tester->getDatabaseName(), "emps", "history"};
|
||||
|
||||
auto array_set_usages = getColumnFrequencies(column_usages, ColumnUsageType::ARRAY_SET_FUNCTION, true);
|
||||
ASSERT_TRUE(array_set_usages.contains(history_column));
|
||||
EXPECT_GE(array_set_usages[history_column], 1);
|
||||
}
|
||||
|
||||
|
||||
} // namespace DB
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#include <Advisor/Rules/ClusterKeyAdvise.h>
|
||||
#include <Advisor/Rules/OrderByKeyAdvise.h>
|
||||
|
||||
#include <Advisor/Rules/WorkloadAdvisor.h>
|
||||
#include <Advisor/WorkloadQuery.h>
|
||||
|
@ -11,7 +11,7 @@
|
|||
|
||||
using namespace DB;
|
||||
|
||||
class ClusterKeyTest : public ::testing::Test
|
||||
class OrderByKeyTest : public ::testing::Test
|
||||
{
|
||||
public:
|
||||
static void SetUpTestSuite()
|
||||
|
@ -23,24 +23,24 @@ public:
|
|||
" name Nullable(String),"
|
||||
" salary Nullable(Float64),"
|
||||
" commission Nullable(UInt32)"
|
||||
") ENGINE=CnchMergeTree() order by empid;");
|
||||
") ENGINE=Memory();");
|
||||
tester->execute("CREATE TABLE IF NOT EXISTS depts("
|
||||
" deptno UInt32 not null,"
|
||||
" name Nullable(String)"
|
||||
") ENGINE=CnchMergeTree() order by deptno;");
|
||||
") ENGINE=Memory();");
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
{
|
||||
tester.reset();
|
||||
}
|
||||
|
||||
|
||||
static std::shared_ptr<BaseWorkloadTest> tester;
|
||||
};
|
||||
|
||||
std::shared_ptr<BaseWorkloadTest> ClusterKeyTest::tester;
|
||||
std::shared_ptr<BaseWorkloadTest> OrderByKeyTest::tester;
|
||||
|
||||
TEST_F(ClusterKeyTest, testSimple)
|
||||
TEST_F(OrderByKeyTest, testSimple)
|
||||
{
|
||||
auto context = tester->createQueryContext();
|
||||
std::vector<std::string> sqls(
|
||||
|
@ -49,17 +49,17 @@ TEST_F(ClusterKeyTest, testSimple)
|
|||
WorkloadQueries queries = WorkloadQuery::build(sqls, context, query_thread_pool);
|
||||
WorkloadTables tables(context);
|
||||
AdvisorContext advisor_context = AdvisorContext::buildFrom(context, tables, queries, query_thread_pool);
|
||||
auto advise = ClusterKeyAdvisor().analyze(advisor_context);
|
||||
auto advise = OrderByKeyAdvisor().analyze(advisor_context);
|
||||
EXPECT_EQ(advise.size(), 1);
|
||||
QualifiedTableName emps{tester->getDatabaseName(), "emps"};
|
||||
EXPECT_EQ(advise[0]->getTable(), emps);
|
||||
EXPECT_EQ(advise[0]->getOptimizedValue(), "empid");
|
||||
}
|
||||
|
||||
TEST_F(ClusterKeyTest, testUpdateOrderBy)
|
||||
TEST_F(OrderByKeyTest, testUpdateOrderBy)
|
||||
{
|
||||
std::string database = tester->getDatabaseName();
|
||||
std::string create_table_ddl = "CREATE TABLE IF NOT EXISTS " + database
|
||||
std::string table_ddl = "CREATE TABLE IF NOT EXISTS " + database
|
||||
+ ".emps("
|
||||
" empid UInt32 not null,"
|
||||
" deptno UInt32 not null,"
|
||||
|
@ -70,13 +70,14 @@ TEST_F(ClusterKeyTest, testUpdateOrderBy)
|
|||
"order by deptno;";
|
||||
|
||||
auto query_context = tester->createQueryContext();
|
||||
auto create_ast = tester->parse(create_table_ddl, query_context);
|
||||
query_context->applySettingsChanges({DB::SettingChange("dialect_type", "CLICKHOUSE")});
|
||||
auto create_ast = tester->parse(table_ddl, query_context);
|
||||
WorkloadTable table(nullptr, create_ast, WorkloadTableStats::build(query_context, tester->getDatabaseName(), "emps"));
|
||||
table.updateOrderBy(std::make_shared<ASTIdentifier>("empid"));
|
||||
|
||||
std::string optimal_ddl = serializeAST(*table.getDDL());
|
||||
std::cout << optimal_ddl << std::endl;
|
||||
EXPECT_TRUE(optimal_ddl.find("ORDER BY deptno") == std::string::npos);
|
||||
EXPECT_TRUE(optimal_ddl.find("ORDER BY empid") != std::string::npos);
|
||||
std::string local_ddl = serializeAST(*table.getDDL());
|
||||
std::cout << local_ddl << std::endl;
|
||||
EXPECT_TRUE(local_ddl.find("ORDER BY deptno") == std::string::npos);
|
||||
EXPECT_TRUE(local_ddl.find("ORDER BY empid") != std::string::npos);
|
||||
}
|
||||
|
File diff suppressed because it is too large
|
@ -22,6 +22,12 @@
|
|||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
}
|
||||
|
||||
/// Expected format is 'P.I' or 'I', where P means the position
|
||||
/// and I means the index of the argument
|
||||
PositionIndexPair parsePositionAndIndex(String & input)
|
||||
|
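parsePositionAndIndex is only declared at the end of the hunk above, and the new comment documents its 'P.I' / 'I' input format. The sketch below is purely illustrative, not the actual implementation; it assumes the 0xFF "no position" convention that later hunks in this file use:

```cpp
#include <cassert>
#include <cstdint>
#include <string>
#include <utility>

using PositionIndexPair = std::pair<uint64_t, uint64_t>;

// Hypothetical parser for the documented format: "P.I" carries a position,
// a bare "I" does not (0xFF marks "no position identifier").
PositionIndexPair parsePositionAndIndexSketch(const std::string & input)
{
    auto dot = input.find('.');
    if (dot == std::string::npos)
        return {0xFF, std::stoull(input)};           // 'I' only
    return {std::stoull(input.substr(0, dot)),       // 'P' part
            std::stoull(input.substr(dot + 1))};     // 'I' part
}

int main()
{
    assert(parsePositionAndIndexSketch("1.3") == (PositionIndexPair{1, 3}));
    assert(parsePositionAndIndexSketch("4") == (PositionIndexPair{0xFF, 4}));
}
```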
@ -52,15 +58,15 @@ namespace DB
|
|||
for (size_t i = 0; i < arr.size(); ++i)
|
||||
{
|
||||
if (arr.at(i).safeGet<String>().empty())
|
||||
throw Exception("AggregateFunction " + name + ": empty string in parameter is invalid", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction " + name + ": empty string in parameter is invalid", ErrorCodes::BAD_ARGUMENTS);
|
||||
UInt64 pos = 0, idx = 0;
|
||||
std::tie(pos, idx) = parsePositionAndIndex(arr.at(i).safeGet<String>());
|
||||
if (pos == 0 || ((pos^0xFF) && pos > union_num+1))
|
||||
{
|
||||
throw Exception("AggregateFunction " + name + ": wrong value of keys postion identifier, which starts from 1", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction " + name + ": wrong value of keys postion identifier, which starts from 1", ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
if (idx < 3 || idx > argument_num)
|
||||
throw Exception("AggregateFunction " + name + ": wrong value of key index, which starts from 3", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction " + name + ": wrong value of key index, which starts from 3", ErrorCodes::BAD_ARGUMENTS);
|
||||
to.emplace_back(pos, idx);
|
||||
}
|
||||
}
|
||||
|
@ -70,7 +76,7 @@ namespace DB
|
|||
{
|
||||
UInt64 idx = arr.at(i).safeGet<UInt64>();
|
||||
if (idx < 3 || idx > argument_num)
|
||||
throw Exception("AggregateFunction " + name + ": wrong value of key index", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction " + name + ": wrong value of key index", ErrorCodes::BAD_ARGUMENTS);
|
||||
to.emplace_back(0xFF, idx);
|
||||
}
|
||||
}
|
||||
|
@ -93,11 +99,11 @@ namespace
|
|||
/// 6 params are: (union_num, [join_keys], [group_by_keys], bitmap_op, join_type, thread_number, 0), the last 0 means the result is a cardinality
|
||||
/// 7 params are: (union_num, [join_keys], [group_by_keys], bitmap_op, join_type, thread_number, result_type) result_type: 0->cardinality, 1->raw bitmap
|
||||
if (parameters.size() != 3 && parameters.size() != 5 && parameters.size() != 6 && parameters.size() != 7)
|
||||
throw Exception("AggregateFunction " + name + " needs 3, 5, 6 or 7 parameters", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction " + name + " needs 3, 5, 6 or 7 parameters", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
UInt64 union_num = parameters[0].safeGet<UInt64>();
|
||||
if (union_num != 1)
|
||||
throw Exception("AggregateFunction " + name + " can only support one JOIN now, set 1 please", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction " + name + " can only support one JOIN now, set 1 please", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
Array join_arr = parameters[1].safeGet<Array>();
|
||||
Array group_by_arr = parameters[2].safeGet<Array>();
|
||||
|
@ -110,7 +116,7 @@ namespace
|
|||
keys_set.emplace(jk.second);
|
||||
}
|
||||
if (keys_set.size() != join_keys_idx.size())
|
||||
throw Exception("AggregateFunction " + name + ": duplicated join key index, only one is ok", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction " + name + ": duplicated join key index, only one is ok", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
getParameterOfPositionAndIndex(group_by_arr, name, union_num, argument_types.size(), group_by_keys_idx);
|
||||
|
||||
|
@ -125,12 +131,12 @@ namespace
|
|||
if (group_by_keys_idx[i] == group_by_keys_idx[j] ||
|
||||
(group_by_keys_idx[i].second == group_by_keys_idx[j].second
|
||||
&& (group_by_keys_idx[i].first == 0xFF || group_by_keys_idx[j].first == 0xFF)))
|
||||
throw Exception("AggregateFunction " + name + ": duplicated group by index", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction " + name + ": duplicated group by index", ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
}
|
||||
|
||||
String logic_str, join_str;
|
||||
if (parameters.size() == 5 || parameters.size() == 6)
|
||||
if (parameters.size() >= 5)
|
||||
{
|
||||
logic_str = parameters[3].safeGet<String>();
|
||||
join_str = parameters[4].safeGet<String>();
|
||||
|
@ -140,16 +146,16 @@ namespace
|
|||
if (!logic_op.isValid())
|
||||
throw Exception(
|
||||
"AggregateFunction " + name + " only support logic operation: AND, OR, XOR, besides empty string is also ok",
|
||||
DB::ErrorCodes::LOGICAL_ERROR);
|
||||
DB::ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
JoinOperation join_op(join_str);
|
||||
if (!join_op.isValid())
|
||||
throw Exception(
|
||||
"AggregateFunction " + name + " only support join type: INNER, LEFT. And empty string means INNER JOIN",
|
||||
DB::ErrorCodes::LOGICAL_ERROR);
|
||||
DB::ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
UInt64 thread_num = 32;
|
||||
if (parameters.size() == 6)
|
||||
if (parameters.size() >= 6)
|
||||
{
|
||||
thread_num = parameters[5].safeGet<UInt64>();
|
||||
}
|
||||
|
@ -160,19 +166,19 @@ namespace
|
|||
result_type = parameters[6].safeGet<UInt64>();
|
||||
}
|
||||
if (result_type != 0 && result_type != 1)
|
||||
throw Exception("AggregateFunction " + name + " only support result_type: 0, 1", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction " + name + " only support result_type: 0, 1", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
if (!WhichDataType(argument_types[0]).isUInt8())
|
||||
throw Exception("AggregateFunction " + name + " needs Int type for its first argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs Int type for its first argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
if (!isBitmap64(argument_types[1]))
|
||||
throw Exception(
|
||||
"AggregateFunction " + name + " needs BitMap64 type for its second argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
"AggregateFunction " + name + " needs BitMap64 type for its second argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
for (size_t i = 2; i < argument_types.size(); ++i)
|
||||
{
|
||||
if (!isString(argument_types[i]))
|
||||
throw Exception("AggregateFunction " + name + " needs String type", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs String type", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
}
|
||||
|
||||
return std::make_shared<AggregateFunctionBitMapJoin>(argument_types, union_num, join_keys_idx, group_by_keys_idx, logic_op, join_op, thread_num, result_type);
|
||||
|
|
|
@ -45,7 +45,6 @@ namespace DB
|
|||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
}
|
||||
|
||||
|
@ -62,34 +61,34 @@ enum LogicOperationType
|
|||
|
||||
struct LogicOperation
|
||||
{
|
||||
LogicOperation() : logicOp(LogicOperationType::NONE) {}
|
||||
LogicOperation(String operation)
|
||||
LogicOperation() : logic_op(LogicOperationType::NONE) {}
|
||||
explicit LogicOperation(String operation)
|
||||
{
|
||||
std::transform(operation.begin(), operation.end(), operation.begin(), ::toupper);
|
||||
if (operation == "NONE" || operation.empty())
|
||||
logicOp = LogicOperationType::NONE;
|
||||
logic_op = LogicOperationType::NONE;
|
||||
else if (operation == "AND")
|
||||
logicOp = LogicOperationType::AND;
|
||||
logic_op = LogicOperationType::AND;
|
||||
else if (operation == "OR")
|
||||
logicOp = LogicOperationType::OR;
|
||||
logic_op = LogicOperationType::OR;
|
||||
else if (operation == "XOR")
|
||||
logicOp = LogicOperationType::XOR;
|
||||
logic_op = LogicOperationType::XOR;
|
||||
else if (operation == "ANDNOT")
|
||||
logicOp = LogicOperationType::ANDNOT;
|
||||
logic_op = LogicOperationType::ANDNOT;
|
||||
else if (operation == "RANDNOT" || operation == "REVERSEANDNOT")
|
||||
logicOp = LogicOperationType::REVERSEANDNOT;
|
||||
logic_op = LogicOperationType::REVERSEANDNOT;
|
||||
else
|
||||
logicOp = LogicOperationType::UNDEFINED;
|
||||
logic_op = LogicOperationType::UNDEFINED;
|
||||
}
|
||||
|
||||
LogicOperation(const LogicOperation & rhs)
|
||||
{
|
||||
this->logicOp = rhs.logicOp;
|
||||
this->logic_op = rhs.logic_op;
|
||||
}
|
||||
|
||||
bool isValid() { return logicOp < LogicOperationType::UNDEFINED; }
|
||||
bool isValid() const { return logic_op < LogicOperationType::UNDEFINED; }
|
||||
|
||||
LogicOperationType logicOp;
|
||||
LogicOperationType logic_op;
|
||||
};
|
||||
|
||||
enum JoinType
|
||||
|
@ -101,37 +100,39 @@ enum JoinType
|
|||
|
||||
struct JoinOperation
|
||||
{
|
||||
JoinOperation() : joinOp(JoinType::INNER) {}
|
||||
JoinOperation(String operation)
|
||||
JoinOperation() : join_op(JoinType::INNER) {}
|
||||
explicit JoinOperation(String operation)
|
||||
{
|
||||
std::transform(operation.begin(), operation.end(), operation.begin(), ::toupper);
|
||||
if (operation.empty() || operation == "INNER")
|
||||
joinOp = JoinType::INNER;
|
||||
join_op = JoinType::INNER;
|
||||
else if (operation == "LEFT")
|
||||
joinOp = JoinType::LEFT;
|
||||
join_op = JoinType::LEFT;
|
||||
else
|
||||
joinOp = JoinType::INVALID;
|
||||
join_op = JoinType::INVALID;
|
||||
}
|
||||
|
||||
bool isValid() { return joinOp < JoinType::INVALID; }
|
||||
bool isValid() const { return join_op < JoinType::INVALID; }
|
||||
|
||||
JoinType joinOp;
|
||||
JoinType join_op;
|
||||
};
|
||||
|
||||
using JoinKeys = Strings;
|
||||
using GroupByKeys = Strings;
|
||||
using Position = UInt8;
|
||||
using BitMapPtr = std::shared_ptr<BitMap64>;
|
||||
using JoinTuple = std::tuple<JoinKeys, GroupByKeys, BitMapPtr>;
|
||||
using JoinKeysPtr = std::shared_ptr<JoinKeys>;
|
||||
using GroupByKeysPtr = std::shared_ptr<GroupByKeys>;
|
||||
using JoinTuple = std::tuple<JoinKeysPtr, GroupByKeysPtr, BitMapPtr>;
|
||||
using JoinTuplePtr = std::shared_ptr<JoinTuple>;
|
||||
using JoinTuplePtrs = std::vector<JoinTuplePtr>;
|
||||
using PositionIndexPair = std::pair<UInt64, UInt64>;
|
||||
|
||||
void writeStrings(const Strings & data, WriteBuffer & buf)
|
||||
void writeStrings(const std::shared_ptr<Strings> & data, WriteBuffer & buf)
|
||||
{
|
||||
size_t size = data.size();
|
||||
size_t size = data->size();
|
||||
writeVarUInt(size, buf);
|
||||
for (auto & key : data)
|
||||
for (auto & key : *data)
|
||||
writeString(key.data(), key.size(), buf);
|
||||
}
|
||||
|
||||
|
@ -151,21 +152,24 @@ void readStrings(Strings & data, ReadBuffer & buf)
|
|||
// The key used to hash the join keys or group by keys
|
||||
struct StringsMapKey
|
||||
{
|
||||
Strings keys;
|
||||
std::shared_ptr<Strings> keys;
|
||||
|
||||
StringsMapKey() = default;
|
||||
StringsMapKey(String & key_) : keys{key_} {}
|
||||
StringsMapKey(Strings && keys_) : keys(std::move(keys_)) {}
|
||||
StringsMapKey(const Strings && keys_) : keys(std::move(keys_)) {}
|
||||
explicit StringsMapKey(String & key_)
|
||||
{
|
||||
Strings strs{ key_ };
|
||||
keys = std::make_shared<Strings>(std::move(strs));
|
||||
}
|
||||
explicit StringsMapKey(std::shared_ptr<Strings> && keyPtr) : keys(std::move(keyPtr)) {}
|
||||
|
||||
bool operator==(const StringsMapKey & rhs) const
|
||||
{
|
||||
if (keys.size() != rhs.keys.size())
|
||||
if (keys->size() != rhs.keys->size())
|
||||
return false;
|
||||
|
||||
for (size_t i = 0; i < keys.size(); ++i)
|
||||
for (size_t i = 0; i < keys->size(); ++i)
|
||||
{
|
||||
if (keys.at(i) != rhs.keys.at(i))
|
||||
if (keys->at(i) != rhs.keys->at(i))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -176,12 +180,12 @@ struct HashStringsMapKey
|
|||
{
|
||||
size_t operator()(const StringsMapKey & one) const
|
||||
{
|
||||
if (one.keys.empty())
|
||||
if (one.keys->empty())
|
||||
return std::hash<String>()("");
|
||||
|
||||
size_t res = std::hash<String>()(one.keys.at(0));
|
||||
for (size_t i = 1; i < one.keys.size(); ++i)
|
||||
res ^= std::hash<String>()(one.keys.at(i)) >> i;
|
||||
size_t res = std::hash<String>()(one.keys->at(0));
|
||||
for (size_t i = 1; i < one.keys->size(); ++i)
|
||||
res ^= std::hash<String>()(one.keys->at(i)) >> i;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
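The two hunks above move StringsMapKey and HashStringsMapKey to hold the key strings behind a shared_ptr. A self-contained sketch of the same key/equality/hash trio, with std::vector<std::string> standing in for DB::Strings (all names below are illustrative):

```cpp
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct Key
{
    std::shared_ptr<std::vector<std::string>> keys;

    bool operator==(const Key & rhs) const
    {
        // Compare the pointed-to string vectors element by element.
        return keys && rhs.keys && *keys == *rhs.keys;
    }
};

struct KeyHash
{
    size_t operator()(const Key & k) const
    {
        if (!k.keys || k.keys->empty())
            return std::hash<std::string>()("");
        size_t res = std::hash<std::string>()(k.keys->at(0));
        for (size_t i = 1; i < k.keys->size(); ++i)
            res ^= std::hash<std::string>()(k.keys->at(i)) >> i; // same XOR/shift combine as above
        return res;
    }
};

int main()
{
    std::unordered_map<Key, int, KeyHash> counts;
    Key k{std::make_shared<std::vector<std::string>>(std::vector<std::string>{"dept", "2024"})};
    ++counts[k]; // the map only copies the shared_ptr, not the strings
}
```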
@ -225,14 +229,14 @@ public:
|
|||
// There is no lock; we just do this in a single thread
|
||||
void getAllKeyValueByResultType(ColumnTuple & tuple_in_array, size_t result_type)
|
||||
{
|
||||
for (auto it = m_map.begin(); it != m_map.end(); ++it)
|
||||
for (auto & it : m_map)
|
||||
{
|
||||
BitMapPtr bitmap_ptr = std::get<2>(*(it->second.at(0)));
|
||||
size_t key_size = it->first.keys.size();
|
||||
BitMapPtr bitmap_ptr = std::get<2>(*(it.second.at(0)));
|
||||
size_t key_size = it.first.keys->size();
|
||||
for (size_t i = 0; i < key_size; ++i)
|
||||
{
|
||||
auto & column_group_by = static_cast<ColumnString &>(tuple_in_array.getColumn(i));
|
||||
column_group_by.insert(it->first.keys.at(i));
|
||||
column_group_by.insert(it.first.keys->at(i));
|
||||
}
|
||||
if (result_type == 0)
|
||||
{
|
||||
|
@ -256,42 +260,47 @@ private:
|
|||
class KVSharded
|
||||
{
|
||||
public:
|
||||
KVSharded(size_t num_shard) : m_mask(num_shard - 1), m_shards(num_shard)
|
||||
explicit KVSharded(size_t num_shard) : m_mask(num_shard - 1), m_shards(num_shard)
|
||||
{
|
||||
if ((num_shard & m_mask) != 0)
|
||||
throw Exception("num_shard should be a power of two", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("num_shard should be a power of two", ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
|
||||
KVSharded(KVSharded && rhs) : m_mask(std::move(rhs.m_mask)), m_shards(std::move(rhs.m_shards)) {}
|
||||
void operator=(KVSharded && rhs)
|
||||
KVSharded& operator=(KVSharded && rhs)
|
||||
{
|
||||
m_shards = std::move(rhs.m_shards);
|
||||
if (this != &rhs) // Optional: Check for self-assignment
|
||||
{
|
||||
m_mask = std::move(rhs.m_mask);
|
||||
m_shards = std::move(rhs.m_shards);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
void put(const StringsMapKey & key, const JoinTuplePtrs & value)
|
||||
{
|
||||
get_shard(key).emplaceKVOrAddValue(std::move(key), std::move(value));
|
||||
getShard(key).emplaceKVOrAddValue(std::move(key), std::move(value));
|
||||
}
|
||||
|
||||
std::optional<JoinTuplePtrs> get(const StringsMapKey & key)
|
||||
{
|
||||
return get_shard(key).get(key);
|
||||
return getShard(key).get(key);
|
||||
}
|
||||
|
||||
/// It's used in insertIntoResult function, by a single thread
|
||||
void writeResultOfKeyAndValue(ColumnTuple & tuple_in_array, size_t result_type)
|
||||
{
|
||||
for (auto it = m_shards.begin(); it != m_shards.end(); ++it)
|
||||
for (auto & m_shard : m_shards)
|
||||
{
|
||||
it->getAllKeyValueByResultType(tuple_in_array, result_type);
|
||||
m_shard.getAllKeyValueByResultType(tuple_in_array, result_type);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
const size_t m_mask;
|
||||
size_t m_mask;
|
||||
std::vector<KVBigLock> m_shards;
|
||||
|
||||
KVBigLock & get_shard(const StringsMapKey & key)
|
||||
KVBigLock & getShard(const StringsMapKey & key)
|
||||
{
|
||||
HashStringsMapKey hash_fn;
|
||||
size_t h = hash_fn(key);
|
||||
|
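KVSharded above picks a shard by masking the key hash with num_shard - 1, which is why its constructor insists on a power-of-two shard count. A small stand-alone sketch of that layout, with a std::mutex per shard in place of KVBigLock (names and types simplified; not ByConity's classes):

```cpp
#include <mutex>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

class ShardedCounter
{
public:
    explicit ShardedCounter(size_t num_shards) : mask(num_shards - 1), shards(num_shards)
    {
        // mask-based shard selection only works when num_shards is a power of two
        if ((num_shards & mask) != 0)
            throw std::invalid_argument("num_shards should be a power of two");
    }

    void add(const std::string & key, int delta)
    {
        Shard & s = shard(key);
        std::lock_guard<std::mutex> lock(s.m); // only this shard is locked
        s.data[key] += delta;
    }

private:
    struct Shard
    {
        std::mutex m;
        std::unordered_map<std::string, int> data;
    };

    Shard & shard(const std::string & key) { return shards[std::hash<std::string>()(key) & mask]; }

    const size_t mask;
    std::vector<Shard> shards;
};

int main()
{
    ShardedCounter counter(128);
    counter.add("group-a", 1);
}
```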
@ -300,31 +309,37 @@ private:
|
|||
};
|
||||
|
||||
/// It's used to accommodate user input data, and data is grouped by join keys
|
||||
struct PositionTuples
|
||||
struct JoinPositionTuples
|
||||
{
|
||||
Position position;
|
||||
HashedStringsKeyTuples tuples; // The key used here is join key
|
||||
|
||||
PositionTuples() = default;
|
||||
PositionTuples(Position pos) : position(pos) {}
|
||||
PositionTuples(const PositionTuples & rhs) : position(rhs.position), tuples(rhs.tuples) {}
|
||||
PositionTuples(PositionTuples && rhs) : position(rhs.position), tuples(std::move(rhs.tuples)) {}
|
||||
PositionTuples(Position && pos, StringsMapKey && join_keys, JoinTuplePtr && val)
|
||||
JoinPositionTuples() = default;
|
||||
explicit JoinPositionTuples(Position pos) : position(pos) {}
|
||||
JoinPositionTuples(const JoinPositionTuples & rhs) = default;
|
||||
JoinPositionTuples(JoinPositionTuples && rhs) : position(rhs.position), tuples(std::move(rhs.tuples)) {}
|
||||
JoinPositionTuples(Position && pos, StringsMapKey && join_keys, JoinTuplePtr && val)
|
||||
: position(std::move(pos)), tuples{{std::move(join_keys), JoinTuplePtrs{val}}} {}
|
||||
|
||||
void operator=(const PositionTuples & rhs)
|
||||
JoinPositionTuples& operator=(const JoinPositionTuples & rhs)
|
||||
{
|
||||
this->position = rhs.position;
|
||||
this->tuples = rhs.tuples;
|
||||
if (this != &rhs) { // Check for self-assignment
|
||||
this->position = rhs.position;
|
||||
this->tuples = rhs.tuples;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
void operator=(const PositionTuples && rhs)
|
||||
JoinPositionTuples& operator=(JoinPositionTuples && rhs)
|
||||
{
|
||||
this->position = std::move(rhs.position);
|
||||
this->tuples = std::move(rhs.tuples);
|
||||
if (this != &rhs) { // Check for self-assignment
|
||||
this->position = std::move(rhs.position);
|
||||
this->tuples = std::move(rhs.tuples);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
void emplace_back(StringsMapKey && join_key, JoinTuplePtrs && value)
|
||||
void emplaceBack(StringsMapKey && join_key, JoinTuplePtrs && value)
|
||||
{
|
||||
auto it = this->tuples.find(join_key);
|
||||
if (it == this->tuples.end())
|
||||
|
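The hunk above renames PositionTuples to JoinPositionTuples and gives its assignment operators the conventional shape: guard against self-assignment and return *this. A tiny stand-alone illustration of that pattern with simplified members:

```cpp
#include <string>
#include <utility>
#include <vector>

struct Tuples
{
    int position = 0;
    std::vector<std::string> tuples;

    Tuples & operator=(const Tuples & rhs)
    {
        if (this != &rhs) // self-assignment check
        {
            position = rhs.position;
            tuples = rhs.tuples;
        }
        return *this;
    }

    Tuples & operator=(Tuples && rhs) noexcept
    {
        if (this != &rhs)
        {
            position = rhs.position;
            tuples = std::move(rhs.tuples); // steal the vector's storage
        }
        return *this;
    }
};

int main()
{
    Tuples a, b;
    b.position = 1;
    a = std::move(b); // move assignment
    a = a;            // safely a no-op thanks to the check
}
```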
@ -337,16 +352,16 @@ struct PositionTuples
|
|||
std::make_move_iterator(value.end()));
|
||||
}
|
||||
|
||||
void emplace_back(StringsMapKey && join_key, JoinTuplePtr && value)
|
||||
void emplaceBack(StringsMapKey && join_key, JoinTuplePtr && value)
|
||||
{
|
||||
this->emplace_back(std::move(join_key), JoinTuplePtrs{value});
|
||||
this->emplaceBack(std::move(join_key), JoinTuplePtrs{value});
|
||||
}
|
||||
|
||||
void insert(PositionTuples && rhs)
|
||||
void insert(JoinPositionTuples && rhs)
|
||||
{
|
||||
for (auto rt = rhs.tuples.begin(); rt != rhs.tuples.end(); ++rt)
|
||||
for (auto & tuple : rhs.tuples)
|
||||
{
|
||||
this->emplace_back(std::move(const_cast<StringsMapKey &>(rt->first)), std::move(rt->second));
|
||||
this->emplaceBack(std::move(const_cast<StringsMapKey &>(tuple.first)), std::move(tuple.second));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -355,22 +370,21 @@ struct PositionTuples
|
|||
writeVarUInt(position, buf);
|
||||
size_t map_size = tuples.size();
|
||||
writeVarUInt(map_size, buf);
|
||||
|
||||
for (auto it = tuples.begin(); it != tuples.end(); ++it)
|
||||
for (const auto & tuple : tuples)
|
||||
{
|
||||
writeStrings(it->first.keys, buf);
|
||||
writeStrings(tuple.first.keys, buf);
|
||||
|
||||
size_t tuples_num = it->second.size();
|
||||
size_t tuples_num = tuple.second.size();
|
||||
writeVarUInt(tuples_num, buf);
|
||||
for (auto jt = it->second.begin(); jt != it->second.end(); ++jt)
|
||||
for (const auto & jt : tuple.second)
|
||||
{
|
||||
JoinKeys join_key;
|
||||
GroupByKeys group_by;
|
||||
JoinKeysPtr join_key_ptr;
|
||||
GroupByKeysPtr group_by_ptr;
|
||||
BitMapPtr bitmap_ptr;
|
||||
std::tie(join_key, group_by, bitmap_ptr) = *(*jt);
|
||||
std::tie(join_key_ptr, group_by_ptr, bitmap_ptr) = *jt;
|
||||
|
||||
writeStrings(join_key, buf);
|
||||
writeStrings(group_by, buf);
|
||||
writeStrings(const_cast<const JoinKeysPtr&>(join_key_ptr), buf);
|
||||
writeStrings(const_cast<const GroupByKeysPtr&>(group_by_ptr), buf);
|
||||
|
||||
size_t bytes_size = (*bitmap_ptr).getSizeInBytes();
|
||||
writeVarUInt(bytes_size, buf);
|
||||
|
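The serialize hunk above writes a count, then the join/group-by strings, then a size-prefixed bitmap payload. Below is a simplified, self-contained analogue of round-tripping a vector of strings through a flat buffer; ByConity's WriteBuffer/ReadBuffer and writeVarUInt wire format are deliberately not reproduced:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>

void writeStringsSketch(const std::vector<std::string> & data, std::string & buf)
{
    uint64_t n = data.size();
    buf.append(reinterpret_cast<const char *>(&n), sizeof(n)); // element count
    for (const auto & s : data)
    {
        uint64_t len = s.size();
        buf.append(reinterpret_cast<const char *>(&len), sizeof(len)); // per-string length
        buf.append(s);
    }
}

std::vector<std::string> readStringsSketch(const std::string & buf, size_t & pos)
{
    auto readU64 = [&](uint64_t & v) { std::memcpy(&v, buf.data() + pos, sizeof(v)); pos += sizeof(v); };
    uint64_t n = 0;
    readU64(n);
    std::vector<std::string> out(n);
    for (auto & s : out)
    {
        uint64_t len = 0;
        readU64(len);
        s.assign(buf.data() + pos, len);
        pos += len;
    }
    return out;
}

int main()
{
    std::string buf;
    writeStringsSketch({"empid", "deptno"}, buf);
    size_t pos = 0;
    assert(readStringsSketch(buf, pos) == (std::vector<std::string>{"empid", "deptno"}));
}
```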
@ -415,14 +429,18 @@ struct PositionTuples
|
|||
buf.readStrict(buffer.data(), bytes_size);
|
||||
BitMap64 bitmap = BitMap64::readSafe(buffer.data(), bytes_size);
|
||||
|
||||
tmp_tuple = std::make_tuple(std::move(join_key),
|
||||
std::move(group_by),
|
||||
std::make_shared<BitMap64>(bitmap));
|
||||
JoinKeysPtr join_key_ptr = make_shared<JoinKeys>(join_key);
|
||||
GroupByKeysPtr group_by_ptr = make_shared<GroupByKeys>(group_by);
|
||||
|
||||
tmp_tuple = std::make_tuple(std::move(join_key_ptr),
|
||||
std::move(group_by_ptr),
|
||||
std::make_shared<BitMap64>(bitmap));
|
||||
|
||||
tuples_ptrs.emplace_back(std::make_shared<JoinTuple>(tmp_tuple));
|
||||
}
|
||||
|
||||
this->emplace_back(StringsMapKey(std::move(key)), std::move(tuples_ptrs));
|
||||
std::shared_ptr<Strings> key_ptr = std::make_shared<Strings>(std::move(key));
|
||||
this->emplaceBack(StringsMapKey(std::move(key_ptr)), std::move(tuples_ptrs));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -431,22 +449,23 @@ struct AggregateFunctionBitMapJoinData
|
|||
{
|
||||
AggregateFunctionBitMapJoinData() = default;
|
||||
|
||||
std::vector<PositionTuples> join_tuples_by_position;
|
||||
std::vector<JoinPositionTuples> join_tuples_by_position;
|
||||
|
||||
void add(const Position & pos, const BitMapPtr bitmap_ptr, const JoinKeys & join_keys, GroupByKeys & group_bys, size_t union_num)
|
||||
void add(const Position & pos, const BitMapPtr bitmap_ptr, JoinKeysPtr & join_keys, GroupByKeysPtr & group_bys, size_t union_num)
|
||||
{
|
||||
if (pos > union_num+1)
|
||||
throw Exception("AggregateFunction BitMapJoin: Wrong position value. Position starts from 1 and ends with join_num+1 ",
|
||||
DB::ErrorCodes::LOGICAL_ERROR);
|
||||
DB::ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
StringsMapKey key(std::move(join_keys));
|
||||
JoinKeysPtr cpy(join_keys);
|
||||
StringsMapKey key(std::move(cpy));
|
||||
JoinTuplePtr tuple_ptr{std::make_shared<JoinTuple>(std::make_tuple(std::move(join_keys), std::move(group_bys), std::move(bitmap_ptr)))};
|
||||
|
||||
for (auto & pos_tuples : join_tuples_by_position) // Position value is in a small range, just compare one by one
|
||||
{
|
||||
if (pos-1 == pos_tuples.position) // position starts from 0, but pos from user starts from 1
|
||||
{
|
||||
pos_tuples.emplace_back(std::move(key), std::move(tuple_ptr));
|
||||
pos_tuples.emplaceBack(std::move(key), std::move(tuple_ptr));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
@ -457,7 +476,7 @@ struct AggregateFunctionBitMapJoinData
|
|||
void merge (const AggregateFunctionBitMapJoinData & rhs)
|
||||
{
|
||||
auto & lhs_tuples_by_position = this->join_tuples_by_position;
|
||||
auto & rhs_tuples_by_position = const_cast<std::vector<PositionTuples> &>(rhs.join_tuples_by_position);
|
||||
auto & rhs_tuples_by_position = const_cast<std::vector<JoinPositionTuples> &>(rhs.join_tuples_by_position);
|
||||
|
||||
if (rhs_tuples_by_position.empty())
|
||||
return;
|
||||
|
@ -468,20 +487,20 @@ struct AggregateFunctionBitMapJoinData
|
|||
}
|
||||
|
||||
// Position value is in a small range, just compare one by one
|
||||
for (auto rt = rhs_tuples_by_position.begin(); rt != rhs_tuples_by_position.end(); ++rt)
|
||||
for (auto & rt : rhs_tuples_by_position)
|
||||
{
|
||||
bool pos_exists = false;
|
||||
for (auto lt = lhs_tuples_by_position.begin(); lt != lhs_tuples_by_position.end(); ++lt)
|
||||
for (auto & lt : lhs_tuples_by_position)
|
||||
{
|
||||
if (lt->position == rt->position)
|
||||
if (lt.position == rt.position)
|
||||
{
|
||||
lt->insert(std::move(*rt));
|
||||
lt.insert(std::move(rt));
|
||||
pos_exists = true;
|
||||
}
|
||||
}
|
||||
if (!pos_exists)
|
||||
{
|
||||
lhs_tuples_by_position.emplace_back(std::move(*rt));
|
||||
lhs_tuples_by_position.emplace_back(std::move(rt));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -490,10 +509,9 @@ struct AggregateFunctionBitMapJoinData
|
|||
{
|
||||
size_t position_num = join_tuples_by_position.size();
|
||||
writeVarUInt(position_num, buf);
|
||||
for (auto it = join_tuples_by_position.begin();
|
||||
it != join_tuples_by_position.end(); ++it)
|
||||
for (const auto & it : join_tuples_by_position)
|
||||
{
|
||||
it->serialize(buf);
|
||||
it.serialize(buf);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -504,7 +522,7 @@ struct AggregateFunctionBitMapJoinData
|
|||
|
||||
for (size_t i = 0; i < position_num; ++i)
|
||||
{
|
||||
PositionTuples pos_tuple;
|
||||
JoinPositionTuples pos_tuple;
|
||||
pos_tuple.deserialize(buf);
|
||||
join_tuples_by_position.emplace_back(std::move(pos_tuple));
|
||||
}
|
||||
|
@ -563,12 +581,15 @@ public:
|
|||
for (auto pi : group_by_keys_idx)
|
||||
{
|
||||
if (pi.first == static_cast<UInt64>(pos) && columns_str.at(pi.second - 3) == "#-1#")
|
||||
throw Exception("The column you identified for group by is invalid, where data is '#-1#'", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("The column you identified for group by is invalid, where data is '#-1#'", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
group_by_keys.emplace_back(columns_str.at(pi.second - 3));
|
||||
}
|
||||
|
||||
this->data(place).add(pos, bitmap_ptr, join_keys, group_by_keys, union_num);
|
||||
auto join_keys_ptr = make_shared<JoinKeys>(join_keys);
|
||||
auto group_by_keys_ptr = make_shared<GroupByKeys>(group_by_keys);
|
||||
|
||||
this->data(place).add(pos, bitmap_ptr, join_keys_ptr, group_by_keys_ptr, union_num);
|
||||
}
|
||||
|
||||
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr __restrict rhs, Arena *) const override
|
||||
|
@ -588,13 +609,12 @@ public:
|
|||
|
||||
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
|
||||
{
|
||||
auto & this_join_tuples = const_cast<std::vector<PositionTuples> &>(this->data(place).join_tuples_by_position);
|
||||
auto & this_join_tuples = const_cast<std::vector<JoinPositionTuples> &>(this->data(place).join_tuples_by_position);
|
||||
if (this_join_tuples.size() < 2)
|
||||
return;
|
||||
// throw Exception("AggregateFunction " + getName() + ": at least one position has no data actually", ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
sort(this_join_tuples.begin(), this_join_tuples.end(),
|
||||
[](const PositionTuples & left, const PositionTuples & right) -> bool {
|
||||
[](const JoinPositionTuples & left, const JoinPositionTuples & right) -> bool {
|
||||
return left.position < right.position;
|
||||
});
|
||||
|
||||
|
@ -619,31 +639,34 @@ private:
|
|||
{
|
||||
ThreadGroupStatusPtr thread_group = CurrentThread::getGroup();
|
||||
|
||||
auto runJoin = [&](size_t index)
|
||||
auto run_join = [&](size_t index)
|
||||
{
|
||||
setThreadName("bitmapJoin");
|
||||
CurrentThread::attachToIfDetached(thread_group);
|
||||
|
||||
JoinTuplePtrs tuples_tmp;
|
||||
Pairs & group = split_lhs_data.at(index);
|
||||
for (auto gt = group.begin(); gt != group.end(); ++gt)
|
||||
for (auto & gt : group)
|
||||
{
|
||||
auto & key = gt->first;
|
||||
auto & left = gt->second; // left JoinTuplePtrs
|
||||
auto & key = gt.first;
|
||||
auto & left = gt.second; // left JoinTuplePtrs
|
||||
|
||||
if (left.empty())
|
||||
continue;
|
||||
|
||||
auto rjt = rhs_data.find(key);
|
||||
if (rjt == rhs_data.end()) // key is not matched
|
||||
{
|
||||
switch (join_operation.joinOp)
|
||||
switch (join_operation.join_op)
|
||||
{
|
||||
case JoinType::INNER : // INNER JOIN
|
||||
continue;
|
||||
case JoinType::LEFT : // ALL LEFT JOIN
|
||||
{
|
||||
for (auto it = left.begin(); it != left.end(); ++it)
|
||||
for (auto & it : left)
|
||||
{
|
||||
Strings group_by_keys = std::get<1>(*(*it));
|
||||
result.put(StringsMapKey(std::move(group_by_keys)), {*it});
|
||||
auto group_by_keys = std::get<1>(*it);
|
||||
result.put(StringsMapKey(std::move(group_by_keys)), {it});
|
||||
}
|
||||
}
|
||||
continue;
|
||||
|
@ -653,39 +676,39 @@ private:
|
|||
}
|
||||
|
||||
auto & right = rjt->second; // right JoinTuplePtrs
|
||||
for (auto lt = left.begin(); lt != left.end(); ++lt)
|
||||
for (auto & lt : left)
|
||||
{
|
||||
for (auto rt = right.cbegin(); rt != right.cend(); ++rt)
|
||||
for (const auto & rt : right)
|
||||
{
|
||||
Strings join_keys;
|
||||
Strings lt_group_bys, rt_group_bys;
|
||||
JoinKeysPtr join_keys_ptr;
|
||||
GroupByKeysPtr lt_group_bys, rt_group_bys;
|
||||
BitMapPtr lt_bitmap_ptr, rt_bitmap_ptr;
|
||||
|
||||
std::tie(join_keys, lt_group_bys, lt_bitmap_ptr) = *(*lt);
|
||||
std::tie(std::ignore, rt_group_bys, rt_bitmap_ptr) = *(*rt);
|
||||
std::tie(join_keys_ptr, lt_group_bys, lt_bitmap_ptr) = *lt;
|
||||
std::tie(std::ignore, rt_group_bys, rt_bitmap_ptr) = *rt;
|
||||
|
||||
Strings group_bys;
|
||||
for (size_t i = 0; i < group_by_keys_idx.size(); ++i)
|
||||
{
|
||||
if (group_by_keys_idx[i].first == 0xFF) // If no position identifier
|
||||
{
|
||||
if (lt_group_bys.at(i) != "#-1#") // left subquery has a group by key
|
||||
group_bys.emplace_back(std::move(lt_group_bys.at(i)));
|
||||
if (lt_group_bys->at(i) != "#-1#") // left subquery has a group by key
|
||||
group_bys.emplace_back(std::move(lt_group_bys->at(i)));
|
||||
else
|
||||
group_bys.emplace_back(std::move(rt_group_bys.at(i)));
|
||||
group_bys.emplace_back(std::move(rt_group_bys->at(i)));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (group_by_keys_idx[i].first == 1)
|
||||
group_bys.emplace_back(std::move(lt_group_bys.at(i)));
|
||||
group_bys.emplace_back(std::move(lt_group_bys->at(i)));
|
||||
else if (group_by_keys_idx[i].first == 2)
|
||||
group_bys.emplace_back(std::move(rt_group_bys.at(i)));
|
||||
group_bys.emplace_back(std::move(rt_group_bys->at(i)));
|
||||
}
|
||||
}
|
||||
|
||||
BitMap64 bitmap(*lt_bitmap_ptr);
|
||||
|
||||
switch (logic_operation.logicOp)
|
||||
switch (logic_operation.logic_op)
|
||||
{
|
||||
case DB::LogicOperationType::NONE :
|
||||
{
|
||||
|
@ -712,10 +735,12 @@ private:
|
|||
break;
|
||||
}
|
||||
|
||||
JoinTuple tmp_tuple{std::make_tuple(join_keys, group_bys,
|
||||
std::make_shared<BitMap64>(std::move(bitmap)))};
|
||||
auto group_by_ptr = make_shared<GroupByKeys>(group_bys);
|
||||
|
||||
result.put(std::move(StringsMapKey(std::move(group_bys))),
|
||||
JoinTuple tmp_tuple{std::make_tuple(join_keys_ptr, group_by_ptr,
|
||||
std::make_shared<BitMap64>(std::move(bitmap)))};
|
||||
|
||||
result.put(std::move(StringsMapKey(std::move(group_by_ptr))),
|
||||
std::move(JoinTuplePtrs{std::make_shared<JoinTuple>(tmp_tuple)}));
|
||||
}
|
||||
}
|
||||
|
@ -723,34 +748,44 @@ private:
|
|||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<ThreadPool> threadPool = std::make_unique<ThreadPool>(thread_num_);
|
||||
std::unique_ptr<ThreadPool> thread_pool = std::make_unique<ThreadPool>(thread_num_);
|
||||
|
||||
for (size_t i = 0; i < thread_num_; ++i)
|
||||
{
|
||||
auto joinAndFunc = std::bind(runJoin, i);
|
||||
threadPool->scheduleOrThrowOnError(joinAndFunc);
|
||||
auto join_and_func = [i, &run_join]() { run_join(i); };
|
||||
thread_pool->scheduleOrThrowOnError(join_and_func);
|
||||
}
|
||||
|
||||
threadPool->wait();
|
||||
thread_pool->wait();
|
||||
}
|
||||
|
||||
KVSharded doJoinWithLogicOperation(std::vector<PositionTuples> & this_join_tuples) const
|
||||
KVSharded doJoinWithLogicOperation(std::vector<JoinPositionTuples> & this_join_tuples) const
|
||||
{
|
||||
HashedStringsKeyTuples & left_join_tuples = this_join_tuples.at(0).tuples;
|
||||
HashedStringsKeyTuples & right_join_tuples = this_join_tuples.at(1).tuples;
|
||||
|
||||
// split the map into several vectors
|
||||
std::vector<Pairs> pair_vector_buckets(thread_num);
|
||||
std::vector<Pairs> pair_vector_buckets;
|
||||
size_t idx = 0;
|
||||
for (auto key_tuple_it = left_join_tuples.begin(); key_tuple_it != left_join_tuples.end(); ++key_tuple_it)
|
||||
auto key_tuple_it = left_join_tuples.begin();
|
||||
for (; key_tuple_it != left_join_tuples.end(); ++key_tuple_it)
|
||||
{
|
||||
pair_vector_buckets.at(idx % thread_num).emplace_back(std::move(*key_tuple_it));
|
||||
left_join_tuples.erase(key_tuple_it);
|
||||
idx++;
|
||||
Pairs p{{key_tuple_it->first, key_tuple_it->second}};
|
||||
pair_vector_buckets.emplace_back(p);
|
||||
++idx;
|
||||
}
|
||||
|
||||
/// processing remaining data
|
||||
for (; key_tuple_it != left_join_tuples.end(); ++key_tuple_it)
|
||||
{
|
||||
pair_vector_buckets.at(idx % thread_num).emplace_back(key_tuple_it->first, key_tuple_it->second);
|
||||
++idx;
|
||||
}
|
||||
left_join_tuples.clear();
|
||||
|
||||
KVSharded result(128);
|
||||
joinMultiThreads(result, pair_vector_buckets, right_join_tuples, thread_num);
|
||||
size_t actual_thread_num = std::min(thread_num, pair_vector_buckets.size());
|
||||
joinMultiThreads(result, pair_vector_buckets, right_join_tuples, actual_thread_num);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
|
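The scheduling change above replaces std::bind(runJoin, i) with a capturing lambda. The same idea is shown below with std::thread standing in for ByConity's ThreadPool (scheduleOrThrowOnError and wait belong to that pool, not to the standard library):

```cpp
#include <cstdio>
#include <thread>
#include <vector>

int main()
{
    const size_t thread_num = 4;
    auto run_join = [](size_t index) { std::printf("bucket %zu processed\n", index); };

    std::vector<std::thread> workers;
    workers.reserve(thread_num);
    for (size_t i = 0; i < thread_num; ++i)
        workers.emplace_back([i, &run_join] { run_join(i); }); // capture the index by value

    for (auto & w : workers)
        w.join(); // plays the role of threadPool->wait()
}
```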
@ -21,19 +21,26 @@
|
|||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
AggregateFunctionPtr createAggregateFunctionBitMapJoinAndCard(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
|
||||
{
|
||||
if (argument_types.size() < 4)
|
||||
throw Exception("AggregateFunction " + name + " needs at least four arguments", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs at least four arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
Int32 union_num = 0;
|
||||
UInt64 thread_num = 0;
|
||||
UInt64 limit_bitmap_number = 0;
|
||||
if (parameters.size() == 0)
|
||||
throw Exception("AggregateFunction " + name + " needs two parameters (join_num, thread_num)", ErrorCodes::NOT_IMPLEMENTED);
|
||||
if (parameters.empty())
|
||||
throw Exception("AggregateFunction " + name + " needs two parameters (join_num, thread_num)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
else
|
||||
{
|
||||
union_num = static_cast<Int32>(parameters[0].safeGet<UInt64>());
|
||||
|
@ -44,7 +51,7 @@ AggregateFunctionPtr createAggregateFunctionBitMapJoinAndCard(const std::string
|
|||
}
|
||||
|
||||
if (union_num == 0 || union_num > 8) // a continuous run of 8 joins is meaningless, 1 join is mostly used.
|
||||
throw Exception("AggregateFunction " + name + " join_number is in range [1,8]", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " join_number is in range [1,8]", ErrorCodes::BAD_ARGUMENTS);
|
||||
if (thread_num == 0)
|
||||
thread_num = 16;
|
||||
if (thread_num > 48) // Several Storage-C machines only have 48 cores; besides, 48 threads is large enough
|
||||
|
@ -53,23 +60,23 @@ AggregateFunctionPtr createAggregateFunctionBitMapJoinAndCard(const std::string
|
|||
limit_bitmap_number = 100000000; // 100 million
|
||||
|
||||
if (!isBitmap64(argument_types[0]))
|
||||
throw Exception("AggregateFunction " + name + " needs BitMap64 type for its first argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs BitMap64 type for its first argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
if (!WhichDataType(argument_types[1]).isUInt8())
|
||||
throw Exception("AggregateFunction " + name + " needs Int type for its second argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs Int type for its second argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
if (!WhichDataType(argument_types[2]).isInt32())
|
||||
throw Exception("AggregateFunction " + name + " needs Int32 type for its third argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs Int32 type for its third argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
DataTypePtr attr_val_type = argument_types[3];
|
||||
const DataTypePtr& attr_val_type = argument_types[3];
|
||||
|
||||
if (!isString(*attr_val_type))
|
||||
throw Exception("AggregateFunction " + name + " needs String type for its fourth argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs String type for its fourth argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
for (size_t i = 4; i < argument_types.size(); ++i)
|
||||
{
|
||||
if (!isString(argument_types[i]))
|
||||
throw Exception("AggregateFunction " + name + " needs String type for args...", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs String type for args...", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
}
|
||||
|
||||
return std::make_shared<AggregateFunctionBitMapJoinAndCard>(argument_types, union_num, thread_num, limit_bitmap_number);
|
||||
|
@ -78,13 +85,13 @@ AggregateFunctionPtr createAggregateFunctionBitMapJoinAndCard(const std::string
|
|||
AggregateFunctionPtr createAggregateFunctionBitMapJoinAndCard2(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
|
||||
{
|
||||
if (argument_types.size() < 4)
|
||||
throw Exception("AggregateFunction " + name + " needs at least four arguments", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs at least four arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
Int32 union_num = 0;
|
||||
UInt64 thread_num = 0;
|
||||
UInt64 limit_bitmap_number = 0;
|
||||
if (parameters.size() == 0)
|
||||
throw Exception("AggregateFunction " + name + " needs two parameters (join_num, thread_num)", ErrorCodes::NOT_IMPLEMENTED);
|
||||
if (parameters.empty())
|
||||
throw Exception("AggregateFunction " + name + " needs two parameters (join_num, thread_num)", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
else
|
||||
{
|
||||
union_num = static_cast<Int32>(parameters[0].safeGet<UInt64>());
|
||||
|
@ -95,7 +102,7 @@ AggregateFunctionPtr createAggregateFunctionBitMapJoinAndCard2(const std::string
|
|||
}
|
||||
|
||||
if (union_num == 0 || union_num > 8) // a continuous run of 8 joins is meaningless, 1 join is mostly used.
|
||||
throw Exception("AggregateFunction " + name + " join_number is in range [1,8]", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " join_number is in range [1,8]", ErrorCodes::BAD_ARGUMENTS);
|
||||
if (thread_num == 0)
|
||||
thread_num = 16;
|
||||
if (thread_num > 48) // Several Storage-C machines only have 48 cores, and 48 threads is large enough
|
||||
|
@ -104,23 +111,23 @@ AggregateFunctionPtr createAggregateFunctionBitMapJoinAndCard2(const std::string
|
|||
limit_bitmap_number = 100000000; // 100 million
|
||||
|
||||
if (!isBitmap64(argument_types[0]))
|
||||
throw Exception("AggregateFunction " + name + " needs BitMap64 type for its first argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs BitMap64 type for its first argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
if (!WhichDataType(argument_types[1]).isUInt8())
|
||||
throw Exception("AggregateFunction " + name + " needs Int type for its second argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs Int type for its second argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
if (!WhichDataType(argument_types[2]).isInt32())
|
||||
throw Exception("AggregateFunction " + name + " needs Int32 type for its third argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs Int32 type for its third argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
DataTypePtr attr_val_type = argument_types[3];
|
||||
const DataTypePtr& attr_val_type = argument_types[3];
|
||||
|
||||
if (!isString(*attr_val_type))
|
||||
throw Exception("AggregateFunction " + name + " needs String type for its fourth argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs String type for its fourth argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
for (size_t i = 4; i < argument_types.size(); ++i)
|
||||
{
|
||||
if (!isString(argument_types[i]))
|
||||
throw Exception("AggregateFunction " + name + " needs String type for args...", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " needs String type for args...", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
}
|
||||
|
||||
return std::make_shared<AggregateFunctionBitMapJoinAndCard2>(argument_types, union_num, thread_num, limit_bitmap_number);
|
||||
|
|
|
@ -52,7 +52,6 @@ namespace DB
|
|||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOES_NOT_MATCH;
|
||||
extern const int TOO_MANY_ROWS;
|
||||
}
|
||||
|
@ -68,7 +67,7 @@ struct PositionTuples
|
|||
JoinTuplePtrs tuples;
|
||||
|
||||
PositionTuples() = default;
|
||||
PositionTuples(Int32 pos_):position(pos_) {}
|
||||
explicit PositionTuples(Int32 pos_):position(pos_) {}
|
||||
PositionTuples(Int32 pos_, JoinTuplePtrs && tuples_) : position(pos_), tuples(std::move(tuples_)) {}
|
||||
|
||||
void addTuple(const JoinTuple & tup)
|
||||
|
@ -89,7 +88,7 @@ struct JoinTupleMapKey
|
|||
DB::String attr_val;
|
||||
DB::Strings args;
|
||||
|
||||
JoinTupleMapKey() { }
|
||||
JoinTupleMapKey() = default;
|
||||
JoinTupleMapKey(const Int32 pos_, const DB::String & attr_val_, const DB::Strings & args_) : pos(pos_), attr_val(attr_val_), args(args_) { }
|
||||
|
||||
bool operator==(const JoinTupleMapKey & rhs) const
|
||||
|
@ -104,7 +103,7 @@ struct HashJoinTupleMapKey
|
|||
{
|
||||
size_t res = std::hash<Int32>()(key.pos);
|
||||
res ^= std::hash<DB::String>()(key.attr_val);
|
||||
for (auto a : key.args)
|
||||
for (const auto& a : key.args)
|
||||
{
|
||||
res ^= std::hash<DB::String>()(a);
|
||||
}
|
||||
|
@ -121,7 +120,7 @@ struct AggregateFunctionBitMapJoinAndCardData
|
|||
void add(const BitMapPtr & bitmap_ptr, const Int32 & pos, const JoinKey & join_key, const String & attr_val, const Strings & args, Int32 union_num)
|
||||
{
|
||||
if (pos <= 0 || pos > union_num+1)
|
||||
throw Exception("AggregateFunction BitMapJoinAndCard: Wrong position value. Position starts from 1 and ends with union_num+1 ", DB::ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction BitMapJoinAndCard: Wrong position value. Position starts from 1 and ends with union_num+1 ", DB::ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
Strings attr_vals(union_num+1);
|
||||
attr_vals[pos-1] = attr_val;
|
||||
|
@ -140,15 +139,15 @@ struct AggregateFunctionBitMapJoinAndCardData
|
|||
|
||||
void merge(const AggregateFunctionBitMapJoinAndCardData & rhs)
|
||||
{
|
||||
for (auto rt = rhs.join_tuple_map.begin(); rt != rhs.join_tuple_map.end(); ++rt)
|
||||
for (const auto & rt : rhs.join_tuple_map)
|
||||
{
|
||||
|
||||
auto it = join_tuple_map.find(rt->first);
|
||||
auto it = join_tuple_map.find(rt.first);
|
||||
if (it == join_tuple_map.end())
|
||||
join_tuple_map.emplace(std::move(rt->first), std::move(rt->second));
|
||||
join_tuple_map.emplace(std::move(rt.first), std::move(rt.second));
|
||||
else
|
||||
{
|
||||
*std::get<0>((it->second)) |= *std::get<0>((rt->second));
|
||||
*std::get<0>((it->second)) |= *std::get<0>((rt.second));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -157,14 +156,14 @@ struct AggregateFunctionBitMapJoinAndCardData
|
|||
{
|
||||
size_t map_size = join_tuple_map.size();
|
||||
writeVarUInt(map_size, buf);
|
||||
for (auto it = join_tuple_map.begin(); it != join_tuple_map.end(); ++it)
|
||||
for (const auto & it : join_tuple_map)
|
||||
{
|
||||
BitMapPtr bitmap_ptr;
|
||||
Int32 pos;
|
||||
JoinKey joinkey;
|
||||
Strings attr_vals;
|
||||
Strings args;
|
||||
std::tie(bitmap_ptr, pos, joinkey, attr_vals, args) = it->second;
|
||||
std::tie(bitmap_ptr, pos, joinkey, attr_vals, args) = it.second;
|
||||
|
||||
size_t bytes_size = (*bitmap_ptr).getSizeInBytes();
|
||||
writeVarUInt(bytes_size, buf);
|
||||
|
@ -176,13 +175,13 @@ struct AggregateFunctionBitMapJoinAndCardData
|
|||
writeVarInt(joinkey, buf);
|
||||
|
||||
writeVarUInt(attr_vals.size(), buf);
|
||||
for (auto str : attr_vals)
|
||||
for (const auto& str : attr_vals)
|
||||
{
|
||||
writeString(str, buf);
|
||||
}
|
||||
|
||||
writeVarUInt((args).size(), buf);
|
||||
for (auto a : args)
|
||||
for (const auto& a : args)
|
||||
{
|
||||
writeString(a, buf);
|
||||
}
|
||||
|
@ -256,7 +255,7 @@ public:
|
|||
auto bitmap_ptr = std::make_shared<BitMap64>(std::move(const_cast<BitMap64 &>(bitmap)));
|
||||
|
||||
const auto & col_position = static_cast<const ColumnInt8 &>(*columns[1]);
|
||||
const Int32 & positionInUnion = static_cast<Int32>(col_position.getElement(row_num));
|
||||
const Int32 & position_in_union = static_cast<Int32>(col_position.getElement(row_num));
|
||||
|
||||
const auto & col_joinkey = static_cast<const ColumnInt32 &>(*columns[2]);
|
||||
const JoinKey & join_key = col_joinkey.getElement(row_num);
|
||||
|
@ -271,7 +270,7 @@ public:
|
|||
args.emplace_back(col_arg.getDataAt(row_num).toString());
|
||||
}
|
||||
|
||||
this->data(place).add(bitmap_ptr, positionInUnion, join_key, attr_val, args, union_num);
|
||||
this->data(place).add(bitmap_ptr, position_in_union, join_key, attr_val, args, union_num);
|
||||
}
|
||||
|
||||
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr __restrict rhs, Arena *) const override
|
||||
|
@ -304,20 +303,20 @@ public:
|
|||
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
|
||||
{
|
||||
auto & tuples_map = this->data(place).join_tuple_map;
|
||||
std::vector<PositionTuples> tuplesByPosition;
|
||||
std::vector<PositionTuples> tuples_by_position;
|
||||
for (size_t i = 0; i < union_num + 1; ++i)
|
||||
{
|
||||
tuplesByPosition.emplace_back(i, JoinTuplePtrs());
|
||||
tuples_by_position.emplace_back(i, JoinTuplePtrs());
|
||||
}
|
||||
|
||||
//partition all input tuples by position
|
||||
for (auto p = tuples_map.begin(); p != tuples_map.end(); ++p)
|
||||
for (auto & p : tuples_map)
|
||||
{
|
||||
Int32 pos = p->first.pos;
|
||||
tuplesByPosition.at(pos-1).addTuple(p->second);
|
||||
Int32 pos = p.first.pos;
|
||||
tuples_by_position.at(pos-1).addTuple(p.second);
|
||||
}
|
||||
|
||||
const auto res = calcJoin(tuplesByPosition);
|
||||
const auto res = calcJoin(tuples_by_position);
|
||||
|
||||
auto & col = static_cast<ColumnArray &>(to);
|
||||
auto &col_offsets = static_cast<ColumnArray::ColumnOffsets &>(col.getOffsetsColumn());
|
||||
|
@ -329,16 +328,16 @@ public:
|
|||
|
||||
size_t args_num = arguments_num - 4;
|
||||
|
||||
for (auto & p : res)
|
||||
for (const auto & p : res)
|
||||
{
|
||||
for (auto rt = p.begin(); rt != p.end(); ++rt)
|
||||
for (const auto & rt : p)
|
||||
{
|
||||
UInt64 bitmap_cardinality;
|
||||
JoinKey joinkey;
|
||||
Strings attr_vals;
|
||||
Strings args;
|
||||
|
||||
std::tie(bitmap_cardinality, std::ignore, joinkey, attr_vals, args) = std::move(*rt);
|
||||
std::tie(bitmap_cardinality, std::ignore, joinkey, attr_vals, args) = std::move(rt);
|
||||
col_bitmap_card.insert(bitmap_cardinality);
|
||||
col_joinkey.insert(joinkey);
|
||||
|
||||
|
@ -358,24 +357,24 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
std::vector<std::vector<ResultTuple>>
|
||||
calcJoinMultiThreads(std::shared_ptr<std::vector<JoinTuplePtrs>> & res_ptr, const std::shared_ptr<PositionTuples> & rhs, size_t thread_num_, const bool is_last_join) const
|
||||
static std::vector<std::vector<ResultTuple>>
|
||||
calcJoinMultiThreads(std::shared_ptr<std::vector<JoinTuplePtrs>> & res_ptr, const std::shared_ptr<PositionTuples> & rhs, size_t thread_num_, const bool is_last_join)
|
||||
{
|
||||
std::vector<JoinTuplePtrs> intermediate_tuples_bucktes(thread_num_, JoinTuplePtrs()); // It store the intermediate JOIN result, and it's used for next JOIN
|
||||
std::vector<std::vector<ResultTuple>> res_tuples_buckets(thread_num_, std::vector<ResultTuple>()); // It store the final result of the last JOIN
|
||||
ThreadGroupStatusPtr thread_group = CurrentThread::getGroup();
|
||||
|
||||
auto runJoinAndCard = [&] (size_t index)
|
||||
auto run_join_and_card = [&] (size_t index)
|
||||
{
|
||||
setThreadName("bitmapJoinAndCard");
|
||||
setThreadName("JoinAndCard");
|
||||
CurrentThread::attachToIfDetached(thread_group);
|
||||
JoinTuplePtrs tuples_tmp;
|
||||
std::vector<ResultTuple> res_tuples_in_a_thread;
|
||||
|
||||
auto & left = res_ptr->at(index);
|
||||
for (auto rt = rhs->tuples.begin(); rt != rhs->tuples.end(); ++rt)
|
||||
for (auto & rt : rhs->tuples)
|
||||
{
|
||||
for (auto lt = left.begin(); lt != left.end(); ++lt)
|
||||
for (auto & lt : left)
|
||||
{
|
||||
BitMapPtr bitmap_ptr, rt_bitmap_ptr;
|
||||
Int32 pos, rt_pos;
|
||||
|
@ -383,8 +382,8 @@ private:
|
|||
Strings attr_vals, rt_attr_vals;
|
||||
Strings args, rt_args;
|
||||
|
||||
std::tie(bitmap_ptr, pos, joinkey, attr_vals, args) = *(*lt);
|
||||
std::tie(rt_bitmap_ptr, rt_pos, std::ignore, rt_attr_vals, rt_args) = *(*rt);
|
||||
std::tie(bitmap_ptr, pos, joinkey, attr_vals, args) = *lt;
|
||||
std::tie(rt_bitmap_ptr, rt_pos, std::ignore, rt_attr_vals, rt_args) = *rt;
|
||||
|
||||
BitMap64 bitmap(*bitmap_ptr);
|
||||
bitmap &= *rt_bitmap_ptr;
|
||||
|
@ -416,15 +415,15 @@ private:
|
|||
res_tuples_buckets[index] = std::move(res_tuples_in_a_thread);
|
||||
};
|
||||
|
||||
std::unique_ptr<ThreadPool> threadPool = std::make_unique<ThreadPool>(thread_num_);
|
||||
std::unique_ptr<ThreadPool> thread_pool = std::make_unique<ThreadPool>(thread_num_);
|
||||
|
||||
for (size_t i = 0; i < thread_num_; ++i)
|
||||
{
|
||||
auto joinAndCardFunc = std::bind(runJoinAndCard, i);
|
||||
threadPool->scheduleOrThrowOnError(joinAndCardFunc);
|
||||
auto join_and_card_func = [&run_join_and_card, i]() { run_join_and_card(i); };
|
||||
thread_pool->scheduleOrThrowOnError(join_and_card_func);
|
||||
}
|
||||
|
||||
threadPool->wait();
|
||||
thread_pool->wait();
|
||||
|
||||
res_ptr = std::make_shared<std::vector<JoinTuplePtrs>>(std::move(intermediate_tuples_bucktes));
|
||||
// For an intermediate JOIN, an empty object is returned,
|
||||
|
@ -436,7 +435,7 @@ private:
|
|||
{
|
||||
//partition the entire position tuples into several parts
|
||||
if (position_tuples.empty())
|
||||
throw Exception("BitMapJoinAndCard::calcJoin: empty input data!", DB::ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("BitMapJoinAndCard::calcJoin: empty input data!", DB::ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
// Look for the largest part
|
||||
size_t max_size = 0;
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
|
||||
#include "Common/formatIPv6.h"
|
||||
#include <Common/ThreadPool.h>
|
||||
#include <Common/setThreadName.h>
|
||||
#include <Common/CurrentThread.h>
|
||||
|
@ -50,7 +51,6 @@ namespace DB
|
|||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOES_NOT_MATCH;
|
||||
}
|
||||
|
||||
|
@ -65,7 +65,7 @@ struct AggregateFunctionBitMapJoinAndCard2Data
|
|||
void add(const BitMapPtr & bitmap_ptr, const Int32 & pos, const JoinKey & join_key, const String & attr_val, const Strings & args, Int32 union_num)
|
||||
{
|
||||
if (pos <= 0 || pos > union_num+1)
|
||||
throw Exception("AggregateFunction BitMapJoinAndCard2: Wrong position value. Position starts from 1 and ends with join_num+1, please check", DB::ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("AggregateFunction BitMapJoinAndCard2: Wrong position value. Position starts from 1 and ends with join_num+1, please check", DB::ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
Strings attr_vals(union_num+1);
|
||||
attr_vals[pos-1] = attr_val;
|
||||
|
@ -84,14 +84,14 @@ struct AggregateFunctionBitMapJoinAndCard2Data
|
|||
|
||||
size_t input_tuples_size = input_tuples.size();
|
||||
writeVarUInt(input_tuples_size, buf);
|
||||
for (auto it = input_tuples.begin(); it != input_tuples.end(); ++it)
|
||||
for (const auto & input_tuple : input_tuples)
|
||||
{
|
||||
BitMapPtr bitmap_ptr;
|
||||
Int32 pos;
|
||||
JoinKey joinkey;
|
||||
Strings attr_vals;
|
||||
Strings args;
|
||||
std::tie(bitmap_ptr, pos, joinkey, attr_vals, args) = *it;
|
||||
std::tie(bitmap_ptr, pos, joinkey, attr_vals, args) = input_tuple;
|
||||
|
||||
size_t bytes_size = (*bitmap_ptr).getSizeInBytes();
|
||||
writeVarUInt(bytes_size, buf);
|
||||
|
@ -103,13 +103,13 @@ struct AggregateFunctionBitMapJoinAndCard2Data
|
|||
writeVarInt(joinkey, buf);
|
||||
|
||||
writeVarUInt(attr_vals.size(), buf);
|
||||
for (auto str: attr_vals)
|
||||
for (const auto& str: attr_vals)
|
||||
{
|
||||
writeString(str, buf);
|
||||
}
|
||||
|
||||
writeVarUInt((args).size(), buf);
|
||||
for (auto a: args)
|
||||
for (const auto& a: args)
|
||||
{
|
||||
writeString(a, buf);
|
||||
}
|
||||
|
@ -183,7 +183,7 @@ public:
|
|||
auto bitmap_ptr = std::make_shared<BitMap64>(std::move(const_cast<BitMap64 &>(bitmap)));
|
||||
|
||||
const auto & col_position = static_cast<const ColumnInt8 &>(*columns[1]);
|
||||
const Int32 & positionInUnion = static_cast<Int32>(col_position.getElement(row_num));
|
||||
const Int32 & position_in_union = static_cast<Int32>(col_position.getElement(row_num));
|
||||
|
||||
const auto & col_joinkey = static_cast<const ColumnInt32 &>(*columns[2]);
|
||||
const JoinKey & join_key = col_joinkey.getElement(row_num);
|
||||
|
@ -198,7 +198,7 @@ public:
|
|||
args.emplace_back(col_arg.getDataAt(row_num).toString());
|
||||
}
|
||||
|
||||
this->data(place).add(bitmap_ptr, positionInUnion, join_key, attr_val, args, union_num);
|
||||
this->data(place).add(bitmap_ptr, position_in_union, join_key, attr_val, args, union_num);
|
||||
}
|
||||
|
||||
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr __restrict rhs, Arena *) const override
|
||||
|
@ -232,20 +232,20 @@ public:
|
|||
{
|
||||
auto & input_tuples = this->data(place).input_tuples;
|
||||
|
||||
std::vector<PositionTuples> tuplesByPosition;
|
||||
std::vector<PositionTuples> tuples_by_position;
|
||||
for (size_t i = 0; i < union_num + 1; ++i)
|
||||
{
|
||||
tuplesByPosition.emplace_back(i, JoinTuplePtrs());
|
||||
tuples_by_position.emplace_back(i, JoinTuplePtrs());
|
||||
}
|
||||
|
||||
//partition all input tuples by position
|
||||
for (auto & p : input_tuples)
|
||||
{
|
||||
Int32 pos = std::get<1>(p);
|
||||
tuplesByPosition.at(pos-1).addTuple(p);
|
||||
tuples_by_position.at(pos-1).addTuple(p);
|
||||
}
|
||||
|
||||
const auto res = calcJoin(tuplesByPosition);
|
||||
const auto res = calcJoin(tuples_by_position);
|
||||
|
||||
auto & col = static_cast<ColumnArray &>(to);
|
||||
auto &col_offsets = static_cast<ColumnArray::ColumnOffsets &>(col.getOffsetsColumn());
|
||||
|
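The loop above buckets every input tuple by its 1-based position before the join is computed. A self-contained sketch of that partition step under simplified types (the real code keeps `JoinTuplePtrs` inside `PositionTuples`; the names below are illustrative):

```cpp
#include <cstddef>
#include <stdexcept>
#include <utility>
#include <vector>

/// Bucket tuples tagged with a 1-based position: bucket i receives every tuple whose position is i + 1.
template <typename Tuple>
std::vector<std::vector<Tuple>> partitionByPosition(const std::vector<std::pair<int, Tuple>> & input, size_t buckets)
{
    std::vector<std::vector<Tuple>> result(buckets);
    for (const auto & [pos, tuple] : input)
    {
        if (pos < 1 || static_cast<size_t>(pos) > buckets)
            throw std::out_of_range("position out of range");
        result[pos - 1].push_back(tuple);
    }
    return result;
}
```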
@ -257,16 +257,16 @@ public:
|
|||
|
||||
size_t args_num = arguments_num - 4;
|
||||
|
||||
for (auto & p : res)
|
||||
for (const auto & p : res)
|
||||
{
|
||||
for (auto rt = p.begin(); rt != p.end(); ++rt)
|
||||
for (const auto & rt : p)
|
||||
{
|
||||
UInt64 bitmap_cardinality;
|
||||
JoinKey joinkey;
|
||||
Strings attr_vals;
|
||||
Strings args;
|
||||
|
||||
std::tie(bitmap_cardinality, std::ignore, joinkey, attr_vals, args) = std::move(*rt);
|
||||
std::tie(bitmap_cardinality, std::ignore, joinkey, attr_vals, args) = std::move(rt);
|
||||
col_bitmap_card.insert(bitmap_cardinality);
|
||||
col_joinkey.insert(joinkey);
|
||||
|
||||
|
@ -293,17 +293,17 @@ private:
|
|||
std::vector<std::vector<ResultTuple>> res_tuples_buckets(thread_num_, std::vector<ResultTuple>()); // It stores the final result of the last JOIN
|
||||
ThreadGroupStatusPtr thread_group = CurrentThread::getGroup();
|
||||
|
||||
auto runJoinAndCard = [&] (size_t index)
|
||||
auto run_join_and_card = [&] (size_t index)
|
||||
{
|
||||
setThreadName("bitmapJoinAndCard");
|
||||
setThreadName("JoinAndCard2");
|
||||
CurrentThread::attachToIfDetached(thread_group);
|
||||
JoinTuplePtrs tuples_tmp;
|
||||
std::vector<ResultTuple> res_tuples_in_a_thread;
|
||||
|
||||
auto & left = res_ptr->at(index);
|
||||
for (auto rt = rhs->tuples.begin(); rt != rhs->tuples.end(); ++rt)
|
||||
for (auto & rt : rhs->tuples)
|
||||
{
|
||||
for (auto lt = left.begin(); lt != left.end(); ++lt)
|
||||
for (auto & lt : left)
|
||||
{
|
||||
BitMapPtr bitmap_ptr, rt_bitmap_ptr;
|
||||
Int32 pos, rt_pos;
|
||||
|
@ -311,8 +311,8 @@ private:
|
|||
Strings attr_vals, rt_attr_vals;
|
||||
Strings args, rt_args;
|
||||
|
||||
std::tie(bitmap_ptr, pos, joinkey, attr_vals, args) = *(*lt);
|
||||
std::tie(rt_bitmap_ptr, rt_pos, std::ignore, rt_attr_vals, rt_args) = *(*rt);
|
||||
std::tie(bitmap_ptr, pos, joinkey, attr_vals, args) = *lt;
|
||||
std::tie(rt_bitmap_ptr, rt_pos, std::ignore, rt_attr_vals, rt_args) = *rt;
|
||||
|
||||
BitMap64 bitmap(*bitmap_ptr);
|
||||
bitmap &= *rt_bitmap_ptr;
|
||||
|
@ -344,15 +344,15 @@ private:
|
|||
res_tuples_buckets[index] = std::move(res_tuples_in_a_thread);
|
||||
};
|
||||
|
||||
std::unique_ptr<ThreadPool> threadPool = std::make_unique<ThreadPool>(thread_num_);
|
||||
std::unique_ptr<ThreadPool> thread_pool = std::make_unique<ThreadPool>(thread_num_);
|
||||
|
||||
for (size_t i = 0; i < thread_num; ++i)
|
||||
{
|
||||
auto joinAndCardFunc = std::bind(runJoinAndCard, i);
|
||||
threadPool->scheduleOrThrowOnError(joinAndCardFunc);
|
||||
auto join_and_card_func = [&run_join_and_card, i]() { run_join_and_card(i); };
|
||||
thread_pool->scheduleOrThrowOnError(join_and_card_func);
|
||||
}
|
||||
|
||||
threadPool->wait();
|
||||
thread_pool->wait();
|
||||
|
||||
res_ptr = std::make_shared<std::vector<JoinTuplePtrs>>(std::move(intermediate_tuples_bucktes));
|
||||
// For an intermediate JOIN, an empty object is returned,
|
||||
|
@ -364,7 +364,7 @@ private:
|
|||
{
|
||||
// Partition the entire set of position tuples into several parts
|
||||
if (position_tuples.empty())
|
||||
throw Exception("BitMapJoinAndCard::calcJoin: empty input data!", DB::ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("BitMapJoinAndCard::calcJoin: empty input data!", DB::ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
// Look for the largest part
|
||||
size_t max_size = 0;
|
||||
|
|
|
@ -28,33 +28,33 @@ AggregateFunctionPtr createAggregateFunctionBitmapColumnDiff(const std::string &
|
|||
if (argument_types.size() != 2)
|
||||
throw Exception("AggregateFunction " + name + " need only two arguments", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
UInt64 return_type_{0}, diff_step_{1};
|
||||
UInt64 return_type{0}, diff_step{1};
|
||||
String diff_direction_str{"forward"};
|
||||
if (!parameters.empty() && parameters.size() != 3)
|
||||
throw Exception("AggregateFunction " + name + " need three parameters", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
if (!parameters.empty())
|
||||
{
|
||||
parameters[0].tryGet<UInt64>(return_type_);
|
||||
parameters[0].tryGet<UInt64>(return_type);
|
||||
parameters[1].tryGet<String>(diff_direction_str);
|
||||
parameters[2].tryGet<UInt64>(diff_step_);
|
||||
parameters[2].tryGet<UInt64>(diff_step);
|
||||
}
|
||||
|
||||
if (!isBitmap64(argument_types[1]))
|
||||
throw Exception("AggregateFunction " + name + " need BitMap64 type for its second argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
|
||||
DataTypePtr data_type_0 = argument_types[0];
|
||||
const DataTypePtr& data_type_0 = argument_types[0];
|
||||
if (!WhichDataType(data_type_0).isDate() && !WhichDataType(data_type_0).isUInt()
|
||||
&& !WhichDataType(data_type_0).isInt() && !WhichDataType(data_type_0).isString())
|
||||
throw Exception("AggregateFunction " + name + " need Date/Int/UInt/String type for its first argument, for order sorting.", ErrorCodes::NOT_IMPLEMENTED);
|
||||
|
||||
if (WhichDataType(data_type_0).isDate())
|
||||
return std::make_shared<AggregateFunctionBitMapColumnDiff<UInt16>>(argument_types, return_type_, diff_direction_str, diff_step_, true);
|
||||
return std::make_shared<AggregateFunctionBitMapColumnDiff<UInt16>>(argument_types, return_type, diff_direction_str, diff_step, true);
|
||||
else if (WhichDataType(data_type_0).isString())
|
||||
return std::make_shared<AggregateFunctionBitMapColumnDiff<String>>(argument_types, return_type_, diff_direction_str, diff_step_);
|
||||
return std::make_shared<AggregateFunctionBitMapColumnDiff<String>>(argument_types, return_type, diff_direction_str, diff_step);
|
||||
else {
|
||||
AggregateFunctionPtr res;
|
||||
res.reset(createWithNumericType<Function>(*data_type_0, argument_types, return_type_, diff_direction_str, diff_step_));
|
||||
res.reset(createWithNumericType<Function>(*data_type_0, argument_types, return_type, diff_direction_str, diff_step));
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -35,7 +35,6 @@ namespace DB
|
|||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION;
|
||||
extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
|
||||
|
@ -56,10 +55,12 @@ struct AggregateFunctionBitMapColumnDiffData
|
|||
|
||||
void add(const T key, const BitMap64 & bitmap)
|
||||
{
|
||||
auto [it, inserted] = data.try_emplace(key, std::make_unique<BitMap64>(std::move(const_cast<BitMap64 &>(bitmap))));
|
||||
if (!inserted) {
|
||||
auto it = data.find(key);
|
||||
|
||||
if (it != data.end())
|
||||
*(it->second) |= bitmap;
|
||||
}
|
||||
else
|
||||
data.emplace(key, std::make_unique<BitMap64>(const_cast<BitMap64 &>(bitmap)));
|
||||
}
|
||||
|
||||
void merge(AggregateFunctionBitMapColumnDiffData & rhs)
|
||||
|
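The rewritten `add` above folds the find-then-insert branches into a single `try_emplace` lookup. A minimal self-contained sketch of the same insert-or-merge idea, assuming a `BitMap64`-like type with copy construction and `operator|=`; unlike the source, this variant defers the copy until the key is known to be new:

```cpp
#include <cstdint>
#include <map>
#include <memory>

/// Insert the first bitmap for a key, or OR a later bitmap into the stored one, with a single lookup.
template <typename BitMap64>
void addBitmap(std::map<int64_t, std::unique_ptr<BitMap64>> & data, int64_t key, const BitMap64 & bitmap)
{
    auto [it, inserted] = data.try_emplace(key, nullptr);
    if (inserted)
        it->second = std::make_unique<BitMap64>(bitmap); // new key: store a copy
    else
        *it->second |= bitmap;                           // existing key: union in place
}
```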
@ -133,7 +134,7 @@ enum DiffDirection
|
|||
struct DiffDirectionOp
|
||||
{
|
||||
DiffDirectionOp() : diff_direc(DiffDirection::FORWARD) {}
|
||||
DiffDirectionOp(String diff_dir_op)
|
||||
explicit DiffDirectionOp(String diff_dir_op)
|
||||
{
|
||||
std::transform(diff_dir_op.begin(), diff_dir_op.end(), diff_dir_op.begin(), ::tolower);
|
||||
if (diff_dir_op.empty() || diff_dir_op == "forward")
|
||||
|
@ -227,7 +228,7 @@ public:
|
|||
return;
|
||||
|
||||
if (diff_step >= input_data.size())
|
||||
throw Exception(getName() + ": the step " + std::to_string(diff_step) + " is larger than data size", ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception(getName() + ": the step " + std::to_string(diff_step) + " is larger than data size", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
std::vector<DiffPair> all_data;
|
||||
std::unordered_map<T, std::vector<BitMapPtr>> intermediate_res;
|
||||
|
|
|
@ -17,30 +17,37 @@
|
|||
#include <AggregateFunctions/AggregateFunctionBitmapExpressionCalculation.h>
|
||||
#include <AggregateFunctions/FactoryHelpers.h>
|
||||
#include <AggregateFunctions/Helpers.h>
|
||||
|
||||
#pragma GCC diagnostic ignored "-Wunused"
|
||||
#pragma GCC diagnostic ignored "-Wunused-parameter"
|
||||
#include <Functions/FunctionHelpers.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
struct Settings;
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
const extern int TYPE_MISMATCH;
|
||||
const extern int SIZES_OF_COLUMNS_DOESNT_MATCH;
|
||||
const extern int AGGREGATE_FUNCTION_THROW;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
template <template <typename, typename> class AggregateFunctionTemplate, typename... TArgs>
|
||||
static IAggregateFunction * createWithSpecificType(const IDataType & argument_type, TArgs &&... args)
|
||||
IAggregateFunction * createWithSpecificType(const IDataType & argument_type, TArgs &&... args)
|
||||
{
|
||||
WhichDataType which(argument_type);
|
||||
if (which.idx == TypeIndex::UInt8) return new AggregateFunctionTemplate<uint8_t, uint8_t>(std::forward<TArgs>(args)...);
|
||||
if (which.idx == TypeIndex::UInt16) return new AggregateFunctionTemplate<UInt16, UInt16>(std::forward<TArgs>(args)...);
|
||||
if (which.idx == TypeIndex::UInt32) return new AggregateFunctionTemplate<UInt32, UInt32>(std::forward<TArgs>(args)...);
|
||||
if (which.idx == TypeIndex::UInt64) return new AggregateFunctionTemplate<UInt64, UInt64>(std::forward<TArgs>(args)...);
|
||||
if (which.idx == TypeIndex::Int8) return new AggregateFunctionTemplate<Int8, Int8>(std::forward<TArgs>(args)...);
|
||||
if (which.idx == TypeIndex::Int16) return new AggregateFunctionTemplate<Int16, Int16>(std::forward<TArgs>(args)...);
|
||||
if (which.idx == TypeIndex::Int32) return new AggregateFunctionTemplate<Int32, Int32>(std::forward<TArgs>(args)...);
|
||||
if (which.idx == TypeIndex::Int64) return new AggregateFunctionTemplate<Int64, Int64>(std::forward<TArgs>(args)...);
|
||||
|
||||
if (which.idx == TypeIndex::UInt8 || which.idx == TypeIndex::UInt16 ||
|
||||
which.idx == TypeIndex::UInt32 || which.idx == TypeIndex::UInt64)
|
||||
return new AggregateFunctionTemplate<UInt64, UInt64>(std::forward<TArgs>(args)...);
|
||||
else if (which.idx == TypeIndex::Int8 || which.idx == TypeIndex::Int16 ||
|
||||
which.idx == TypeIndex::Int32 || which.idx == TypeIndex::Int64)
|
||||
return new AggregateFunctionTemplate<Int64, Int64>(std::forward<TArgs>(args)...);
|
||||
else if (which.idx == TypeIndex::String)
|
||||
return new AggregateFunctionTemplate<String, String>(std::forward<TArgs>(args)...);
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
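The change above collapses the per-width instantiations into one `UInt64`, one `Int64`, and one `String` specialization. A self-contained illustration of that design choice with stand-in types (`TypeIndex` and `Handler` here are local stand-ins, not the ClickHouse types):

```cpp
#include <cstdint>
#include <memory>

struct Handler { virtual ~Handler() = default; };
template <typename Key> struct TypedHandler : Handler {};

enum class TypeIndex { UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, String };

std::unique_ptr<Handler> makeHandler(TypeIndex idx)
{
    switch (idx)
    {
        case TypeIndex::UInt8:
        case TypeIndex::UInt16:
        case TypeIndex::UInt32:
        case TypeIndex::UInt64:
            return std::make_unique<TypedHandler<uint64_t>>(); // every unsigned width shares one instantiation
        case TypeIndex::Int8:
        case TypeIndex::Int16:
        case TypeIndex::Int32:
        case TypeIndex::Int64:
            return std::make_unique<TypedHandler<int64_t>>();  // every signed width shares one instantiation
        default:
            return nullptr;                                    // unsupported type: the caller reports the error
    }
}
```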
@ -48,30 +55,30 @@ template<template <typename, typename> class Function>
|
|||
AggregateFunctionPtr createAggregateFunctionBitMapCount(const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
|
||||
{
|
||||
if (argument_types.size() != 2 )
|
||||
throw Exception("AggregateFunction " + name + " need two arguments", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " need two arguments", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
|
||||
String expression;
|
||||
if (parameters.size() > 0)
|
||||
parameters[0].tryGet<String>(expression);
|
||||
if (!parameters.empty() && !parameters[0].tryGet<String>(expression))
|
||||
throw Exception("AggregateFunction " + name + " need String as 1st parameter", ErrorCodes::BAD_TYPE_OF_FIELD);
|
||||
|
||||
UInt64 is_bitmap_execute = 0;
|
||||
if (parameters.size() > 1)
|
||||
parameters[1].tryGet<UInt64>(is_bitmap_execute);
|
||||
|
||||
DataTypePtr data_type = argument_types[0];
|
||||
if (!WhichDataType(data_type).isInt())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type (Int16 or bigger) for its first argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
const DataTypePtr& data_type = argument_types[0];
|
||||
if (!WhichDataType(data_type).isNativeInt() && !WhichDataType(data_type).isString())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type (Int16 or bigger) or a string type for its first argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
if (WhichDataType(data_type).isInt8())
|
||||
throw Exception("Int8 type is not recommended! Please use Int16 or bigger size number", ErrorCodes::BAD_TYPE_OF_FIELD);
|
||||
|
||||
if (!isBitmap64(argument_types[1]))
|
||||
throw Exception("AggregateFunction " + name + " need BitMap type for its second argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " need BitMap type for its second argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
AggregateFunctionPtr res(createWithSpecificType<Function>(*data_type, argument_types, expression, is_bitmap_execute));
|
||||
|
||||
// res.reset(createWithNumericType<Function>(*data_type, argument_types, expression, is_bitmap_execute));
|
||||
if (!res)
|
||||
throw Exception("Failed to create aggregate function " + name, ErrorCodes::LOGICAL_ERROR);
|
||||
throw Exception("Failed to create aggregate function " + name, ErrorCodes::AGGREGATE_FUNCTION_THROW);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
@ -80,91 +87,174 @@ template<template <typename, typename> class Function>
|
|||
AggregateFunctionPtr createAggregateFunctionBitMapMultiCount(const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
|
||||
{
|
||||
if (argument_types.size() != 2 )
|
||||
throw Exception("AggregateFunction " + name + " need two arguments", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " need two arguments", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
|
||||
std::vector<String> expressions;
|
||||
if (parameters.size() > 0)
|
||||
if (!parameters.empty())
|
||||
{
|
||||
for (size_t i = 0; i < parameters.size(); i++)
|
||||
for (size_t i = 0; i < parameters.size(); ++i)
|
||||
{
|
||||
String expression;
|
||||
parameters[i].tryGet<String>(expression);
|
||||
if (!parameters[i].tryGet<String>(expression))
|
||||
throw Exception(fmt::format("AggregateFunction {} need String as its {} parameter", name, argPositionToSequence(i+1)), ErrorCodes::BAD_TYPE_OF_FIELD);
|
||||
expressions.push_back(expression);
|
||||
}
|
||||
}
|
||||
|
||||
DataTypePtr data_type = argument_types[0];
|
||||
if (!WhichDataType(data_type).isInt())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type (Int16 or bigger) for its first argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
const DataTypePtr& data_type = argument_types[0];
|
||||
if (!WhichDataType(data_type).isNativeInt() && !WhichDataType(data_type).isString())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type (Int16 or bigger) or a string type for its first argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
if (WhichDataType(data_type).isInt8())
|
||||
throw Exception("Int8 type is not recommended! Please use Int16 or bigger size number", ErrorCodes::BAD_TYPE_OF_FIELD);
|
||||
|
||||
if (!isBitmap64(argument_types[1]))
|
||||
throw Exception("AggregateFunction " + name + " need BitMap type for its second argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " need BitMap type for its second argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
AggregateFunctionPtr res;
|
||||
AggregateFunctionPtr res(createWithSpecificType<Function>(*data_type, argument_types, expressions));
|
||||
|
||||
res.reset(createWithSpecificType<Function>(*data_type, argument_types, expressions));
|
||||
if (!res)
|
||||
throw Exception("Failed to create aggregate function " + name, ErrorCodes::AGGREGATE_FUNCTION_THROW);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
template<template <typename, typename> class Function>
|
||||
AggregateFunctionPtr createAggregateFunctionBitMapMultiCountWithDate(const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
|
||||
{
|
||||
if (argument_types.size() != 3 )
|
||||
throw Exception("AggregateFunction " + name + " need three arguments", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " need three arguments", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
|
||||
std::vector<String> expressions;
|
||||
if (parameters.size() > 0)
|
||||
if (!parameters.empty())
|
||||
{
|
||||
for (size_t i = 0; i < parameters.size(); i++)
|
||||
{
|
||||
String expression;
|
||||
parameters[i].tryGet<String>(expression);
|
||||
if (!parameters[i].tryGet<String>(expression))
|
||||
throw Exception(fmt::format("AggregateFunction {} need String as its {} parameter", name, argPositionToSequence(i+1)), ErrorCodes::BAD_TYPE_OF_FIELD);
|
||||
expressions.push_back(expression);
|
||||
}
|
||||
}
|
||||
|
||||
DataTypePtr date_type = argument_types[0];
|
||||
if (!WhichDataType(date_type).isInt())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type (Int16 or bigger) for its first argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
const DataTypePtr& date_type = argument_types[0];
|
||||
if (!WhichDataType(date_type).isNativeInt())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type (Int16 or bigger) for its first argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
DataTypePtr data_type = argument_types[1];
|
||||
if (!WhichDataType(data_type).isInt())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type for its second argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
const DataTypePtr& data_type = argument_types[1];
|
||||
if (!WhichDataType(data_type).isNativeInt() && !WhichDataType(data_type).isString())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type (Int16 or bigger) or a string type for its second argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
if (!isBitmap64(argument_types[2]))
|
||||
throw Exception("AggregateFunction " + name + " need BitMap type for its third argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " need BitMap type for its third argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
return std::make_shared<AggregateFunctionBitMapMultiCountWithDate>(argument_types, expressions);
|
||||
AggregateFunctionPtr res(createWithSpecificType<Function>(*data_type, argument_types, expressions));
|
||||
|
||||
if (!res)
|
||||
throw Exception("Failed to create aggregate function " + name, ErrorCodes::AGGREGATE_FUNCTION_THROW);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
template<template <typename, typename> class Function>
|
||||
AggregateFunctionPtr createAggregateFunctionBitMapExtract(const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
|
||||
{
|
||||
if (argument_types.size() != 2 )
|
||||
throw Exception("AggregateFunction " + name + " need two arguments", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " need two arguments", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
|
||||
String expression;
|
||||
if (parameters.size() > 0)
|
||||
parameters[0].tryGet<String>(expression);
|
||||
if (!parameters.empty() && !parameters[0].tryGet<String>(expression))
|
||||
throw Exception("AggregateFunction " + name + " need String as 1st parameter", ErrorCodes::BAD_TYPE_OF_FIELD);
|
||||
|
||||
UInt64 is_bitmap_execute = 0;
|
||||
if (parameters.size() > 1)
|
||||
parameters[1].tryGet<UInt64>(is_bitmap_execute);
|
||||
|
||||
DataTypePtr data_type = argument_types[0];
|
||||
if (!WhichDataType(data_type).isInt())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type (Int16 or bigger) for its first argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
const DataTypePtr& data_type = argument_types[0];
|
||||
if (!WhichDataType(data_type).isNativeInt() && !WhichDataType(data_type).isString())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type (Int16 or bigger) or a string type for its first argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
if (WhichDataType(data_type).isInt8())
|
||||
throw Exception("Int8 type is not recommended! Please use Int16 or bigger size number", ErrorCodes::BAD_TYPE_OF_FIELD);
|
||||
|
||||
if (!isBitmap64(argument_types[1]))
|
||||
throw Exception("AggregateFunction " + name + " need BitMap type for its second argument", ErrorCodes::NOT_IMPLEMENTED);
|
||||
throw Exception("AggregateFunction " + name + " need BitMap type for its second argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
AggregateFunctionPtr res;
|
||||
AggregateFunctionPtr res(createWithSpecificType<Function>(*data_type, argument_types, expression, is_bitmap_execute));
|
||||
|
||||
res.reset(createWithSpecificType<Function>(*data_type, argument_types, expression, is_bitmap_execute));
|
||||
if (!res)
|
||||
throw Exception("Failed to create aggregate function " + name, ErrorCodes::AGGREGATE_FUNCTION_THROW);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
template<template <typename, typename> class Function>
|
||||
AggregateFunctionPtr createAggregateFunctionBitMapMultiExtract(
|
||||
const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
|
||||
{
|
||||
if (argument_types.size() != 2 )
|
||||
throw Exception("AggregateFunction " + name + " need two arguments", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
|
||||
|
||||
std::vector<String> expressions;
|
||||
if (!parameters.empty())
|
||||
{
|
||||
for (size_t i = 0; i < parameters.size(); i++)
|
||||
{
|
||||
String expression;
|
||||
if (!parameters[i].tryGet<String>(expression))
|
||||
throw Exception(fmt::format("AggregateFunction {} need String as its {} parameter", name, argPositionToSequence(i+1)), ErrorCodes::BAD_TYPE_OF_FIELD);
|
||||
expressions.push_back(expression);
|
||||
}
|
||||
}
|
||||
|
||||
const DataTypePtr& data_type = argument_types[0];
|
||||
|
||||
if (!WhichDataType(data_type).isNativeInt() && !WhichDataType(data_type).isString())
|
||||
throw Exception(
|
||||
"AggregateFunction " + name + " need signed numeric type (Int16 or bigger) or a string type for its first argument",
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT
|
||||
);
|
||||
|
||||
if (WhichDataType(data_type).isInt8())
|
||||
throw Exception("Int8 type is not recommended! Please use Int16 or bigger size number", ErrorCodes::BAD_TYPE_OF_FIELD);
|
||||
|
||||
if (!isBitmap64(argument_types[1]))
|
||||
throw Exception("AggregateFunction " + name + " need BitMap type for its second argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
AggregateFunctionPtr res(createWithSpecificType<Function>(*data_type, argument_types, expressions));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
template<template <typename, typename> class Function>
|
||||
AggregateFunctionPtr createAggregateFunctionBitMapMultiExtractWithDate(
|
||||
const String & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
|
||||
{
|
||||
if (argument_types.size() != 3)
|
||||
throw Exception("AggregateFunction " + name + " need two arguments", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
std::vector<String> expressions;
|
||||
if (!parameters.empty())
|
||||
{
|
||||
for (size_t i = 0; i < parameters.size(); i++)
|
||||
{
|
||||
String expression;
|
||||
if (!parameters[i].tryGet<String>(expression))
|
||||
throw Exception(fmt::format("AggregateFunction {} need String as its {} parameter", name, argPositionToSequence(i+1)), ErrorCodes::BAD_TYPE_OF_FIELD);
|
||||
expressions.push_back(expression);
|
||||
}
|
||||
}
|
||||
|
||||
const DataTypePtr& date_type = argument_types[0];
|
||||
if(!WhichDataType(date_type).isNativeInt())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type for its first argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
const DataTypePtr& data_type = argument_types[1];
|
||||
if (!WhichDataType(data_type).isNativeInt() && !WhichDataType(data_type).isString())
|
||||
throw Exception("AggregateFunction " + name + " need signed numeric type (Int16 or bigger) or a string type for its second argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
if (!isBitmap64(argument_types[2]))
|
||||
throw Exception("AggregateFunction " + name + " need BitMap type for its third argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
AggregateFunctionPtr res(createWithSpecificType<Function>(*data_type, argument_types, expressions));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
@ -173,10 +263,54 @@ AggregateFunctionPtr createAggregateFunctionBitMapExtract(const String & name, c
|
|||
|
||||
void registerAggregateFunctionsBitmapExpressionCalculation(AggregateFunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction("BitMapCount", createAggregateFunctionBitMapCount<AggregateFunctionBitMapCount>, AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerFunction("BitMapMultiCount", createAggregateFunctionBitMapMultiCount<AggregateFunctionBitMapMultiCount>, AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerFunction("BitMapMultiCountWithDate", createAggregateFunctionBitMapMultiCountWithDate, AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerFunction("BitMapExtract", createAggregateFunctionBitMapExtract<AggregateFunctionBitMapExtract>, AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerFunction(
|
||||
"BitmapCount",
|
||||
createAggregateFunctionBitMapCount<AggregateFunctionBitMapCount>,
|
||||
AggregateFunctionFactory::CaseInsensitive
|
||||
);
|
||||
|
||||
factory.registerFunction(
|
||||
"BitmapMultiCount",
|
||||
createAggregateFunctionBitMapMultiCount<AggregateFunctionBitMapMultiCount>,
|
||||
AggregateFunctionFactory::CaseInsensitive
|
||||
);
|
||||
|
||||
factory.registerFunction(
|
||||
"BitmapMultiCountWithDate",
|
||||
createAggregateFunctionBitMapMultiCountWithDate<AggregateFunctionBitMapMultiCountWithDate>,
|
||||
AggregateFunctionFactory::CaseInsensitive
|
||||
);
|
||||
|
||||
factory.registerFunction(
|
||||
"BitmapExtract",
|
||||
createAggregateFunctionBitMapExtract<AggregateFunctionBitMapExtract>,
|
||||
AggregateFunctionFactory::CaseInsensitive
|
||||
);
|
||||
|
||||
factory.registerFunction(
|
||||
"BitmapMultiExtract",
|
||||
createAggregateFunctionBitMapMultiExtract<AggregateFunctionBitMapMultiExtract>,
|
||||
AggregateFunctionFactory::CaseInsensitive
|
||||
);
|
||||
|
||||
factory.registerFunction(
|
||||
"BitmapMultiExtractWithDate",
|
||||
createAggregateFunctionBitMapMultiExtractWithDate<AggregateFunctionBitMapMultiExtractWithDate>,
|
||||
AggregateFunctionFactory::CaseInsensitive
|
||||
);
|
||||
|
||||
factory.registerAlias(
|
||||
"BitmapCountV2", "BitmapCount", AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerAlias(
|
||||
"BitmapMultiCountV2", "BitmapMultiCount", AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerAlias(
|
||||
"BitmapMultiCountWithDateV2", "BitmapMultiCountWithDate", AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerAlias(
|
||||
"BitmapExtractV2", "BitmapExtract", AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerAlias(
|
||||
"BitmapMultiExtractV2", "BitmapMultiExtract", AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerAlias(
|
||||
"BitmapMultiExtractWithDateV2", "BitmapMultiExtractWithDate", AggregateFunctionFactory::CaseInsensitive);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
|
@ -0,0 +1,138 @@
|
|||
//
|
||||
// Created by 袁宇豪 on 9/18/22.
|
||||
//
|
||||
|
||||
#include "AggregateFunctionCountByGranularity.h"
|
||||
|
||||
#include <AggregateFunctions/AggregateFunctionFactory.h>
|
||||
#include <AggregateFunctions/Helpers.h>
|
||||
#include <Common/FieldVisitors.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
extern const int ARGUMENT_OUT_OF_BOUND;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
AggregateFunctionPtr createAggregateFunctionCountByGranularity
|
||||
(const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
|
||||
{
|
||||
if (argument_types.size() != 1)
|
||||
throw Exception("Incorrect number of arguments for aggregate function " + name, ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
const IDataType & argument_type = *argument_types[0];
|
||||
WhichDataType which(argument_type);
|
||||
|
||||
if
|
||||
(
|
||||
which.isNothing()
|
||||
|| which.isArray()
|
||||
|| which.isFunction()
|
||||
|| which.isAggregateFunction()
|
||||
|| which.isMap()
|
||||
|| which.isBitmap64()
|
||||
|| which.isSet()
|
||||
|| which.isTuple()
|
||||
|| which.isInterval()
|
||||
|| which.isDecimal()
|
||||
|| which.isInt128()
|
||||
|| which.isUInt128()
|
||||
)
|
||||
{
|
||||
throw Exception("argument of " + name + " can not be "
|
||||
"(Nothing,Array,Function,"
|
||||
"AggregateFunction,Map,Bitmap64,"
|
||||
"Set,Tuple,Interval,"
|
||||
"Decimal,Int128,UInt128)", ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
else if (which.isStringOrFixedString())
|
||||
{
|
||||
//auto a =AggregateFunctionCountByGranularity<String>(argument_types, params);
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<String>>(argument_types, params);
|
||||
}
|
||||
else if (which.isInt8())
|
||||
{
|
||||
auto a =AggregateFunctionCountByGranularity<Int8>(argument_types, params);
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Int8>>(argument_types, params);
|
||||
}
|
||||
else if (which.isUInt8() || which.isEnum8())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<UInt8>>(argument_types, params);
|
||||
}
|
||||
else if (which.isInt16())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Int16>>(argument_types, params);
|
||||
}
|
||||
else if (which.isUInt16() || which.isEnum16())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<UInt16>>(argument_types, params);
|
||||
}
|
||||
else if (which.isInt32())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Int32>>(argument_types, params);
|
||||
}
|
||||
else if (which.isUInt32() || which.isDateTime())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<UInt32>>(argument_types, params);
|
||||
}
|
||||
else if (which.isInt64())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Int64>>(argument_types, params);
|
||||
}
|
||||
else if (which.isUInt64())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<UInt64>>(argument_types, params);
|
||||
}
|
||||
// TODO can't support Int128 for now
|
||||
// else if (which.isInt128())
|
||||
// {
|
||||
// return std::make_shared<AggregateFunctionCountByGranularity<Int128>>(argument_types, params);
|
||||
// }
|
||||
else if (which.isUInt128())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<UInt128>>(argument_types, params);
|
||||
}
|
||||
else if (which.isFloat32())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Float32>>(argument_types, params);
|
||||
}
|
||||
else if (which.isFloat64())
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<Float64>>(argument_types, params);
|
||||
}
|
||||
// TODO can't support Decimal for now
|
||||
// else if (which.isDecimal32())
|
||||
// {
|
||||
// return std::make_shared<AggregateFunctionCountByGranularity<Decimal32>>(argument_types, params);
|
||||
// }
|
||||
// else if (which.isDecimal64() || which.isDateTime64())
|
||||
// {
|
||||
// return std::make_shared<AggregateFunctionCountByGranularity<Decimal64>>(argument_types, params);
|
||||
// }
|
||||
// else if (which.isDecimal128())
|
||||
// {
|
||||
// return std::make_shared<AggregateFunctionCountByGranularity<Decimal128>>(argument_types, params);
|
||||
// }
|
||||
else
|
||||
{
|
||||
return std::make_shared<AggregateFunctionCountByGranularity<String>>(argument_types, params);
|
||||
}
|
||||
|
||||
__builtin_unreachable();
|
||||
}
|
||||
}
|
||||
|
||||
void registerAggregateFunctionCountByGranularity(AggregateFunctionFactory & factory)
|
||||
{
|
||||
AggregateFunctionProperties properties = { .returns_default_when_only_null = true, .is_order_dependent = false };
|
||||
factory.registerFunction("countByGranularity", {createAggregateFunctionCountByGranularity, properties}, AggregateFunctionFactory::CaseInsensitive);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,577 @@
|
|||
//
|
||||
// Created by 袁宇豪 on 9/18/22.
|
||||
//
|
||||
|
||||
#ifndef CLICKHOUSE_AGGREGATEFUNCTIONCOUNTBYGRANULARITY_H
|
||||
#define CLICKHOUSE_AGGREGATEFUNCTIONCOUNTBYGRANULARITY_H
|
||||
|
||||
#include <AggregateFunctions/IAggregateFunction.h>
|
||||
#include <Columns/ColumnArray.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Columns/ColumnDecimal.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <Columns/ColumnsNumber.h>
|
||||
#include <DataTypes/DataTypeAggregateFunction.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <Common/FieldVisitors.h>
|
||||
#include <Common/FieldVisitorConvertToNumber.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/SipHash.h>
|
||||
#include <DataTypes/DataTypeTuple.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct AggregateFunctionCountByGranularityData
|
||||
{
|
||||
using Key = T;
|
||||
|
||||
using Table = HashMap<Key, UInt64>;
|
||||
|
||||
AggregateFunctionCountByGranularityData()
|
||||
: granularity(8192), uniq_count_table(), uniq_position_table() {}
|
||||
|
||||
UInt32 granularity;
|
||||
|
||||
Table uniq_count_table;
|
||||
|
||||
Table uniq_position_table;
|
||||
|
||||
const std::unordered_map<Key, UInt64> getCountInUnorderedMap() const
|
||||
{
|
||||
std::unordered_map<Key, UInt64> result;
|
||||
for (const auto & item : uniq_count_table)
|
||||
{
|
||||
result[item.getKey()] = item.getMapped();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <typename Value>
|
||||
void addImpl(Value value, UInt64 now_position)
|
||||
{
|
||||
auto pos_iter = uniq_position_table.find(value);
|
||||
if ((pos_iter == uniq_position_table.end())
|
||||
|| (pos_iter != uniq_position_table.end() && pos_iter->getMapped() < now_position))
|
||||
{
|
||||
if (pos_iter == uniq_position_table.end())
|
||||
{
|
||||
bool is_inserted;
|
||||
uniq_position_table.emplace(value, pos_iter, is_inserted);
|
||||
}
|
||||
pos_iter->getMapped() = now_position;
|
||||
auto count_iter = uniq_count_table.find(value);
|
||||
if (count_iter != uniq_count_table.end())
|
||||
{
|
||||
count_iter->getMapped() += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
bool is_inserted;
|
||||
uniq_count_table.emplace(value, count_iter, is_inserted);
|
||||
count_iter->getMapped() = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Value>
|
||||
void addMany(const Value * __restrict ptr, const UInt8 * __restrict null_map, size_t count)
|
||||
{
|
||||
if (null_map)
|
||||
{
|
||||
addManyNotNull<Value>(ptr, null_map, count);
|
||||
}
|
||||
else
|
||||
{
|
||||
addMany<Value>(ptr, count);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Value>
|
||||
void addMany(const Value * __restrict ptr, size_t count)
|
||||
{
|
||||
size_t total_positions = (count / granularity);
|
||||
size_t remains = count - total_positions * granularity;
|
||||
for (size_t pos = 0; pos < total_positions; ++pos)
|
||||
{
|
||||
for (size_t i = 0; i < granularity; ++i)
|
||||
{
|
||||
size_t now_iter = granularity * pos + i;
|
||||
addImpl(ptr[now_iter], pos);
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < remains; ++i)
|
||||
{
|
||||
size_t now_iter = granularity * total_positions + i;
|
||||
addImpl(ptr[now_iter], total_positions);
|
||||
}
|
||||
uniq_position_table.clear();
|
||||
}
|
||||
|
||||
template <typename Value>
|
||||
void addManyNotNull(const Value * __restrict ptr, const UInt8 * __restrict null_map, size_t count)
|
||||
{
|
||||
size_t total_positions = (count / granularity);
|
||||
size_t remains = count - total_positions * granularity;
|
||||
for (size_t pos = 0; pos < total_positions; ++pos)
|
||||
{
|
||||
for (size_t i = 0; i < granularity; ++i)
|
||||
{
|
||||
size_t now_iter = granularity * pos + i;
|
||||
if (null_map[now_iter])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
addImpl(ptr[now_iter], pos);
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < remains; ++i)
|
||||
{
|
||||
size_t now_iter = granularity * total_positions + i;
|
||||
if (null_map[now_iter])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
addImpl(ptr[now_iter], total_positions);
|
||||
}
|
||||
uniq_position_table.clear();
|
||||
}
|
||||
|
||||
void merge(const AggregateFunctionCountByGranularityData<T> & other)
|
||||
{
|
||||
this->uniq_position_table.clear();
|
||||
for (const auto & item : other.uniq_count_table)
|
||||
{
|
||||
auto iter = this->uniq_count_table.find(item.getKey());
|
||||
if (iter == this->uniq_count_table.end())
|
||||
{
|
||||
bool is_inserted;
|
||||
this->uniq_count_table.emplace(item.getKey(), iter, is_inserted);
|
||||
iter->getMapped() = item.getMapped();
|
||||
}
|
||||
else
|
||||
{
|
||||
iter->getMapped() += item.getMapped();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void serialize(WriteBuffer & buf) const
|
||||
{
|
||||
writeIntBinary(granularity, buf);
|
||||
//this->uniqCountTable.write(buf);
|
||||
writeIntBinary(this->uniq_count_table.size(), buf);
|
||||
for (const auto & item : this->uniq_count_table)
|
||||
{
|
||||
writeBinary(item.getKey(), buf);
|
||||
writeIntBinary(item.getMapped(), buf);
|
||||
}
|
||||
}
|
||||
|
||||
void deserialize(ReadBuffer & buf)
|
||||
{
|
||||
this->uniq_count_table.clear();
|
||||
|
||||
readIntBinary(granularity, buf);
|
||||
//this->uniqCountTable.read(buf);
|
||||
UInt64 size;
|
||||
readIntBinary(size, buf);
|
||||
for (UInt64 i = 0; i < size; ++i)
|
||||
{
|
||||
T key;
|
||||
UInt64 count;
|
||||
readBinary(key, buf);
|
||||
readIntBinary(count, buf);
|
||||
this->uniq_count_table[key] = count;
|
||||
}
|
||||
}
|
||||
|
||||
String str() const
|
||||
{
|
||||
std::ostringstream oss;
|
||||
for (const auto & item : this->uniq_count_table)
|
||||
{
|
||||
oss << "(" << item.getKey() << ", " << item.getMapped() << ")";
|
||||
}
|
||||
return oss.str();
|
||||
}
|
||||
};
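A self-contained sketch of the counting rule that `addImpl`/`addMany` above implement: within each granule of `granularity` consecutive rows a value is counted at most once, so the result is, per value, the number of granules containing it (illustrative code, not the ByConity class):

```cpp
#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

std::unordered_map<int64_t, uint64_t>
countByGranularity(const std::vector<int64_t> & values, size_t granularity)
{
    std::unordered_map<int64_t, uint64_t> counts;       // value -> number of granules it appears in
    std::unordered_map<int64_t, size_t> last_granule;   // value -> last granule in which it was counted
    for (size_t i = 0; i < values.size(); ++i)
    {
        const size_t granule = i / granularity;
        auto [it, inserted] = last_granule.try_emplace(values[i], granule);
        if (inserted || it->second < granule)
        {
            it->second = granule;
            ++counts[values[i]];  // count the value once per granule
        }
    }
    return counts;
}
```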
|
||||
|
||||
template <>
|
||||
struct AggregateFunctionCountByGranularityData<String>
|
||||
{
|
||||
using Table = std::unordered_map<String, UInt64, std::hash<String>, std::equal_to<>, TrackAllocator<std::pair<const String, UInt64>>>;
|
||||
|
||||
AggregateFunctionCountByGranularityData()
|
||||
: granularity(8192), uniq_count_table(), uniq_position_table() {}
|
||||
|
||||
UInt32 granularity;
|
||||
|
||||
Table uniq_count_table;
|
||||
|
||||
Table uniq_position_table;
|
||||
|
||||
const std::unordered_map<String , UInt64> getCountInUnorderedMap() const
|
||||
{
|
||||
std::unordered_map<String, UInt64> result;
|
||||
for (const auto & item : uniq_count_table)
|
||||
{
|
||||
result[item.first] = item.second;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void addImpl(String value, UInt64 now_position)
|
||||
{
|
||||
auto pos_iter = uniq_position_table.find(value);
|
||||
if ((pos_iter == uniq_position_table.end())
|
||||
|| (pos_iter != uniq_position_table.end() && pos_iter->second < now_position))
|
||||
{
|
||||
if (pos_iter == uniq_position_table.end())
|
||||
{
|
||||
uniq_position_table.emplace(value, now_position);
|
||||
}
|
||||
else
|
||||
pos_iter->second = now_position;
|
||||
auto count_iter = uniq_count_table.find(value);
|
||||
if (count_iter != uniq_count_table.end())
|
||||
{
|
||||
count_iter->second += 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
uniq_count_table.emplace(value, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void addMany(const ColumnString & column, const UInt8 * __restrict null_map, size_t count)
|
||||
{
|
||||
if (null_map)
|
||||
{
|
||||
addManyNotNull(column, null_map, count);
|
||||
}
|
||||
else
|
||||
{
|
||||
addMany(column, count);
|
||||
}
|
||||
}
|
||||
|
||||
void addMany(const ColumnString & column, size_t count)
|
||||
{
|
||||
size_t total_positions = (count / granularity);
|
||||
size_t remains = count - total_positions * granularity;
|
||||
for (size_t pos = 0; pos < total_positions; ++pos)
|
||||
{
|
||||
for (size_t i = 0; i < granularity; ++i)
|
||||
{
|
||||
size_t nowIter = granularity * pos + i;
|
||||
const auto & value = column.getDataAt(nowIter);
|
||||
addImpl(value.toString(), pos);
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < remains; ++i)
|
||||
{
|
||||
size_t nowIter = granularity * total_positions + i;
|
||||
const auto & value = column.getDataAt(nowIter);
|
||||
addImpl(value.toString(), total_positions);
|
||||
}
|
||||
uniq_position_table.clear();
|
||||
}
|
||||
|
||||
void addManyNotNull(const ColumnString & column, const UInt8 * __restrict null_map, size_t count)
|
||||
{
|
||||
size_t total_positions = (count / granularity);
|
||||
size_t remains = count - total_positions * granularity;
|
||||
for (size_t pos = 0; pos < total_positions; ++pos)
|
||||
{
|
||||
for (size_t i = 0; i < granularity; ++i)
|
||||
{
|
||||
size_t nowIter = granularity * pos + i;
|
||||
if (null_map[nowIter])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
const auto & value = column.getDataAt(nowIter);
|
||||
addImpl(value.toString(), pos);
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < remains; ++i)
|
||||
{
|
||||
size_t nowIter = granularity * total_positions + i;
|
||||
if (null_map[nowIter])
|
||||
{
|
||||
continue;
|
||||
}
|
||||
const auto & value = column.getDataAt(nowIter);
|
||||
addImpl(value.toString(), total_positions);
|
||||
}
|
||||
uniq_position_table.clear();
|
||||
}
|
||||
|
||||
void merge(const AggregateFunctionCountByGranularityData<String> & other)
|
||||
{
|
||||
this->uniq_position_table.clear();
|
||||
for (const auto & item : other.uniq_count_table)
|
||||
{
|
||||
auto iter = this->uniq_count_table.find(item.first);
|
||||
if (iter == this->uniq_count_table.end())
|
||||
{
|
||||
this->uniq_count_table.emplace(item.first, item.second);
|
||||
}
|
||||
else
|
||||
{
|
||||
iter->second += item.second;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void serialize(WriteBuffer & buf) const
|
||||
{
|
||||
writeIntBinary(granularity, buf);
|
||||
//this->uniqCountTable.write(buf);
|
||||
writeIntBinary(this->uniq_count_table.size(), buf);
|
||||
for (const auto & item : this->uniq_count_table)
|
||||
{
|
||||
writeStringBinary(item.first, buf);
|
||||
writeIntBinary(item.second, buf);
|
||||
}
|
||||
}
|
||||
|
||||
void deserialize(ReadBuffer & buf)
|
||||
{
|
||||
this->uniq_count_table.clear();
|
||||
|
||||
readIntBinary(granularity, buf);
|
||||
//this->uniqCountTable.read(buf);
|
||||
UInt64 size;
|
||||
readIntBinary(size, buf);
|
||||
for (UInt64 i = 0; i < size; ++i)
|
||||
{
|
||||
String key;
|
||||
UInt64 count;
|
||||
readStringBinary(key, buf);
|
||||
readIntBinary(count, buf);
|
||||
this->uniq_count_table[key] = count;
|
||||
}
|
||||
}
|
||||
|
||||
String str() const
|
||||
{
|
||||
std::ostringstream oss;
|
||||
for (const auto & item : this->uniq_count_table)
|
||||
{
|
||||
oss << "(" << item.first << ", " << item.second << ")";
|
||||
}
|
||||
return oss.str();
|
||||
}
|
||||
};
|
||||
|
||||
using T=UInt8;
|
||||
|
||||
template <typename T>
|
||||
class AggregateFunctionCountByGranularity final
|
||||
: public IAggregateFunctionDataHelper<AggregateFunctionCountByGranularityData<T>, AggregateFunctionCountByGranularity<T>>
|
||||
{
|
||||
public:
|
||||
using ColVecType = std::conditional_t<IsDecimalNumber<T>, ColumnDecimal<T>, ColumnVector<T>>;
|
||||
|
||||
AggregateFunctionCountByGranularity(const DataTypes & argument_types_, const Array & params_)
|
||||
: IAggregateFunctionDataHelper<AggregateFunctionCountByGranularityData<T>, AggregateFunctionCountByGranularity>(argument_types_, params_)
|
||||
{
|
||||
if (!params_.empty())
|
||||
{
|
||||
if (params_.size() != 1)
|
||||
{
|
||||
throw Exception(
|
||||
"Aggregate function AggregateFunctionCountByGranularity requires one parameter or less.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
}
|
||||
|
||||
UInt64 granularity_param = applyVisitorExplicit(FieldVisitorConvertToNumber<UInt64>(), params_[0]);
|
||||
|
||||
// This range is hardcoded below
|
||||
if (granularity_param == 0)
|
||||
{
|
||||
throw Exception(
|
||||
"Parameter for aggregate function AggregateFunctionCountByGranularity is out or range: (0,].", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
|
||||
}
|
||||
granularity = granularity_param;
|
||||
}
|
||||
else
|
||||
{
|
||||
granularity = 8192;
|
||||
}
|
||||
}
|
||||
|
||||
String getName() const override { return "countByGranularity"; }
|
||||
|
||||
DataTypePtr getReturnType() const override
|
||||
{
|
||||
DataTypes types;
|
||||
if constexpr (
|
||||
std::is_same_v<T, Int8>
|
||||
|| std::is_same_v<T, UInt8>
|
||||
|| std::is_same_v<T, Int16>
|
||||
|| std::is_same_v<T, UInt16>
|
||||
|| std::is_same_v<T, Int32>
|
||||
|| std::is_same_v<T, UInt32>
|
||||
|| std::is_same_v<T, Int64>
|
||||
|| std::is_same_v<T, UInt64>
|
||||
// || std::is_same_v<T, Int128> TODO can't support Int128 for now
|
||||
|| std::is_same_v<T, UInt128>
|
||||
|| std::is_same_v<T, Float32>
|
||||
|| std::is_same_v<T, Float64>
|
||||
)
|
||||
{
|
||||
types.emplace_back(std::make_shared<DataTypeNumber<T>>()); // group by
|
||||
}
|
||||
// TODO can't support Decimal for now
|
||||
// else if constexpr (std::is_same_v<T, Decimal32> || std::is_same_v<T, Decimal64> || std::is_same_v<T, Decimal128>)
|
||||
// {
|
||||
// types.emplace_back(std::make_shared<DataTypeDecimal<T>>(DataTypeDecimal<T>::maxPrecision(), ????scale????)); // can't construct for now
|
||||
// }
|
||||
else
|
||||
{
|
||||
types.emplace_back(std::make_shared<DataTypeString>()); // group by
|
||||
}
|
||||
types.emplace_back(std::make_shared<DataTypeUInt64>()); // count
|
||||
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(types));
|
||||
}
|
||||
|
||||
void add([[maybe_unused]] AggregateDataPtr __restrict place, const IColumn **, size_t, Arena *) const override
|
||||
{
|
||||
throw Exception("Logical error: Count by granularity must run in batch mode.", ErrorCodes::LOGICAL_ERROR);
|
||||
}
|
||||
|
||||
void addBatchSinglePlace(
|
||||
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena *,[[maybe_unused]] ssize_t if_argument_pos) const override
|
||||
{
|
||||
this->data(place).granularity = this->granularity;
|
||||
if constexpr (std::is_same_v<T, String>)
|
||||
{
|
||||
const auto & string_column = static_cast<const ColumnString &>(*columns[0]);
|
||||
this->data(place).addMany(string_column, batch_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto & column = static_cast<const ColVecType &>(*columns[0]);
|
||||
this->data(place).addMany(column.getData().data(), batch_size);
|
||||
}
|
||||
}
|
||||
|
||||
void addBatchSinglePlaceNotNull(
|
||||
size_t batch_size,
|
||||
AggregateDataPtr place,
|
||||
const IColumn ** columns,
|
||||
const UInt8 * null_map,
|
||||
Arena *,
|
||||
[[maybe_unused]] ssize_t if_argument_pos) const override
|
||||
{
|
||||
this->data(place).granularity = this->granularity;
|
||||
if constexpr (std::is_same_v<T, String>)
|
||||
{
|
||||
const auto & string_column = static_cast<const ColumnString &>(*columns[0]);
|
||||
this->data(place).addManyNotNull(string_column, null_map, batch_size);
|
||||
}
|
||||
else
|
||||
{
|
||||
const auto & column = static_cast<const ColVecType &>(*columns[0]);
|
||||
this->data(place).addManyNotNull(column.getData().data(), null_map, batch_size);
|
||||
}
|
||||
}
|
||||
|
||||
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
|
||||
{
|
||||
this->data(place).merge(this->data(rhs));
|
||||
}
|
||||
|
||||
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf) const override
|
||||
{
|
||||
this->data(place).serialize(buf);
|
||||
}
|
||||
|
||||
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, Arena *) const override
|
||||
{
|
||||
this->data(place).deserialize(buf);
|
||||
}
|
||||
|
||||
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
|
||||
{
|
||||
const auto & tuples = this->data(place).uniq_count_table;
|
||||
|
||||
auto & column_res = static_cast<ColumnArray &>(to);
|
||||
auto & column_offsets = static_cast<ColumnArray::ColumnOffsets &>(column_res.getOffsetsColumn());
|
||||
|
||||
auto & tuple_in_array = static_cast<ColumnTuple &>(column_res.getData());
|
||||
|
||||
for (const auto & item : tuples)
|
||||
{
|
||||
if constexpr (
|
||||
std::is_same_v<T, Int8>
|
||||
|| std::is_same_v<T, UInt8>
|
||||
|| std::is_same_v<T, Int16>
|
||||
|| std::is_same_v<T, UInt16>
|
||||
|| std::is_same_v<T, Int32>
|
||||
|| std::is_same_v<T, UInt32>
|
||||
|| std::is_same_v<T, Int64>
|
||||
|| std::is_same_v<T, UInt64>
|
||||
// || std::is_same_v<T, Int128> TODO can't support Int128 for now
|
||||
|| std::is_same_v<T, UInt128>
|
||||
|| std::is_same_v<T, Float32>
|
||||
|| std::is_same_v<T, Float64>
|
||||
)
|
||||
{
|
||||
auto & column_group_by = static_cast<ColumnVector<T> &>(tuple_in_array.getColumn(0));
|
||||
column_group_by.insert(item.getKey());
|
||||
}
|
||||
// TODO can't support Decimal for now
|
||||
// else if constexpr (std::is_same_v<T, Decimal32> || std::is_same_v<T, Decimal64> || std::is_same_v<T, Decimal128>)
|
||||
// {
|
||||
// auto & column_group_by = static_cast<ColumnDecimal<T> &>(tuple_in_array.getColumn(0));
|
||||
// column_group_by.insert(item.getKey());
|
||||
// }
|
||||
else
|
||||
{
|
||||
auto & column_group_by = static_cast<ColumnString &>(tuple_in_array.getColumn(0));
|
||||
std::ostringstream oss;
|
||||
oss << item.first;
|
||||
column_group_by.insert(oss.str());
|
||||
}
|
||||
|
||||
if constexpr (std::is_same_v<T, String>)
|
||||
{
|
||||
auto & column_count = static_cast<ColumnUInt64 &>(tuple_in_array.getColumn(1));
|
||||
column_count.insert(item.second);
|
||||
}
|
||||
else
|
||||
{
|
||||
auto & column_count = static_cast<ColumnUInt64 &>(tuple_in_array.getColumn(1));
|
||||
column_count.insert(item.getMapped());
|
||||
}
|
||||
}
|
||||
column_offsets.getData().push_back(column_res.getData().size());
|
||||
}
|
||||
|
||||
bool allocatesMemoryInArena() const override
|
||||
{
|
||||
return false;
|
||||
}
|
||||
private:
|
||||
UInt32 granularity;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif //CLICKHOUSE_AGGREGATEFUNCTIONCOUNTBYGRANULARITY_H
|
|
@ -579,6 +579,57 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
UInt64 rb_offset_limit(UInt64 offset, UInt64 limit, RoaringBitmapWithSmallSet & r1) const
|
||||
{
|
||||
if (limit == 0 || offset >= size())
|
||||
return 0;
|
||||
|
||||
if (isSmall())
|
||||
{
|
||||
UInt64 offset_count = 0;
|
||||
std::vector<T> answer;
|
||||
for (const auto & x : small)
|
||||
{
|
||||
T val = x.getValue();
|
||||
if (offset_count >= offset)
|
||||
{
|
||||
answer.push_back(val);
|
||||
} else {
|
||||
offset_count++;
|
||||
}
|
||||
}
|
||||
if (limit < answer.size())
|
||||
{
|
||||
std::nth_element(answer.begin(), answer.begin() + limit, answer.end());
|
||||
answer.resize(limit);
|
||||
}
|
||||
|
||||
for (const auto & elem : answer)
|
||||
r1.add(elem);
|
||||
return answer.size();
|
||||
}
|
||||
else
|
||||
{
|
||||
UInt64 count = 0;
|
||||
UInt64 offset_count = 0;
|
||||
for (auto it = rb->begin(); it != rb->end(); ++it)
|
||||
{
|
||||
offset_count++;
|
||||
if (offset_count <= offset)
|
||||
continue;
|
||||
|
||||
if (count < limit)
|
||||
{
|
||||
r1.add(*it);
|
||||
++count;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
}
|
||||
|
||||
UInt64 rb_min() const
|
||||
{
|
||||
if (isSmall())
|
||||
|
|
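The new `rb_offset_limit` above pages through bitmap members: skip `offset` of them, copy at most `limit` into the output bitmap, and return how many were copied; the small-set branch additionally trims the survivors to the `limit` smallest with `nth_element`, since the small set is unordered. A self-contained sketch of the same semantics over an ordered set (illustrative names):

```cpp
#include <cstdint>
#include <set>

/// Skip `offset` members in ascending order, copy at most `limit` of the rest into `out`,
/// and return the number copied.
uint64_t offsetLimit(const std::set<uint64_t> & in, uint64_t offset, uint64_t limit, std::set<uint64_t> & out)
{
    if (limit == 0 || offset >= in.size())
        return 0;

    uint64_t skipped = 0;
    uint64_t copied = 0;
    for (uint64_t value : in)
    {
        if (skipped < offset)
        {
            ++skipped;
            continue;
        }
        if (copied == limit)
            break;
        out.insert(value);
        ++copied;
    }
    return copied;
}
```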
|
@ -185,6 +185,8 @@ void CnchRefreshMaterializedViewThread::runImpl()
bool CnchRefreshMaterializedViewThread::constructAndScheduleRefreshTasks(StoragePtr & istorage, StorageMaterializedView & storage)
{
    ContextMutablePtr query_context = Context::createCopy(getContext());
    query_context->makeQueryContext();
    query_context->makeSessionContext();
    auto refresh_params = storage.getAsyncRefreshParams(query_context, false);
    std::vector<String> task_ids = {};

@ -296,19 +298,19 @@ String CnchRefreshMaterializedViewThread::executeTaskLocal(
        task_id = task_id,
        mv_refresh_param = mv_refresh_param,
        command_context = Context::createCopy(query_context)]() {
        auto settings = query_context->getSettings();
        auto user_password = const_cast<const Context &> (*command_context).getCnchInterserverCredentials();
        command_context->setCurrentTransaction(nullptr, false);
        command_context->setCurrentVW(nullptr);
        command_context->setCurrentWorkerGroup(nullptr);
        command_context->makeSessionContext();
        command_context->makeQueryContext();
        auto settings = query_context->getSettings();
        command_context->setSettings(settings);
        CurrentThread::get().pushTenantId(command_context->getSettingsRef().tenant_id);

        auto user_password = const_cast<const Context &> (*command_context).getCnchInterserverCredentials();
        command_context->setTenantId(command_context->getSettingsRef().tenant_id);
        command_context->setUser(user_password.first, user_password.second, Poco::Net::SocketAddress{});
        command_context->setCurrentQueryId(task_id);

        command_context->makeSessionContext();
        command_context->makeQueryContext();

        storage.refreshAsync(mv_refresh_param, command_context);
        command_context->setCurrentTransaction(nullptr);
    });

@ -369,6 +369,8 @@ void CnchServerClient::redirectCommitParts(
{
    auto timer = ProfileEventsTimer(ProfileEvents::ServerRpcRequest, ProfileEvents::ServerRpcElaspsedMicroseconds);
    brpc::Controller cntl;
    if (const auto * storage = dynamic_cast<const MergeTreeMetaBase *>(table.get()))
        cntl.set_timeout_ms(storage->getSettings()->cnch_meta_rpc_timeout_ms);
    Protos::RedirectCommitPartsReq request;
    Protos::RedirectCommitPartsResp response;

@ -566,27 +566,28 @@ void CnchWorkerServiceImpl::preloadDataParts(
        auto & cloud_merge_tree = dynamic_cast<StorageCloudMergeTree &>(*storage);
        auto data_parts = createPartVectorFromModelsForSend<MutableMergeTreeDataPartCNCHPtr>(cloud_merge_tree, request->parts());

        auto preload_level = request->preload_level();
        auto submit_ts = request->submit_ts();
        auto sync = request->sync();
        auto read_injection = request->read_injection();

        LOG_TRACE(
            log,
            "Receiving preload parts task level = {}, sync = {}, current table preload setting: parts_preload_level = {}, "
            "enable_preload_parts = {}, enable_parts_sync_preload = {}, enable_local_disk_cache = {}, enable_nexus_fs = {}",
            request->preload_level(),
            request->sync(),
            preload_level,
            sync,
            cloud_merge_tree.getSettings()->parts_preload_level.value,
            cloud_merge_tree.getSettings()->enable_preload_parts.value,
            cloud_merge_tree.getSettings()->enable_parts_sync_preload,
            cloud_merge_tree.getSettings()->enable_local_disk_cache,
            cloud_merge_tree.getSettings()->enable_nexus_fs);

        if (!request->preload_level()
        if (!preload_level
            || (!cloud_merge_tree.getSettings()->parts_preload_level && !cloud_merge_tree.getSettings()->enable_preload_parts))
            return;

        auto preload_level = request->preload_level();
        auto submit_ts = request->submit_ts();
        auto read_injection = request->read_injection();

        if (request->sync())
        if (sync)
        {
            auto & settings = getContext()->getSettingsRef();
            auto pool = std::make_unique<ThreadPool>(std::min(data_parts.size(), settings.cnch_parallel_preloading.value));

@ -599,26 +600,27 @@ void CnchWorkerServiceImpl::preloadDataParts(
                });
            }
            pool->wait();
            LOG_DEBUG(
                log,
                "Finish preload tasks in {} ms, level: {}, sync: {}, size: {}",
                watch.elapsedMilliseconds(),
                preload_level,
                sync,
                data_parts.size());
        }
        else
        {
            ThreadPool * preload_thread_pool = &(IDiskCache::getPreloadPool());
            for (const auto & part : data_parts)
            {
                preload_thread_pool->scheduleOrThrowOnError([part, preload_level, submit_ts, read_injection, storage] {
                preload_thread_pool->trySchedule([part, preload_level, submit_ts, read_injection, storage] {
                    part->remote_fs_read_failed_injection = read_injection;
                    part->disk_cache_mode = DiskCacheMode::SKIP_DISK_CACHE; // avoid getChecksum & getIndex re-cache
                    part->preload(preload_level, submit_ts);
                });
            }
        }
        LOG_DEBUG(
            log,
            "Finish preload table {} tasks in {} ms, level: {}, sync: {}, size: {}",
            cloud_merge_tree.getCnchStorageID().getNameForLogs(),
            watch.elapsedMilliseconds(),
            preload_level,
            sync,
            data_parts.size());
    })
}

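The sync branch above follows a common pattern: size a pool by min(number of parts, configured parallelism), schedule one job per part, then wait before logging the elapsed time. A rough standalone sketch of that pattern with standard threads follows; the names are generic stand-ins, not the ByConity `ThreadPool` API.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

/// Run all `jobs` on at most `max_threads` worker threads and block until done,
/// roughly what pool->scheduleOrThrowOnError(...) + pool->wait() achieve above.
void runAllAndWait(const std::vector<std::function<void()>> & jobs, size_t max_threads)
{
    const size_t workers = std::min(jobs.size(), max_threads);
    std::vector<std::thread> pool;
    pool.reserve(workers);
    for (size_t w = 0; w < workers; ++w)
        pool.emplace_back([&jobs, w, workers]
        {
            /// static striping instead of a real work queue
            for (size_t i = w; i < jobs.size(); i += workers)
                jobs[i]();
        });
    for (auto & t : pool)
        t.join();
}
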
@ -16,6 +16,39 @@ struct StaticVisitor
    using ResultType = R;
};

/// F is template parameter, to allow universal reference for field, that is useful for const and non-const values.
template <typename Visitor, typename F>
typename std::decay_t<Visitor>::ResultType applyVisitorExplicit(Visitor && visitor, F && field)
{
    switch (field.getType())
    {
        case Field::Types::Null: return visitor(field.template get<Null>());
        case Field::Types::UInt64: return visitor(field.template get<UInt64>());
        case Field::Types::UInt128: return visitor(field.template get<UInt128>());
        case Field::Types::UInt256: return visitor(field.template get<UInt256>());
        case Field::Types::Int64: return visitor(field.template get<Int64>());
        case Field::Types::Float64: return visitor(field.template get<Float64>());
        case Field::Types::String: return visitor(field.template get<String>());
        case Field::Types::Array: return visitor(field.template get<Array>());
        case Field::Types::Tuple: return visitor(field.template get<Tuple>());
        case Field::Types::Decimal32: return visitor(field.template get<DecimalField<Decimal32>>());
        case Field::Types::Decimal64: return visitor(field.template get<DecimalField<Decimal64>>());
        case Field::Types::Decimal128: return visitor(field.template get<DecimalField<Decimal128>>());
        case Field::Types::Decimal256: return visitor(field.template get<DecimalField<Decimal256>>());
        case Field::Types::AggregateFunctionState: return visitor(field.template get<AggregateFunctionStateData>());
#ifdef HAVE_BOO_TYPE
        case Field::Types::Bool:
            return visitor(field.template get<bool>());
#endif
        case Field::Types::Object: return visitor(field.template get<Object>());
        case Field::Types::Map: return visitor(field.template get<Map>());
        case Field::Types::BitMap64:
            return visitor(field.template get<BitMap64>());

        default:
            throw Exception("Bad type of Field", ErrorCodes::BAD_TYPE_OF_FIELD);
    }
}

/// F is template parameter, to allow universal reference for field, that is useful for const and non-const values.
template <typename Visitor, typename F>

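As a usage note for `applyVisitorExplicit`: a visitor derives from `StaticVisitor<R>` and provides one `operator()` per `Field` alternative it cares about; the switch above forwards the concrete stored value. A hedged sketch (the `FieldKindName` visitor below is illustrative and not part of the diff):

/// Illustrative visitor: returns a short tag for the stored Field kind.
/// A catch-all template overload keeps it compilable for every case in the
/// switch above without spelling out all alternatives.
struct FieldKindName : StaticVisitor<String>
{
    String operator()(const Null &) const { return "null"; }
    String operator()(const UInt64 & x) const { return "uint64(" + std::to_string(x) + ")"; }
    String operator()(const String &) const { return "string"; }

    template <typename T>
    String operator()(const T &) const { return "other"; }
};

/// Usage: String kind = applyVisitorExplicit(FieldKindName(), field);
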
@ -0,0 +1,16 @@
#include <Common/MemoryTrackerBlockerInThread.h>

// MemoryTrackerBlockerInThread
thread_local uint64_t MemoryTrackerBlockerInThread::counter;
thread_local VariableContext MemoryTrackerBlockerInThread::level;
MemoryTrackerBlockerInThread::MemoryTrackerBlockerInThread(VariableContext level_)
    : previous_level(level)
{
    ++counter;
    level = level_;
}
MemoryTrackerBlockerInThread::~MemoryTrackerBlockerInThread()
{
    --counter;
    level = previous_level;
}

@ -0,0 +1,29 @@
#pragma once
#include <cstdint>
#include <Common/VariableContext.h>


/// To be able to temporarily stop memory tracking from current thread.
struct MemoryTrackerBlockerInThread
{
private:
    static thread_local uint64_t counter;
    static thread_local VariableContext level;

    VariableContext previous_level;
public:
    /// level_ - block in level and above
    explicit MemoryTrackerBlockerInThread(VariableContext level_ = VariableContext::User);
    ~MemoryTrackerBlockerInThread();

    MemoryTrackerBlockerInThread(const MemoryTrackerBlockerInThread &) = delete;
    MemoryTrackerBlockerInThread & operator=(const MemoryTrackerBlockerInThread &) = delete;

    static bool isBlocked(VariableContext current_level)
    {
        return counter > 0 && current_level >= level;
    }

    friend class MemoryTracker;
    friend struct AllocationTrace;
};

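A small usage sketch of the new RAII helper: while an instance is alive, `isBlocked()` reports allocations on the current thread as blocked for trackers at the chosen level and above, and the previous level is restored on destruction. The surrounding function and the helper it calls are hypothetical.

#include <Common/MemoryTrackerBlockerInThread.h>

void rebuildInMemoryIndex(); /// hypothetical helper, assumed to allocate a lot

/// Hypothetical caller: exclude the rebuild from per-query memory accounting
/// on the current thread.
void rebuildInMemoryIndexUntracked()
{
    MemoryTrackerBlockerInThread blocker; /// default level is VariableContext::User

    /// While `blocker` is alive, MemoryTrackerBlockerInThread::isBlocked(level)
    /// returns true for User level and above on this thread; the previous level
    /// is restored when `blocker` is destroyed.
    rebuildInMemoryIndex();
}
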
@ -274,6 +274,7 @@ std::unordered_set<String> SettingsChanges::WHITELIST_SETTINGS =
    "enable_sync_build_bitmap",
    "enable_sync_fetch",
    "enable_sync_from_ha",
    "enable_table_scan_build_pipeline_optimization",
    "enable_testlog_to_console",
    "enable_unaligned_array_join",
    "enable_variadic_arraySetCheck",

@ -79,6 +79,7 @@ enum PreloadLevelSettings : UInt64
      "The maximum size of blocks of uncompressed data before compressing for writing to a table.", \
      0) \
    M(UInt64, max_block_size, DEFAULT_BLOCK_SIZE, "Maximum block size for reading", 0) \
    M(UInt64, min_block_size, 1024, "Minimum block size for reading", 0) \
    M(UInt64, max_insert_block_size, DEFAULT_INSERT_BLOCK_SIZE, "The maximum block size for insertion, if we control the creation of blocks for insertion.", 0) \
    M(UInt64, max_insert_block_size_bytes, DEFAULT_BLOCK_SIZE_BYTES, "The maximum block bytes for insertion, if we control the creation of blocks for insertion.", 0) \
    M(UInt64, min_insert_block_size_rows, DEFAULT_INSERT_BLOCK_SIZE, "Squash blocks passed to INSERT query to specified size in rows, if blocks are not big enough.", 0) \

@ -314,8 +315,6 @@ enum PreloadLevelSettings : UInt64
    M(UInt64, optimize_skip_unused_shards_nesting, 0, "Same as optimize_skip_unused_shards, but accept nesting level until which it will work.", 0) \
    M(UInt64, force_optimize_skip_unused_shards_nesting, 0, "Same as force_optimize_skip_unused_shards, but accept nesting level until which it will work.", 0) \
    \
    M(Bool, use_sync_pipeline_executor, false, "Whether to use sync pipeline executor", 0) \
    \
    M(Bool, input_format_parallel_parsing, true, "Enable parallel parsing for some data formats.", 0) \
    M(UInt64, \
      min_chunk_bytes_for_parallel_parsing, \

@ -807,13 +806,10 @@ enum PreloadLevelSettings : UInt64
      "will create several shared dictionaries.", \
      0) \
    M(Bool, decimal_check_overflow, true, "Check overflow of decimal arithmetic/comparison operations", 0) \
    \
    M(Bool, \
      prefer_localhost_replica, \
      1, \
      "1 - always send query to local replica, if it exists. 0 - choose replica to send query between local and remote ones according to " \
      "load_balancing", \
      0) \
    \
    M(Bool, size_predictor_estimate_lc_size_by_fullstate, true, "Using estimate size of fullstate LowCardinality in size predictor", 0) \
    \
    M(Bool, prefer_localhost_replica, 1, "1 - always send query to local replica, if it exists. 0 - choose replica to send query between local and remote ones according to load_balancing", 0) \
    M(UInt64, max_fetch_partition_retries_count, 5, "Amount of retries while fetching partition from another host.", 0) \
    M(UInt64, \
      http_max_multipart_form_data_size, \

@ -1548,6 +1544,10 @@ enum PreloadLevelSettings : UInt64
    M(Float, ab_test_traffic_factor, 0, "Proportion of queries that perform ab test, meaningful between 0 and 1", 0) \
    M(String, ab_test_profile, "default", "Profile name for ab test", 0) \
    M(Bool, optimize_json_function_to_subcolumn, false, "Whether to optimize json extract functions to subcolumn read", 0) \
    /** Point lookup optimizations */ \
    M(Bool, enable_point_lookup_profile, false, "Whether to enable settings for point-lookup queries, If true, the settings from point_lookup_profile are applied in order to improve QPS.", 0) \
    M(String, point_lookup_profile, "", "Name of the setting profile to apply when enable_point_lookup_profile is true. If empty, will apply engine's default settings for point-lookup queries. If not empty but the profile doesn't exist, will also fallback to engine's default settings", 0) \
    M(Bool, use_sync_pipeline_executor, false, "Whether to use sync pipeline executor", 0) \
    /** Optimizer relative settings, statistics */ \
    M(Bool, create_stats_time_output, true, "Enable time output in create stats, should be disabled at regression test", 0) \
    M(Bool, statistics_forward_query, false, "Indicate whether this query is coming from another replica", 0) \

@ -46,10 +46,7 @@ public:

    bool isParametric() const override { return false; }
    bool haveSubtypes() const override { return false; }
    bool isComparable() const override { return true; }
    bool canBeComparedWithCollation() const override { return true; }
    bool isValueUnambiguouslyRepresentedInContiguousMemoryRegion() const override { return true; }
    bool isCategorial() const override { return true; }
    bool isComparable() const override { return false; }
    bool canBeInsideNullable() const override { return false; }
    bool canBeInsideLowCardinality() const override { return false; }

@ -561,6 +561,13 @@ inline bool isFloat(const T & data_type)
    return which.isFloat();
}

template <typename T>
inline bool isNativeUInt(const T & data_type)
{
    WhichDataType which(data_type);
    return which.isNativeUInt();
}

template <typename T>
inline bool isNativeInteger(const T & data_type)
{

@ -35,6 +35,8 @@ REGISTER_FUNCTION(Bitmap)
    factory.registerFunction<FunctionBitmapToArray>();
    factory.registerFunction<FunctionBitmapSubsetInRange>();
    factory.registerFunction<FunctionBitmapSubsetLimit>();
    factory.registerFunction<FunctionBitmapSubsetOffsetLimit>();
    factory.registerFunction<FunctionSubBitmapStartsFromOne>();
    factory.registerFunction<FunctionBitmapTransform>();

    factory.registerFunction<FunctionBitmapSelfCardinality>();

@ -50,6 +50,7 @@ namespace ErrorCodes
{
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int LOGICAL_ERROR;
    extern const int ZERO_ARRAY_OR_TUPLE_INDEX;
}

/** Bitmap functions.

@ -706,8 +707,83 @@ public:
    }
};

struct BitmapSubsetOffsetLimitImpl
{
public:
    static constexpr auto name = "subBitmap";
    template <typename T>
    static void apply(
        const AggregateFunctionGroupBitmapData<T> & bitmap_data_0,
        UInt64 range_start,
        UInt64 range_end,
        AggregateFunctionGroupBitmapData<T> & bitmap_data_2)
    {
        bitmap_data_0.rbs.rb_offset_limit(range_start, range_end, bitmap_data_2.rbs);
    }

    static BitMap64 apply(const BitMap64 & bitmap, UInt64 offset, UInt64 card_limit)
    {
        if (bitmap.isEmpty() || bitmap.cardinality() <= offset || card_limit == 0)
            return BitMap64();

        PODArray<UInt64> res_array;
        UInt64 count = 0;
        UInt64 offset_count = 0;
        auto it = bitmap.begin();
        for (;it != bitmap.end() && offset_count < offset; ++it)
            ++offset_count;

        for (; it != bitmap.end() && count < card_limit; ++it, ++count)
            res_array.emplace_back(*it);
        return BitMap64(res_array.size(), res_array.data());
    }
};

struct SubBitmapStartsFromOneImpl
{
    static constexpr auto name = "subBitmapStartsFromOne";

    template <typename T>
    static void apply(
        const AggregateFunctionGroupBitmapData<T> & bitmap_data_0,
        UInt64 range_start,
        UInt64 range_end,
        AggregateFunctionGroupBitmapData<T> & bitmap_data_2)
    {
        if (range_start == 0)
            throw Exception("Indices in bitmap are 1-based, same as subString", ErrorCodes::ZERO_ARRAY_OR_TUPLE_INDEX);

        --range_start;
        bitmap_data_0.rbs.rb_offset_limit(range_start, range_end, bitmap_data_2.rbs);
    }

    static BitMap64 apply(const BitMap64 & bitmap, UInt64 offset, UInt64 card_limit)
    {
        if (offset == 0)
            throw Exception("Indices in bitmap are 1-based, same as subString", ErrorCodes::ZERO_ARRAY_OR_TUPLE_INDEX);

        --offset;

        if (bitmap.isEmpty() || bitmap.cardinality() <= offset || card_limit == 0)
            return BitMap64();

        PODArray<UInt64> res_array;
        UInt64 count = 0;
        UInt64 offset_count = 0;
        auto it = bitmap.begin();
        for (;it != bitmap.end() && offset_count < offset; ++it)
            ++offset_count;

        for (; it != bitmap.end() && count < card_limit; ++it, ++count)
            res_array.emplace_back(*it);
        return BitMap64(res_array.size(), res_array.data());
    }
};

using FunctionBitmapSubsetInRange = FunctionBitmapSubset<BitmapSubsetInRangeImpl>;
using FunctionBitmapSubsetLimit = FunctionBitmapSubset<BitmapSubsetLimitImpl>;
using FunctionBitmapSubsetOffsetLimit = FunctionBitmapSubset<BitmapSubsetOffsetLimitImpl>;
using FunctionSubBitmapStartsFromOne = FunctionBitmapSubset<SubBitmapStartsFromOneImpl>;


class FunctionBitmapTransform : public IFunction

@ -847,16 +923,19 @@ private:
            if (from_end - from_start != to_end - to_start)
                throw Exception("From array size and to array size mismatch", ErrorCodes::LOGICAL_ERROR);

            auto & bitmap = column_bitmap->getBitMapAt( is_column_const[0] ? 0ULL : i );
            /// get a copy of the original bitmap
            auto bitmap = column_bitmap->getBitMapAt( is_column_const[0] ? 0ULL : i );

            for (size_t j = from_start; j < from_end; ++j)
            {
                if (from_container[j] == to_container[j])
                    continue;
                bool changed = const_cast<BitMap64 &>(bitmap).removeChecked(from_container[j]);
                bool changed = bitmap.removeChecked(from_container[j]);
                if (changed)
                    const_cast<BitMap64 &>(bitmap).add(to_container[j]);
                    bitmap.add(to_container[j]);
            }

            col_to->insert(bitmap);
        }
        return col_to;
    }

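The two subset entry points added earlier in this file differ only in how the offset is interpreted: `subBitmap` skips `offset` elements (0-based), while `subBitmapStartsFromOne` treats the offset as a 1-based position and rejects 0, mirroring `subString`. A plain-vector sketch of the expected results follows; the helpers are stand-ins, not the `BitMap64` API.

#include <cstdint>
#include <stdexcept>
#include <vector>

/// Stand-in for subBitmap: skip `offset` elements of the sorted value list,
/// then take at most `limit` of the remaining ones.
std::vector<uint64_t> subValues(const std::vector<uint64_t> & sorted, uint64_t offset, uint64_t limit)
{
    std::vector<uint64_t> out;
    for (uint64_t i = offset; i < sorted.size() && out.size() < limit; ++i)
        out.push_back(sorted[i]);
    return out;
}

/// Stand-in for subBitmapStartsFromOne: same, but the offset is a 1-based position.
std::vector<uint64_t> subValuesStartsFromOne(const std::vector<uint64_t> & sorted, uint64_t offset, uint64_t limit)
{
    if (offset == 0)
        throw std::invalid_argument("Indices are 1-based, same as subString");
    return subValues(sorted, offset - 1, limit);
}

/// With sorted = {1,2,3,4,5}: subValues(sorted, 2, 2)              -> {3, 4}
///                            subValuesStartsFromOne(sorted, 2, 2) -> {2, 3}
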
@ -114,7 +114,7 @@ public:
            throw Exception{"The first argument of function " + String(Name::name) + " should be a string containing JSON, illegal type: " + first_column.type->getName(),
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};

        const ColumnPtr & arg_json = recursiveAssumeNotNullable(first_column.column);
        const ColumnPtr & arg_json = recursiveAssumeNotNullable(recursiveRemoveLowCardinality(first_column.column));
        const auto * col_json_const = typeid_cast<const ColumnConst *>(arg_json.get());
        const auto * col_json_string
            = typeid_cast<const ColumnString *>(col_json_const ? col_json_const->getDataColumnPtr().get() : arg_json.get());

@ -510,9 +510,11 @@ public:
    String getName() const override { return Name::name; }
    bool useDefaultImplementationForNulls() const override { return false; }
    bool useDefaultImplementationForConstants() const override { return true; }
    // bool useDefaultImplementationForLowCardinalityColumns() const override { return false; }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
    {
        NullPresence lc_null_presence = getNullPresense(arguments);
        if (null_presence.has_null_constant)
            return result_type->createColumnConstWithDefaultValue(input_rows_count);

@ -520,6 +522,10 @@ public:
        auto temporary_result = Derived::run(temp_arguments, json_return_type, input_rows_count);
        if (null_presence.has_nullable)
            return wrapInNullable(temporary_result, arguments, result_type, input_rows_count);

        if (lc_null_presence.has_nullable)
            return wrapInNullable(temporary_result, arguments, result_type, input_rows_count);

        return temporary_result;
    }

@ -750,6 +756,7 @@ public:
    bool isVariadic() const override { return true; }
    size_t getNumberOfArguments() const override { return 0; }
    bool useDefaultImplementationForNulls() const override { return false; }
    // bool useDefaultImplementationForLowCardinalityColumns() const override { return false; }

    FunctionBasePtr build(const ColumnsWithTypeAndName & arguments) const override
    {

@ -763,6 +770,8 @@ public:

        const auto & first_column = arguments[0];
        auto first_type_base = removeNullable(removeLowCardinality(first_column.type));
        auto first_type_is_lc_null = first_column.type->isLowCardinalityNullable();
        auto first_type_is_lc = WhichDataType(first_column.type).isLowCardinality();

        bool is_string = isString(first_type_base);
        bool is_object = isObject(first_type_base);

@ -783,6 +792,10 @@ public:
            return_type = makeNullable(std::make_shared<DataTypeNothing>());
        else if (null_presence.has_nullable)
            return_type = makeNullable(json_return_type);
        else if (first_type_is_lc_null)
            return_type = std::make_shared<DataTypeLowCardinality>(makeNullable(json_return_type));
        else if (first_type_is_lc)
            return_type = std::make_shared<DataTypeLowCardinality>(json_return_type);
        else
            return_type = json_return_type;

@ -0,0 +1,222 @@
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionsRandom.h>
#include <Functions/FunctionHelpers.h>
#include <Core/ServerUUID.h>
#include <Poco/Logger.h>
#include <Common/ErrorCodes.h>
#include <common/logger_useful.h>
#include <common/types.h>

namespace DB
{

namespace
{

/* Snowflake ID
  https://en.wikipedia.org/wiki/Snowflake_ID

   0                   1                   2                   3
   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
  ├─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
  |0|                         timestamp                           |
  ├─┼                 ┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┼─┤
  |                   |     machine_id    |    machine_seq_num    |
  └─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘

- The first 41 (+ 1 top zero bit) bits is the timestamp (millisecond since Unix epoch 1 Jan 1970)
- The middle 10 bits are the machine ID
- The last 12 bits are a counter to disambiguate multiple snowflakeIDs generated within the same millisecond by different processes
*/

/// bit counts
constexpr auto timestamp_bits_count = 41;
constexpr auto machine_id_bits_count = 10;
constexpr auto machine_seq_num_bits_count = 12;

/// bits masks for Snowflake ID components
constexpr uint64_t machine_id_mask = ((1ull << machine_id_bits_count) - 1) << machine_seq_num_bits_count;
constexpr uint64_t machine_seq_num_mask = (1ull << machine_seq_num_bits_count) - 1;

/// max values
constexpr uint64_t max_machine_seq_num = machine_seq_num_mask;

uint64_t getTimestamp()
{
    auto now = std::chrono::system_clock::now();
    auto ticks_since_epoch = std::chrono::duration_cast<std::chrono::milliseconds>(now.time_since_epoch()).count();
    return static_cast<uint64_t>(ticks_since_epoch) & ((1ull << timestamp_bits_count) - 1);
}

uint64_t getMachineIdImpl()
{
    UUID server_uuid = ServerUUID::get();
    /// hash into 64 bits
    uint64_t hi = UUIDHelpers::getHighBytes(server_uuid);
    uint64_t lo = UUIDHelpers::getLowBytes(server_uuid);
    /// return only 10 bits
    return (((hi * 11) ^ (lo * 17)) & machine_id_mask) >> machine_seq_num_bits_count;
}

uint64_t getMachineId()
{
    static uint64_t machine_id = getMachineIdImpl();
    return machine_id;
}

struct SnowflakeId
{
    uint64_t timestamp;
    uint64_t machine_id;
    uint64_t machine_seq_num;
};

SnowflakeId toSnowflakeId(uint64_t snowflake)
{
    return {.timestamp = (snowflake >> (machine_id_bits_count + machine_seq_num_bits_count)),
            .machine_id = ((snowflake & machine_id_mask) >> machine_seq_num_bits_count),
            .machine_seq_num = (snowflake & machine_seq_num_mask)};
}

uint64_t fromSnowflakeId(SnowflakeId components)
{
    return (components.timestamp << (machine_id_bits_count + machine_seq_num_bits_count) |
            components.machine_id << (machine_seq_num_bits_count) |
            components.machine_seq_num);
}

struct SnowflakeIdRange
{
    SnowflakeId begin; /// inclusive
    SnowflakeId end; /// exclusive
};

/// To get the range of `input_rows_count` Snowflake IDs from `max(available, now)`:
/// 1. calculate Snowflake ID by current timestamp (`now`)
/// 2. `begin = max(available, now)`
/// 3. Calculate `end = begin + input_rows_count` handling `machine_seq_num` overflow
SnowflakeIdRange getRangeOfAvailableIds(const SnowflakeId & available, uint64_t machine_id, size_t input_rows_count)
{
    /// 1. `now`
    SnowflakeId begin = {.timestamp = getTimestamp(), .machine_id = machine_id, .machine_seq_num = 0};

    /// 2. `begin`
    if (begin.timestamp <= available.timestamp)
    {
        begin.timestamp = available.timestamp;
        begin.machine_seq_num = available.machine_seq_num;
    }

    /// 3. `end = begin + input_rows_count`
    SnowflakeId end;
    const uint64_t seq_nums_in_current_timestamp_left = (max_machine_seq_num - begin.machine_seq_num + 1);
    if (input_rows_count >= seq_nums_in_current_timestamp_left)
        /// if sequence numbers in current timestamp is not enough for rows --> depending on how many elements input_rows_count overflows, forward timestamp by at least 1 tick
        end.timestamp = begin.timestamp + 1 + (input_rows_count - seq_nums_in_current_timestamp_left) / (max_machine_seq_num + 1);
    else
        end.timestamp = begin.timestamp;

    end.machine_id = begin.machine_id;
    end.machine_seq_num = (begin.machine_seq_num + input_rows_count) & machine_seq_num_mask;

    return {begin, end};
}

struct Data
{
    /// Guarantee counter monotonicity within one timestamp across all threads generating Snowflake IDs simultaneously.
    static inline std::atomic<uint64_t> lowest_available_snowflake_id = 0;

    SnowflakeId reserveRange(uint64_t machine_id, size_t input_rows_count)
    {
        uint64_t available_snowflake_id = lowest_available_snowflake_id.load();
        SnowflakeIdRange range;
        do
        {
            range = getRangeOfAvailableIds(toSnowflakeId(available_snowflake_id), machine_id, input_rows_count);
        }
        while (!lowest_available_snowflake_id.compare_exchange_weak(available_snowflake_id, fromSnowflakeId(range.end)));
        /// CAS failed --> another thread updated `lowest_available_snowflake_id` and we re-try
        /// else --> our thread reserved ID range [begin, end) and return the beginning of the range

        return range.begin;
    }
};

}

class FunctionGenerateSnowflakeID : public IFunction
{
public:
    static constexpr auto name = "generateSnowflakeID";

    static FunctionPtr create(ContextPtr /*context*/) { return std::make_shared<FunctionGenerateSnowflakeID>(); }

    String getName() const override { return name; }
    size_t getNumberOfArguments() const override { return 0; }
    bool isDeterministic() const override { return false; }
    bool isDeterministicInScopeOfQuery() const override { return false; }
    bool useDefaultImplementationForNulls() const override { return false; }
    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
    bool isVariadic() const override { return true; }

    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
    {
        FunctionArgumentDescriptors mandatory_args;
        FunctionArgumentDescriptors optional_args{
            {"expr", nullptr, nullptr, "Arbitrary expression"},
            {"machine_id", &isNativeUInt, &isColumnConst, "const UInt*"}
        };
        validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args);

        return std::make_shared<DataTypeUInt64>();
    }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
        auto col_res = ColumnVector<UInt64>::create();
        typename ColumnVector<UInt64>::Container & vec_to = col_res->getData();

        if (input_rows_count > 0)
        {
            vec_to.resize(input_rows_count);

            uint64_t machine_id = getMachineId();
            if (arguments.size() == 2)
            {
                machine_id = arguments[1].column->getUInt(0);
                machine_id &= (1ull << machine_id_bits_count) - 1;
            }

            Data data;
            SnowflakeId snowflake_id = data.reserveRange(machine_id, input_rows_count);

            for (UInt64 & to_row : vec_to)
            {
                to_row = fromSnowflakeId(snowflake_id);
                if (snowflake_id.machine_seq_num == max_machine_seq_num)
                {
                    /// handle overflow
                    snowflake_id.machine_seq_num = 0;
                    ++snowflake_id.timestamp;
                }
                else
                {
                    ++snowflake_id.machine_seq_num;
                }
            }
        }

        return col_res;
    }

};

REGISTER_FUNCTION(GenerateSnowflakeID)
{
    factory.registerFunction<FunctionGenerateSnowflakeID>();
}

}

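To make the 41/10/12 bit layout documented in the header comment concrete, here is a small self-contained example (the values are illustrative only) that composes an ID the same way `fromSnowflakeId` does and then slices it apart again:

#include <cassert>
#include <cstdint>

int main()
{
    const uint64_t timestamp = 1700000000000ULL; /// ms since epoch, fits into 41 bits
    const uint64_t machine_id = 42;              /// 10 bits
    const uint64_t machine_seq_num = 7;          /// 12 bits

    /// timestamp | machine_id | machine_seq_num, as in fromSnowflakeId()
    const uint64_t id = (timestamp << (10 + 12)) | (machine_id << 12) | machine_seq_num;

    /// Decompose again, as in toSnowflakeId()
    assert((id >> 22) == timestamp);
    assert(((id >> 12) & 0x3FF) == machine_id);   /// 10-bit machine id mask
    assert((id & 0xFFF) == machine_seq_num);      /// 12-bit sequence mask
    return 0;
}

Two IDs generated within the same millisecond differ only in the low 12 bits, which is what the compare-and-swap loop in `Data::reserveRange` relies on when it reserves `input_rows_count` consecutive IDs for one block.
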
Some files were not shown because too many files have changed in this diff.