fix: keeping optimize search (#2321)
parent d8519a3856
commit e0a1ef7a26

@@ -2,6 +2,8 @@
.DS_Store
.history
report.json
flamegraph.svg
flamegraph.html

# Generated by Cargo
# will have compiled files and executables
@@ -2053,7 +2053,6 @@ dependencies = [
"arrow-json",
"arrow-schema",
"aws-sdk-dynamodb",
"blake3",
"byteorder",
"bytes",
"chrono",
@@ -2061,6 +2060,7 @@ dependencies = [
"dotenv_config",
"dotenvy",
"getrandom",
"hashbrown 0.14.3",
"hex",
"indexmap 2.1.0",
"itertools 0.12.0",
@@ -2446,8 +2446,7 @@ checksum = "7e962a19be5cfc3f3bf6dd8f61eb50107f356ad6270fbb3ed41476571db78be5"
[[package]]
name = "datafusion"
version = "34.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "193fd1e7628278d0641c5122860f9a7fd6a1d77d055838d12f55d15bbe28d4d0"
source = "git+https://github.com/openobserve/arrow-datafusion.git?rev=45e5537ca43d2c2a6e55b9804073b191b337b9e5#45e5537ca43d2c2a6e55b9804073b191b337b9e5"
dependencies = [
"ahash 0.8.6",
"arrow",
@@ -2493,8 +2492,7 @@ dependencies = [
[[package]]
name = "datafusion-common"
version = "34.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "548bc49c4a489e3de474813831ea556dc9d368f9ed8d867b1493da42e8e9f613"
source = "git+https://github.com/openobserve/arrow-datafusion.git?rev=45e5537ca43d2c2a6e55b9804073b191b337b9e5#45e5537ca43d2c2a6e55b9804073b191b337b9e5"
dependencies = [
"ahash 0.8.6",
"arrow",
@@ -2513,8 +2511,7 @@ dependencies = [
[[package]]
name = "datafusion-execution"
version = "34.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ecc865657ffcf4da5ff08bdc6436a9a833bc0aa96c3254c8d18ab8a0ad4e437d"
source = "git+https://github.com/openobserve/arrow-datafusion.git?rev=45e5537ca43d2c2a6e55b9804073b191b337b9e5#45e5537ca43d2c2a6e55b9804073b191b337b9e5"
dependencies = [
"arrow",
"chrono",
@@ -2534,8 +2531,7 @@ dependencies = [
[[package]]
name = "datafusion-expr"
version = "34.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33c473f72d8d81a532e63f6e562ed66dd9209dfd8e433d9712abd42444ee161e"
source = "git+https://github.com/openobserve/arrow-datafusion.git?rev=45e5537ca43d2c2a6e55b9804073b191b337b9e5#45e5537ca43d2c2a6e55b9804073b191b337b9e5"
dependencies = [
"ahash 0.8.6",
"arrow",
@@ -2550,8 +2546,7 @@ dependencies = [
[[package]]
name = "datafusion-optimizer"
version = "34.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb6218318001d2f6783b7fffa17592318f65f26609d7aab605a3dd0c7c2e2618"
source = "git+https://github.com/openobserve/arrow-datafusion.git?rev=45e5537ca43d2c2a6e55b9804073b191b337b9e5#45e5537ca43d2c2a6e55b9804073b191b337b9e5"
dependencies = [
"arrow",
"async-trait",
@@ -2568,8 +2563,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-expr"
version = "34.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e1ca7e35ca22f9dc506c2375b92054b03ccf91afe25c0a90b395a1473a09735"
source = "git+https://github.com/openobserve/arrow-datafusion.git?rev=45e5537ca43d2c2a6e55b9804073b191b337b9e5#45e5537ca43d2c2a6e55b9804073b191b337b9e5"
dependencies = [
"ahash 0.8.6",
"arrow",
@@ -2602,8 +2596,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-plan"
version = "34.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddde97adefcca3a55257c646ffee2a95b6cac66f74d1146a6e3a6dbb37830631"
source = "git+https://github.com/openobserve/arrow-datafusion.git?rev=45e5537ca43d2c2a6e55b9804073b191b337b9e5#45e5537ca43d2c2a6e55b9804073b191b337b9e5"
dependencies = [
"ahash 0.8.6",
"arrow",
@@ -2633,8 +2626,7 @@ dependencies = [
[[package]]
name = "datafusion-sql"
version = "34.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a60d9d6460a64fddb8663db41da97e6b8b0bf79da42f997ebe81722731eaf0e5"
source = "git+https://github.com/openobserve/arrow-datafusion.git?rev=45e5537ca43d2c2a6e55b9804073b191b337b9e5#45e5537ca43d2c2a6e55b9804073b191b337b9e5"
dependencies = [
"arrow",
"arrow-schema",
@@ -3759,6 +3751,7 @@ dependencies = [
"bytes",
"chrono",
"config",
"futures",
"hashbrown 0.14.3",
"indexmap 2.1.0",
"itertools 0.12.0",
@@ -4817,6 +4810,7 @@ dependencies = [
"get_if_addrs",
"getrandom",
"glob",
"hashbrown 0.14.3",
"hashlink",
"hex",
"http-auth-basic",

23 Cargo.toml
@@ -76,8 +76,10 @@ clap = { version = "4.1", default-features = false, features = [
cloudevents-sdk = { version = "0.7.0", features = ["actix"] }
csv = "1.2.1"
dashmap = { version = "5.4", features = ["serde"] }
datafusion = { version = "34", features = ["simd"] }
datafusion-expr = "34"
datafusion = { git = "https://github.com/openobserve/arrow-datafusion.git", rev = "45e5537ca43d2c2a6e55b9804073b191b337b9e5", version = "34", features = [
"simd",
] }
datafusion-expr = { git = "https://github.com/openobserve/arrow-datafusion.git", rev = "45e5537ca43d2c2a6e55b9804073b191b337b9e5", version = "34" }
arrow = { version = "49", features = ["simd", "ipc_compression"] }
arrow-schema = { version = "49", features = ["serde"] }
parquet = { version = "49", features = ["arrow", "async"] }
@@ -91,6 +93,7 @@ flate2 = { version = "1.0", features = ["zlib"] }
futures = "0.3"
get_if_addrs = "0.5"
glob = "0.3"
hashbrown = { version = "0.14.3", features = ["serde"] }
hashlink = "0.8.4"
hex = "0.4"
http-auth-basic = "0.3"
@@ -207,23 +210,17 @@ actix-web-prometheus = { version = "0.1", features = ["process"] }
anyhow = "1.0"
argon2 = { version = "0.5", features = ["alloc", "password-hash"] }
async-trait = "0.1"
async-recursion = "1.0"
awc = "3.2"
aws-config = "0.56.1"
aws-sdk-dynamodb = "0.30.0"
base64 = "0.21"
blake3 = { version = "1.4", features = ["rayon"] }
bytes = "1.4"
byteorder = "1.4.3"
chrono = { version = "0.4", default-features = false, features = ["clock"] }
dashmap = { version = "5.4", features = ["serde"] }
datafusion = { version = "34", features = ["simd"] }
datafusion-expr = "34"
arrow = { version = "49", features = ["simd", "ipc_compression"] }
arrow-json = "49"
arrow-schema = { version = "49", features = ["serde"] }
parquet = { version = "49", features = ["arrow", "async"] }
object_store = { version = "0.8", features = ["aws", "azure", "gcp"] }
dotenv_config = "0.1.7"
dotenvy = "0.15"
faststr = "0.2"
@@ -249,19 +246,9 @@ rs-snowflake = "0.6"
segment = "0.2"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
simd-json = "0.13"
sha256 = "1.4.0"
sled = "0.34"
snafu = "0.7.5"
snap = "1"
sqlparser = { version = "0.40", features = ["serde", "visitor"] }
sqlx = { version = "0.7", features = [
"runtime-tokio-rustls",
"postgres",
"mysql",
"sqlite",
"chrono",
] }
sysinfo = "0.29"
tempfile = "3"
thiserror = "1.0"

@@ -1,7 +1,7 @@
version = "Two"
unstable_features = true

comment_width = 80
comment_width = 100
wrap_comments = true
format_code_in_doc_comments = true
format_macro_bodies = true

@@ -184,14 +184,19 @@ INSERT IGNORE INTO file_list (org, stream, date, file, deleted, min_ts, max_ts,
for file in files {
let (stream_key, date_key, file_name) =
parse_file_key_columns(file).map_err(|e| Error::Message(e.to_string()))?;
let ret: Option<i64> = sqlx::query_scalar(
let ret: Option<i64> = match sqlx::query_scalar(
r#"SELECT id FROM file_list WHERE stream = ? AND date = ? AND file = ?"#,
)
.bind(stream_key)
.bind(date_key)
.bind(file_name)
.fetch_one(&pool)
.await?;
.await
{
Ok(v) => v,
Err(sqlx::Error::RowNotFound) => continue,
Err(e) => return Err(e.into()),
};
match ret {
Some(v) => ids.push(v.to_string()),
None => {
@@ -202,8 +207,10 @@ INSERT IGNORE INTO file_list (org, stream, date, file, deleted, min_ts, max_ts,
}
}
// delete files by ids
let sql = format!("DELETE FROM file_list WHERE id IN({});", ids.join(","));
_ = pool.execute(sql.as_str()).await?;
if !ids.is_empty() {
let sql = format!("DELETE FROM file_list WHERE id IN({});", ids.join(","));
_ = pool.execute(sql.as_str()).await?;
}
}
Ok(())
}
@@ -259,13 +266,19 @@ INSERT IGNORE INTO file_list (org, stream, date, file, deleted, min_ts, max_ts,
for file in files {
let (stream_key, date_key, file_name) =
parse_file_key_columns(file).map_err(|e| Error::Message(e.to_string()))?;
let ret: Option<i64> = sqlx::query_scalar(
let ret: Option<i64> = match sqlx::query_scalar(
r#"SELECT id FROM file_list_deleted WHERE stream = ? AND date = ? AND file = ?"#,
)
.bind(stream_key)
.bind(date_key)
.bind(file_name)
.fetch_one(&pool).await?;
.fetch_one(&pool)
.await
{
Ok(v) => v,
Err(sqlx::Error::RowNotFound) => continue,
Err(e) => return Err(e.into()),
};
match ret {
Some(v) => ids.push(v.to_string()),
None => {
@@ -277,11 +290,13 @@ INSERT IGNORE INTO file_list (org, stream, date, file, deleted, min_ts, max_ts,
}
}
// delete files by ids
let sql = format!(
"DELETE FROM file_list_deleted WHERE id IN({});",
ids.join(",")
);
_ = pool.execute(sql.as_str()).await?;
if !ids.is_empty() {
let sql = format!(
"DELETE FROM file_list_deleted WHERE id IN({});",
ids.join(",")
);
_ = pool.execute(sql.as_str()).await?;
}
}
Ok(())
}
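The MySQL, PostgreSQL and SQLite variants of batch_remove/batch_remove_deleted all adopt the same two fixes shown above: a lookup that hits a missing row now skips that file instead of aborting the whole batch, and the DELETE ... IN (...) statement is only issued when at least one id was found (an empty IN () is invalid SQL). A condensed, hedged sketch of the pattern; `pool`, `files`, `ids` and the error type stand in for the real values in these files:

    for file in files {
        let (stream_key, date_key, file_name) =
            parse_file_key_columns(file).map_err(|e| Error::Message(e.to_string()))?;
        // A row that is already gone is not an error for a remove operation.
        let id: i64 = match sqlx::query_scalar(
            r#"SELECT id FROM file_list WHERE stream = ? AND date = ? AND file = ?"#,
        )
        .bind(stream_key)
        .bind(date_key)
        .bind(file_name)
        .fetch_one(&pool)
        .await
        {
            Ok(v) => v,
            Err(sqlx::Error::RowNotFound) => continue,
            Err(e) => return Err(e.into()),
        };
        ids.push(id.to_string());
    }
    // Only delete when something matched; otherwise the generated SQL would be invalid.
    if !ids.is_empty() {
        let sql = format!("DELETE FROM file_list WHERE id IN({});", ids.join(","));
        _ = pool.execute(sql.as_str()).await?;
    }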

@@ -185,14 +185,19 @@ INSERT INTO file_list (org, stream, date, file, deleted, min_ts, max_ts, records
for file in files {
let (stream_key, date_key, file_name) =
parse_file_key_columns(file).map_err(|e| Error::Message(e.to_string()))?;
let ret: Option<i64> = sqlx::query_scalar(
let ret: Option<i64> = match sqlx::query_scalar(
r#"SELECT id FROM file_list WHERE stream = $1 AND date = $2 AND file = $3;"#,
)
.bind(stream_key)
.bind(date_key)
.bind(file_name)
.fetch_one(&pool)
.await?;
.await
{
Ok(v) => v,
Err(sqlx::Error::RowNotFound) => continue,
Err(e) => return Err(e.into()),
};
match ret {
Some(v) => ids.push(v.to_string()),
None => {
@@ -204,8 +209,10 @@ INSERT INTO file_list (org, stream, date, file, deleted, min_ts, max_ts, records
}
}
// delete files by ids
let sql = format!("DELETE FROM file_list WHERE id IN({});", ids.join(","));
_ = pool.execute(sql.as_str()).await?;
if !ids.is_empty() {
let sql = format!("DELETE FROM file_list WHERE id IN({});", ids.join(","));
_ = pool.execute(sql.as_str()).await?;
}
}
Ok(())
}
@@ -264,14 +271,19 @@ INSERT INTO file_list (org, stream, date, file, deleted, min_ts, max_ts, records
for file in files {
let (stream_key, date_key, file_name) =
parse_file_key_columns(file).map_err(|e| Error::Message(e.to_string()))?;
let ret: Option<i64> = sqlx::query_scalar(
let ret: Option<i64> = match sqlx::query_scalar(
r#"SELECT id FROM file_list_deleted WHERE stream = $1 AND date = $2 AND file = $3;"#,
)
.bind(stream_key)
.bind(date_key)
.bind(file_name)
.fetch_one(&pool)
.await?;
.await
{
Ok(v) => v,
Err(sqlx::Error::RowNotFound) => continue,
Err(e) => return Err(e.into()),
};
match ret {
Some(v) => ids.push(v.to_string()),
None => {
@@ -283,11 +295,13 @@ INSERT INTO file_list (org, stream, date, file, deleted, min_ts, max_ts, records
}
}
// delete files by ids
let sql = format!(
"DELETE FROM file_list_deleted WHERE id IN({});",
ids.join(",")
);
_ = pool.execute(sql.as_str()).await?;
if !ids.is_empty() {
let sql = format!(
"DELETE FROM file_list_deleted WHERE id IN({});",
ids.join(",")
);
_ = pool.execute(sql.as_str()).await?;
}
}
Ok(())
}

@@ -469,6 +469,9 @@ INSERT INTO file_list (org, stream, date, file, deleted, min_ts, max_ts, records
}

pub async fn batch_add(client: &Pool<Sqlite>, files: &[FileKey]) -> Result<()> {
if files.is_empty() {
return Ok(());
}
let chunks = files.chunks(100);
for files in chunks {
let mut tx = client.begin().await?;
@@ -529,6 +532,9 @@ pub async fn batch_add(client: &Pool<Sqlite>, files: &[FileKey]) -> Result<()> {
}

pub async fn batch_remove(client: &Pool<Sqlite>, files: &[String]) -> Result<()> {
if files.is_empty() {
return Ok(());
}
let chunks = files.chunks(100);
for files in chunks {
// get ids of the files
@@ -537,14 +543,19 @@ pub async fn batch_remove(client: &Pool<Sqlite>, files: &[String]) -> Result<()>
for file in files {
let (stream_key, date_key, file_name) =
parse_file_key_columns(file).map_err(|e| Error::Message(e.to_string()))?;
let ret: Option<i64> = sqlx::query_scalar(
let ret: Option<i64> = match sqlx::query_scalar(
r#"SELECT id FROM file_list WHERE stream = $1 AND date = $2 AND file = $3;"#,
)
.bind(stream_key)
.bind(date_key)
.bind(file_name)
.fetch_one(&pool)
.await?;
.await
{
Ok(v) => v,
Err(sqlx::Error::RowNotFound) => continue,
Err(e) => return Err(e.into()),
};
match ret {
Some(v) => ids.push(v.to_string()),
None => {
@@ -555,8 +566,10 @@ pub async fn batch_remove(client: &Pool<Sqlite>, files: &[String]) -> Result<()>
}
}
// delete files by ids
let sql = format!("DELETE FROM file_list WHERE id IN({});", ids.join(","));
_ = pool.execute(sql.as_str()).await?;
if !ids.is_empty() {
let sql = format!("DELETE FROM file_list WHERE id IN({});", ids.join(","));
_ = pool.execute(sql.as_str()).await?;
}
}
Ok(())
}
@@ -597,6 +610,9 @@ pub async fn batch_add_deleted(
}

pub async fn batch_remove_deleted(client: &Pool<Sqlite>, files: &[String]) -> Result<()> {
if files.is_empty() {
return Ok(());
}
let chunks = files.chunks(100);
for files in chunks {
// get ids of the files
@@ -605,14 +621,19 @@ pub async fn batch_remove_deleted(client: &Pool<Sqlite>, files: &[String]) -> Re
for file in files {
let (stream_key, date_key, file_name) =
parse_file_key_columns(file).map_err(|e| Error::Message(e.to_string()))?;
let ret: Option<i64> = sqlx::query_scalar(
let ret: Option<i64> = match sqlx::query_scalar(
r#"SELECT id FROM file_list_deleted WHERE stream = $1 AND date = $2 AND file = $3;"#,
)
.bind(stream_key)
.bind(date_key)
.bind(file_name)
.fetch_one(&pool)
.await?;
.await
{
Ok(v) => v,
Err(sqlx::Error::RowNotFound) => continue,
Err(e) => return Err(e.into()),
};
match ret {
Some(v) => ids.push(v.to_string()),
None => {
@@ -624,11 +645,13 @@ pub async fn batch_remove_deleted(client: &Pool<Sqlite>, files: &[String]) -> Re
}
}
// delete files by ids
let sql = format!(
"DELETE FROM file_list_deleted WHERE id IN({});",
ids.join(",")
);
_ = pool.execute(sql.as_str()).await?;
if !ids.is_empty() {
let sql = format!(
"DELETE FROM file_list_deleted WHERE id IN({});",
ids.join(",")
);
_ = pool.execute(sql.as_str()).await?;
}
}
Ok(())
}

@@ -343,9 +343,7 @@ impl StreamParams {
pub struct SchemaEvolution {
pub schema_compatible: bool,
pub types_delta: Option<Vec<Field>>,
pub schema_fields: Vec<Field>,
pub is_schema_changed: bool,
pub record_schema: Schema,
}

pub struct SchemaRecords {

@@ -16,7 +16,6 @@
use serde_json::value::{Map, Value};

const KEY_SEPARATOR: &str = "_";
const FORMAT_KEY_ENABLED: bool = true;

/// Flattens the provided JSON object (`current`).
///
@@ -27,7 +26,28 @@ const FORMAT_KEY_ENABLED: bool = true;
/// # Errors
/// Will return `Err` if `to_flatten` it's not an object, or if flattening the
/// object would result in two or more keys colliding.
pub fn flatten(to_flatten: &Value) -> Result<Value, anyhow::Error> {
pub fn flatten(to_flatten: Value) -> Result<Value, anyhow::Error> {
// quick check to see if we have an object`
let to_flatten = match to_flatten {
Value::Object(v) => {
if v.is_empty() || !v.iter().any(|(_k, v)| v.is_object() || v.is_array()) {
if v.iter().all(|(k, _v)| check_key(k)) {
return Ok(Value::Object(v));
}
let mut formatted_map = Map::<String, Value>::with_capacity(v.len());
for (mut k, v) in v.into_iter() {
format_key(&mut k);
formatted_map.insert(k, v);
}
return Ok(Value::Object(formatted_map));
}
Value::Object(v)
}
_ => {
return Err(anyhow::anyhow!("flatten value must be an object"));
}
};

let mut flat = Map::<String, Value>::new();
flatten_value(to_flatten, "".to_owned(), 0, &mut flat).map(|_x| Value::Object(flat))
}
@@ -36,38 +56,21 @@ pub fn flatten(to_flatten: &Value) -> Result<Value, anyhow::Error> {
/// its 0-based depth is `depth`. The result is stored in the JSON object
/// `flattened`.
fn flatten_value(
current: &Value,
current: Value,
parent_key: String,
depth: u32,
flattened: &mut Map<String, Value>,
) -> Result<(), anyhow::Error> {
if depth == 0 {
match current {
Value::Object(map) => {
if map.is_empty() {
return Ok(()); // If the top level input object is empty there is nothing to do
}
}
_ => return Err(anyhow::anyhow!("flatten value must be an object")),
match current {
Value::Object(map) => {
flatten_object(map, &parent_key, depth, flattened)?;
}
}

if let Some(current) = current.as_object() {
flatten_object(current, &parent_key, depth, flattened)?;
} else if let Some(current) = current.as_array() {
flatten_array(current, &parent_key, depth, flattened)?;
} else {
if flattened.contains_key(&parent_key) {
// log::error!(
// "flatten will be overwritten current: {:?}, new key: {}, val:
// {}, ", flattened,
// parent_key,
// current.clone(),
// );
// return Err(anyhow::anyhow!( "flatten will be overwritten a key
// {}", parent_key));
Value::Array(arr) => {
flatten_array(arr, &parent_key, depth, flattened)?;
}
_ => {
flattened.insert(parent_key, current);
}
flattened.insert(parent_key, current.clone());
}
Ok(())
}
@@ -76,17 +79,13 @@ fn flatten_value(
/// 0-based depth is `depth`. The result is stored in the JSON object
/// `flattened`.
fn flatten_object(
current: &Map<String, Value>,
current: Map<String, Value>,
parent_key: &str,
depth: u32,
flattened: &mut Map<String, Value>,
) -> Result<(), anyhow::Error> {
for (k, v) in current.iter() {
let k = if FORMAT_KEY_ENABLED {
format_key(k)
} else {
k.to_string()
};
for (mut k, v) in current.into_iter() {
format_key(&mut k);
let parent_key = if depth > 0 {
format!("{}{}{}", parent_key, KEY_SEPARATOR, k)
} else {
@@ -101,7 +100,7 @@ fn flatten_object(
/// 0-based depth is `depth`. The result is stored in the JSON object
/// `flattened`.
fn flatten_array(
current: &[Value],
current: Vec<Value>,
parent_key: &str,
depth: u32,
flattened: &mut Map<String, Value>,
@@ -114,30 +113,32 @@ fn flatten_array(
// flatten_value(obj, parent_key, depth + 1, flattened)?;
// }
let v = Value::String(Value::Array(current.to_vec()).to_string());
flatten_value(&v, parent_key.to_string(), depth, flattened)?;
flatten_value(v, parent_key.to_string(), depth, flattened)?;
Ok(())
}

/// We need every character in the key to be lowercase alphanumeric or
/// underscore
pub fn format_key(key: &str) -> String {
if key
.chars()
.all(|c| c.is_lowercase() || c.is_numeric() || c == '_')
{
return key.to_string();
pub fn format_key(key: &mut String) {
if check_key(key) {
return;
}
let mut key_chars = key.chars().collect::<Vec<_>>();
for c in key_chars.iter_mut() {
if c.is_lowercase() || c.is_numeric() {
continue;
} else if c.is_uppercase() {
*c = c.to_lowercase().next().unwrap();
} else {
*c = '_';
}
}
*key = key_chars.into_iter().collect::<String>();
}

fn check_key(key: &str) -> bool {
key.chars()
.map(|c| {
if c.is_lowercase() || c.is_numeric() {
c
} else if c.is_uppercase() {
c.to_lowercase().next().unwrap()
} else {
'_'
}
})
.collect::<String>()
.all(|c| c.is_lowercase() || c.is_numeric() || c == '_')
}

#[cfg(test)]
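The old format_key(&str) -> String allocated a new String for every key even when nothing needed to change. The new pair splits the work: check_key is a cheap scan that answers "is this key already lowercase/numeric/underscore?", and format_key(&mut String) rewrites the key in place only when that check fails. A small, self-contained illustration of the rule (this mirrors the logic above but is not the crate's exact code):

    fn check_key(key: &str) -> bool {
        key.chars()
            .all(|c| c.is_lowercase() || c.is_numeric() || c == '_')
    }

    fn format_key(key: &mut String) {
        if check_key(key) {
            return; // fast path: already normalized, no allocation
        }
        *key = key
            .chars()
            .map(|c| {
                if c.is_lowercase() || c.is_numeric() {
                    c
                } else if c.is_uppercase() {
                    c.to_lowercase().next().unwrap()
                } else {
                    '_' // any other character becomes an underscore
                }
            })
            .collect();
    }

    fn main() {
        let mut key = String::from("Content-Type");
        format_key(&mut key);
        assert_eq!(key, "content_type");
    }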
@@ -146,10 +147,42 @@ mod tests {

use super::*;

#[test]
fn test_check_key_lowercase() {
assert_eq!(check_key("hello"), true);
}

#[test]
fn test_check_key_numeric() {
assert_eq!(check_key("123"), true);
}

#[test]
fn test_check_key_underscore() {
assert_eq!(check_key("my_key"), true);
}

#[test]
fn test_check_key_mixed_case() {
assert_eq!(check_key("Hello_World"), false);
}

#[test]
fn test_check_key_special_characters() {
assert_eq!(check_key("key!"), false);
}

#[test]
fn object_with_plain_values() {
let obj = json!({"int": 1, "float": 2.0, "str": "a", "bool": true, "null": null});
assert_eq!(obj, flatten(&obj).unwrap());
assert_eq!(obj, flatten(obj.clone()).unwrap());
}

#[test]
fn object_with_plain_values_with_format_key() {
let obj = json!({"int": 1, "float": 2.0, "str": "a", "bool": true, "null": null});
let obj2 = json!({"int": 1, "Float": 2.0, "str": "a", "bool": true, "null": null});
assert_eq!(obj, flatten(obj2).unwrap());
}

/// Ensures that when using `ArrayFormatting::Plain` both arrays and objects
@@ -158,7 +191,7 @@ mod tests {
fn array_formatting_plain() {
let obj = json!({"s": {"a": [1, 2.0, "b", null, true]}});
assert_eq!(
flatten(&obj).unwrap(),
flatten(obj).unwrap(),
json!({
format!("s{k}a", k = KEY_SEPARATOR): "[1,2.0,\"b\",null,true]",
})
@@ -169,7 +202,7 @@ mod tests {
fn nested_single_key_value() {
let obj = json!({"key": "value", "nested_key": {"key": "value"}});
assert_eq!(
flatten(&obj).unwrap(),
flatten(obj).unwrap(),
json!({"key": "value", "nested_key_key": "value"}),
);
}
@@ -178,7 +211,7 @@ mod tests {
fn nested_multiple_key_value() {
let obj = json!({"key": "value", "nested_key": {"key1": "value1", "key2": "value2"}});
assert_eq!(
flatten(&obj).unwrap(),
flatten(obj).unwrap(),
json!({"key": "value", "nested_key_key1": "value1", "nested_key_key2": "value2"}),
);
}
@@ -198,7 +231,7 @@ mod tests {
]
});
assert_eq!(
flatten(&obj).unwrap(),
flatten(obj).unwrap(),
json!({"simple_key": "simple_value", "key": "[\"value1\",{\"key\":\"value2\"},{\"nested_array\":[\"nested1\",\"nested2\",[\"nested3\",\"nested4\"]]}]"}),
);
}
@@ -218,20 +251,20 @@ mod tests {
#[test]
fn empty_array() {
let obj = json!({"key": []});
assert_eq!(flatten(&obj).unwrap(), json!({}));
assert_eq!(flatten(obj).unwrap(), json!({}));
}

/// Ensure that empty objects are not present in the result
#[test]
fn empty_object() {
let obj = json!({"key": {}});
assert_eq!(flatten(&obj).unwrap(), json!({}));
assert_eq!(flatten(obj).unwrap(), json!({}));
}

#[test]
fn empty_top_object() {
let obj = json!({});
assert_eq!(flatten(&obj).unwrap(), json!({}));
assert_eq!(flatten(obj).unwrap(), json!({}));
}

/// Ensure that if all the end values of the JSON object are either `[]` or
@@ -240,7 +273,7 @@ mod tests {
fn empty_complex_object() {
let obj = json!({"key": {"key2": {}, "key3": [[], {}, {"k": {}, "q": []}]}});
assert_eq!(
flatten(&obj).unwrap(),
flatten(obj).unwrap(),
json!({"key_key3": "[[],{},{\"k\":{},\"q\":[]}]"})
);
}
@@ -248,25 +281,25 @@ mod tests {
#[test]
fn nested_object_with_empty_array_and_string() {
let obj = json!({"key": {"key2": [], "key3": "a"}});
assert_eq!(flatten(&obj).unwrap(), json!({"key_key3": "a"}));
assert_eq!(flatten(obj).unwrap(), json!({"key_key3": "a"}));
}

#[test]
fn nested_object_with_empty_object_and_string() {
let obj = json!({"key": {"key2": {}, "key3": "a"}});
assert_eq!(flatten(&obj).unwrap(), json!({"key_key3": "a"}));
assert_eq!(flatten(obj).unwrap(), json!({"key_key3": "a"}));
}

#[test]
fn empty_string_as_key() {
let obj = json!({"key": {"": "a"}});
assert_eq!(flatten(&obj).unwrap(), json!({"key_": "a"}));
assert_eq!(flatten(obj).unwrap(), json!({"key_": "a"}));
}

#[test]
fn empty_string_as_key_multiple_times() {
let obj = json!({"key": {"": {"": {"": "a"}}}});
assert_eq!(flatten(&obj).unwrap(), json!({"key___": "a"}));
assert_eq!(flatten(obj).unwrap(), json!({"key___": "a"}));
}

/// Flattening only makes sense for objects. Passing something else must
@@ -279,7 +312,7 @@ mod tests {
let null = json!(null);
let array = json!([1, 2, 3]);

for j in [integer, string, boolean, null, array].iter() {
for j in [integer, string, boolean, null, array].into_iter() {
let res = flatten(j);
match res {
Err(_) => {} // Good
@@ -291,7 +324,7 @@ mod tests {
#[test]
fn complex_array() {
let obj = json!({"a": [1, [2, [3, 4], 5], 6]});
assert_eq!(flatten(&obj).unwrap(), json!({"a": "[1,[2,[3,4],5],6]"}));
assert_eq!(flatten(obj).unwrap(), json!({"a": "[1,[2,[3,4],5],6]"}));
}

#[test]
@@ -302,16 +335,16 @@ mod tests {
json!({"key": "value", "nested_key_key": "value", "nested_key_foo": "bar"}),
),
(
json!({"key+bar": "value", "@nested_key": {"key": "value", "Foo": "Bar"}}),
json!({"key_bar": "value", "_nested_key_key": "value", "_nested_key_foo": "Bar"}),
json!({"key+bar": "value", "@nested_key": {"#key": "value", "&Foo": "Bar"}}),
json!({"key_bar": "value", "_nested_key__key": "value", "_nested_key__foo": "Bar"}),
),
(
json!({"a": {"A.1": [1, [3, 4], 5], "A_2": 6}}),
json!({"a_a_1": "[1,[3,4],5]", "a_a_2": 6}),
),
];
for (input, expected) in datas.iter() {
assert_eq!(flatten(input).unwrap(), *expected);
for (input, expected) in datas.into_iter() {
assert_eq!(flatten(input).unwrap(), expected);
}
}

@@ -350,7 +383,7 @@ mod tests {
"phonenumbers":"[{\"number\":\"555-555-1234\",\"type\":\"home\"},{\"number\":\"555-555-5678\",\"type\":\"work\"}]"
});

let output = flatten(&input).unwrap();
let output = flatten(input).unwrap();
assert_eq!(output, expected_output);
}
}

@@ -11,13 +11,13 @@ anyhow.workspace = true
arrow-json.workspace = true
arrow-schema.workspace = true
aws-sdk-dynamodb.workspace = true
blake3.workspace = true
bytes.workspace = true
byteorder.workspace = true
chrono.workspace = true
dashmap.workspace = true
dotenv_config.workspace = true
dotenvy.workspace = true
hashbrown.workspace = true
hex.workspace = true
indexmap.workspace = true
itertools.workspace = true

@@ -278,6 +278,8 @@ pub struct Common {
pub bloom_filter_enabled: bool,
#[env_config(name = "ZO_BLOOM_FILTER_DEFAULT_FIELDS", default = "")]
pub bloom_filter_default_fields: String,
#[env_config(name = "ZO_BLOOM_FILTER_FORCE_DISABLED", default = false)]
pub bloom_filter_force_disabled: bool,
#[env_config(name = "ZO_TRACING_ENABLED", default = false)]
pub tracing_enabled: bool,
#[env_config(name = "OTEL_OTLP_HTTP_ENDPOINT", default = "")]
@@ -361,10 +363,17 @@ pub struct Limit {
pub req_json_limit: usize,
#[env_config(name = "ZO_PAYLOAD_LIMIT", default = 209715200)]
pub req_payload_limit: usize,
#[env_config(name = "ZO_MAX_FILE_SIZE_ON_DISK", default = 32)] // MB
pub max_file_size_on_disk: u64,
#[env_config(name = "ZO_MAX_FILE_RETENTION_TIME", default = 600)] // seconds
pub max_file_retention_time: u64,
#[env_config(name = "ZO_MAX_FILE_SIZE_ON_DISK", default = 64)] // MB, per log file size on disk
pub max_file_size_on_disk: usize,
#[env_config(name = "ZO_MEM_FILE_MAX_SIZE", default = 256)] // MB, per log file size in memory
pub mem_file_max_size: usize,
#[env_config(name = "ZO_MEM_TABLE_MAX_SIZE", default = 0)]
// MB, total file size in memory, default is 50% of system memory
pub mem_table_max_size: usize,
#[env_config(name = "ZO_MEM_PERSIST_INTERVAL", default = 5)] // seconds
pub mem_persist_interval: u64,
#[env_config(name = "ZO_FILE_PUSH_INTERVAL", default = 10)] // seconds
pub file_push_interval: u64,
#[env_config(name = "ZO_FILE_MOVE_THREAD_NUM", default = 0)]
@@ -607,9 +616,9 @@ pub fn init() -> Config {
if cfg.limit.query_thread_num == 0 {
cfg.limit.query_thread_num = cpu_num * 4;
}
// HACK for move_file_thread_num equal to CPU core
// HACK for move_file_thread_num equal to CPU core * 2
if cfg.limit.file_move_thread_num == 0 {
cfg.limit.file_move_thread_num = cpu_num;
cfg.limit.file_move_thread_num = cpu_num * 2;
}

// check common config
@@ -623,7 +632,7 @@ pub fn init() -> Config {
}

// check memeory cache
if let Err(e) = check_memory_cache_config(&mut cfg) {
if let Err(e) = check_memory_config(&mut cfg) {
panic!("memory cache config error: {e}");
}

@@ -840,7 +849,7 @@ fn check_sled_config(cfg: &mut Config) -> Result<(), anyhow::Error> {
Ok(())
}

fn check_memory_cache_config(cfg: &mut Config) -> Result<(), anyhow::Error> {
fn check_memory_config(cfg: &mut Config) -> Result<(), anyhow::Error> {
let mem_total = cgroup::get_memory_limit();
cfg.limit.mem_total = mem_total;
if cfg.memory_cache.max_size == 0 {
@@ -867,6 +876,14 @@ fn check_memory_cache_config(cfg: &mut Config) -> Result<(), anyhow::Error> {
} else {
cfg.memory_cache.datafusion_max_size *= 1024 * 1024;
}

// for memtable limit check
cfg.limit.mem_file_max_size *= 1024 * 1024;
if cfg.limit.mem_table_max_size == 0 {
cfg.limit.mem_table_max_size = mem_total / 2; // 50%
} else {
cfg.limit.mem_table_max_size *= 1024 * 1024;
}
Ok(())
}
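The new Limit fields are declared in MB and converted to bytes in check_memory_config; ZO_MEM_TABLE_MAX_SIZE=0 (the default) means "use half of the detected memory". A hedged sketch of how the memtable cap resolves, with `mem_total_bytes` standing in for cgroup::get_memory_limit():

    fn resolve_mem_table_max_size(configured_mb: usize, mem_total_bytes: usize) -> usize {
        if configured_mb == 0 {
            mem_total_bytes / 2 // default: 50% of system memory
        } else {
            configured_mb * 1024 * 1024 // env value is given in MB
        }
    }

    fn main() {
        let sixteen_gib = 16 * 1024 * 1024 * 1024;
        // ZO_MEM_TABLE_MAX_SIZE=1024 -> 1 GiB cap
        assert_eq!(resolve_mem_table_max_size(1024, sixteen_gib), 1024 * 1024 * 1024);
        // unset (0) -> half of a 16 GiB host
        assert_eq!(resolve_mem_table_max_size(0, sixteen_gib), sixteen_gib / 2);
    }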
@@ -1000,7 +1017,7 @@ mod tests {

cfg.memory_cache.max_size = 1024;
cfg.memory_cache.release_size = 1024;
check_memory_cache_config(&mut cfg).unwrap();
check_memory_config(&mut cfg).unwrap();
assert_eq!(cfg.memory_cache.max_size, 1024 * 1024 * 1024);
assert_eq!(cfg.memory_cache.release_size, 1024 * 1024 * 1024);

@@ -1,89 +0,0 @@
// Copyright 2023 Zinc Labs Inc.
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

use arrow_schema::{Field, Schema};
use itertools::Itertools;

use super::schema_ext::SchemaExt;

#[derive(Debug, Default, Clone, PartialEq, Eq, Hash)]
pub struct Signature(pub [u8; 32]);

impl From<Signature> for String {
fn from(sig: Signature) -> Self {
hex::encode(sig.0)
}
}

pub fn get_fields_key(fields: &[Field]) -> String {
let mut hasher = blake3::Hasher::new();
fields.iter().sorted_by_key(|v| v.name()).for_each(|field| {
hasher.update(field.name().as_bytes());
hasher.update(field.data_type().to_string().as_bytes());
});
Signature(hasher.finalize().into()).into()
}

pub fn get_schema_key(schema: &Schema) -> String {
get_fields_key(&schema.to_cloned_fields())
}

pub fn get_schema_key_xxh3(schema: &Schema) -> String {
get_fields_key_xxh3(&schema.to_cloned_fields())
}

pub fn get_fields_key_xxh3(fields: &[Field]) -> String {
let mut hasher = xxhash_rust::xxh3::Xxh3::new();
for field in fields.iter().sorted_by_key(|v| v.name()) {
hasher.update(field.name().as_bytes());
hasher.update(field.data_type().to_string().as_bytes());
}
let hash = hasher.digest();
format!("{hash:x}")
}

#[cfg(test)]
mod tests {

use arrow_schema::DataType;

use super::*;

#[tokio::test]
async fn test_ingest() {
let mut schmea_vec = vec![
Field::new("log", DataType::Utf8, false),
Field::new("pod_id", DataType::Int64, false),
];

for i in 0..30 {
schmea_vec.push(Field::new(format!("field_{}", i), DataType::Utf8, false));
}

let schema = Schema::new(schmea_vec);

let start1 = std::time::Instant::now();
for _ in 0..100000 {
get_schema_key(&schema);
}
log::info!("Time taken for blake3: {:?}", start1.elapsed());

let start2 = std::time::Instant::now();
for _ in 0..100000 {
get_schema_key_xxh3(&schema);
}
log::info!("Time taken for xxh3: {:?}", start2.elapsed());
}
}

@@ -15,7 +15,6 @@

pub(crate) mod cgroup;
pub(crate) mod file;
pub mod hasher;
pub mod parquet;
pub mod rand;
pub mod schema;

@@ -43,7 +43,7 @@ pub fn new_parquet_writer<'a>(
.set_dictionary_enabled(true)
.set_encoding(Encoding::PLAIN)
.set_sorting_columns(Some(
[SortingColumn::new(sort_column_id as i32, false, false)].to_vec(),
[SortingColumn::new(sort_column_id as i32, true, false)].to_vec(),
))
.set_column_dictionary_enabled(
ColumnPath::from(vec![CONFIG.common.column_timestamp.to_string()]),
@@ -66,6 +66,10 @@ pub fn new_parquet_writer<'a>(
writer_props = writer_props
.set_column_dictionary_enabled(ColumnPath::from(vec![field.to_string()]), false);
}
for field in BLOOM_FILTER_DEFAULT_FIELDS.iter() {
writer_props = writer_props
.set_column_dictionary_enabled(ColumnPath::from(vec![field.to_string()]), false);
}
// Bloom filter stored by row_group, so if the num_rows can limit to
// PARQUET_MAX_ROW_GROUP_SIZE,
let num_rows = metadata.records as u64;

@@ -21,8 +21,9 @@ use std::{

use arrow_json::reader;
use arrow_schema::{ArrowError, DataType, Field, Schema};
use serde_json::{Map, Value};

use crate::meta::stream::StreamType;
use crate::{meta::stream::StreamType, FxIndexMap};

pub fn infer_json_schema<R: BufRead>(
reader: R,
@@ -42,21 +43,118 @@ pub fn infer_json_schema_from_seekable<R: BufRead + Seek>(
Ok(fix_schema(schema, stream_type))
}

pub fn infer_json_schema_from_iterator<I, V>(
pub fn infer_json_schema_from_values<I, V>(
value_iter: I,
stream_type: impl Into<StreamType>,
) -> Result<Schema, ArrowError>
where
I: Iterator<Item = Result<V, ArrowError>>,
V: Borrow<serde_json::Value>,
I: Iterator<Item = V>,
V: Borrow<Value>,
{
let schema = reader::infer_json_schema_from_iterator(value_iter)?;
Ok(fix_schema(schema, stream_type.into()))
let mut fields = None;
for value in value_iter {
match value.borrow() {
Value::Object(v) => {
if fields.is_none() {
fields = Some(FxIndexMap::with_capacity_and_hasher(
v.len(),
Default::default(),
));
}
infer_json_schema_from_object(fields.as_mut().unwrap(), v)?;
}
_ => {
return Err(ArrowError::SchemaError(
"Cannot infer schema from non-object value".to_string(),
));
}
}
}
let fields = fields.unwrap_or_default();
let fields = fields
.into_iter()
.map(|(_, field)| field)
.collect::<Vec<_>>();
Ok(fix_schema(Schema::new(fields), stream_type.into()))
}

/// Fix the schema to ensure that the start_time and end_time fields are always
/// present with uint64 and that null fields are removed and sort the fields by
/// name.
fn infer_json_schema_from_object(
fields: &mut FxIndexMap<String, Field>,
value: &Map<String, Value>,
) -> Result<(), ArrowError> {
for (key, value) in value.iter() {
match value {
Value::String(_) => {
convet_data_type(fields, key, DataType::Utf8)?;
}
Value::Number(v) => {
if v.is_i64() {
convet_data_type(fields, key, DataType::Int64)?;
} else if v.is_u64() {
convet_data_type(fields, key, DataType::UInt64)?;
} else if v.is_f64() {
convet_data_type(fields, key, DataType::Float64)?;
} else {
return Err(ArrowError::SchemaError(
"Cannot infer schema from non-basic-number type value".to_string(),
));
}
}
Value::Bool(_) => {
convet_data_type(fields, key, DataType::Boolean)?;
}
Value::Null => {}
_ => {
return Err(ArrowError::SchemaError(
"Cannot infer schema from non-basic type value".to_string(),
));
}
}
}
Ok(())
}

fn convet_data_type(
fields: &mut FxIndexMap<String, Field>,
key: &str,
data_type: DataType,
) -> Result<(), ArrowError> {
let Some(f) = fields.get(key) else {
fields.insert(key.to_string(), Field::new(key, data_type, true));
return Ok(());
};
let f_type = f.data_type();
if f_type == &data_type {
return Ok(());
}
match (f_type, &data_type) {
(DataType::Utf8, _) => {}
(DataType::Int64, DataType::UInt64)
| (DataType::Int64, DataType::Float64)
| (DataType::Int64, DataType::Utf8) => {
fields.insert(key.to_string(), Field::new(key, data_type, true));
}
(DataType::UInt64, DataType::Float64) | (DataType::UInt64, DataType::Utf8) => {
fields.insert(key.to_string(), Field::new(key, data_type, true));
}
(DataType::Float64, DataType::Utf8) => {
fields.insert(key.to_string(), Field::new(key, data_type, true));
}
(DataType::Boolean, _) => {
fields.insert(key.to_string(), Field::new(key, data_type, true));
}
_ => {
return Err(ArrowError::SchemaError(format!(
"Cannot infer schema from conflicting types: {:?} and {:?}",
f_type, data_type
)));
}
}
Ok(())
}

/// Fix the schema to ensure that the start_time and end_time fields are always present with uint64
/// and that null fields are removed and sort the fields by name.
fn fix_schema(schema: Schema, stream_type: StreamType) -> Schema {
let mut fields = if stream_type == StreamType::Traces {
itertools::chain(
@@ -71,8 +169,8 @@ fn fix_schema(schema: Schema, stream_type: StreamType) -> Schema {
}
}),
vec![
Arc::new(Field::new("start_time", DataType::UInt64, false)),
Arc::new(Field::new("end_time", DataType::UInt64, false)),
Arc::new(Field::new("start_time", DataType::UInt64, true)),
Arc::new(Field::new("end_time", DataType::UInt64, true)),
],
)
.collect::<Vec<_>>()
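infer_json_schema_from_values replaces the arrow-json iterator helper with a hand-rolled pass over the rows: every field starts at the narrowest type that fits and is widened by convet_data_type (Int64 -> UInt64 -> Float64 -> Utf8; an existing Utf8 field absorbs anything, nulls are ignored, nested values are rejected). A hedged usage sketch — the import path follows the diff above, and the stream-type conversion shown here is an assumption, not taken from this PR:

    use config::utils::schema::infer_json_schema_from_values;
    use serde_json::json;

    fn example() -> Result<(), arrow_schema::ArrowError> {
        let rows = vec![
            json!({"code": 1, "msg": "ok"}),
            json!({"code": 1.5, "msg": "slow"}), // widens `code` from Int64 to Float64
        ];
        // the second argument is anything convertible into StreamType, e.g. "logs"
        let schema = infer_json_schema_from_values(rows.iter(), "logs")?;
        println!("{:?}", schema); // `code` ends up Float64, `msg` Utf8
        Ok(())
    }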

@@ -13,9 +13,9 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

use arrow_schema::{Field, Schema};
use std::hash::{Hash, Hasher};

use super::hasher::get_fields_key_xxh3;
use arrow_schema::{Field, Schema};

/// SchemaExt helper...
pub trait SchemaExt {
@@ -29,6 +29,8 @@ impl SchemaExt for Schema {
}

fn hash_key(&self) -> String {
get_fields_key_xxh3(&self.to_cloned_fields())
let mut hasher = xxhash_rust::xxh3::Xxh3::new();
self.hash(&mut hasher);
format!("{:x}", hasher.finish())
}
}

@@ -102,7 +102,10 @@ pub async fn multi(
)
.await
{
Ok(v) => MetaHttpResponse::json(v),
Ok(v) => match v.code {
503 => HttpResponse::ServiceUnavailable().json(v),
_ => MetaHttpResponse::json(v),
},
Err(e) => {
log::error!("Error processing request: {:?}", e);
HttpResponse::BadRequest().json(MetaHttpResponse::error(
@@ -148,7 +151,10 @@ pub async fn json(
)
.await
{
Ok(v) => MetaHttpResponse::json(v),
Ok(v) => match v.code {
503 => HttpResponse::ServiceUnavailable().json(v),
_ => MetaHttpResponse::json(v),
},
Err(e) => {
log::error!("Error processing request: {:?}", e);
HttpResponse::BadRequest().json(MetaHttpResponse::error(
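Both ingest handlers now translate an internal response code of 503 (returned when the memtable is full; see check_memtable_size further down) into a real HTTP 503 Service Unavailable instead of a 200 body. A hedged client-side sketch — not part of this PR; the endpoint, crate choice and timings are placeholders — of how a sender can back off on that signal:

    use std::time::Duration;

    async fn ingest_with_backoff(client: &reqwest::Client, body: String) -> reqwest::Result<()> {
        let url = "http://localhost:5080/api/default/default/_json"; // placeholder endpoint
        for attempt in 0u64..5 {
            let resp = client.post(url).body(body.clone()).send().await?;
            if resp.status() != reqwest::StatusCode::SERVICE_UNAVAILABLE {
                return Ok(()); // accepted, or a non-retryable status handled elsewhere
            }
            // the ingester is applying back-pressure; wait and retry
            tokio::time::sleep(Duration::from_millis(200 * (attempt + 1))).await;
        }
        Ok(())
    }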

@@ -12,6 +12,7 @@ arrow-schema.workspace = true
bytes.workspace = true
byteorder.workspace = true
chrono.workspace = true
futures.workspace = true
hashbrown.workspace = true
indexmap.workspace = true
itertools.workspace = true

@@ -25,9 +25,6 @@ pub enum Error {
WalError {
source: wal::Error,
},
Message {
message: String,
},
OpenFileError {
source: io::Error,
path: PathBuf,
@@ -90,4 +87,8 @@ pub enum Error {
WriteParquetRecordBatchError {
source: parquet::errors::ParquetError,
},
TokioJoinError {
source: tokio::task::JoinError,
},
MemoryTableOverflowError {},
}

@@ -16,14 +16,15 @@
use std::{path::PathBuf, sync::Arc};

use arrow_schema::Schema;
use config::metrics;
use config::{metrics, CONFIG};
use futures::future::try_join_all;
use once_cell::sync::Lazy;
use snafu::ResultExt;
use tokio::time;
use tokio::{sync::Semaphore, task};

use crate::{
entry::RecordBatchEntry,
errors::{DeleteFileSnafu, RenameFileSnafu, Result, WriteDataSnafu},
errors::{DeleteFileSnafu, RenameFileSnafu, Result, TokioJoinSnafu, WriteDataSnafu},
memtable::MemTable,
rwmap::RwIndexMap,
writer::WriterKey,
@@ -63,7 +64,8 @@ impl Immutable {
}
}

pub(crate) async fn persist(&self, wal_path: &PathBuf) -> Result<()> {
pub(crate) async fn persist(&self, wal_path: &PathBuf) -> Result<i64> {
let mut persist_size = 0;
// 1. dump memtable to disk
let paths = self
.memtable
@@ -73,39 +75,70 @@ impl Immutable {
let done_path = wal_path.with_extension("lock");
let lock_data = paths
.iter()
.map(|p| p.to_string_lossy())
.map(|(p, _)| p.to_string_lossy())
.collect::<Vec<_>>()
.join("\n");
std::fs::write(&done_path, lock_data.as_bytes()).context(WriteDataSnafu)?;
// 3. delete wal file
std::fs::remove_file(wal_path).context(DeleteFileSnafu { path: wal_path })?;
// 4. rename the tmp files to parquet files
for path in paths {
for (path, size) in paths {
persist_size += size;
let parquet_path = path.with_extension("parquet");
std::fs::rename(&path, &parquet_path).context(RenameFileSnafu { path: &path })?;
}
// 5. delete the lock file
std::fs::remove_file(&done_path).context(DeleteFileSnafu { path: &done_path })?;
Ok(())
Ok(persist_size)
}
}

pub(crate) async fn persist() -> Result<()> {
loop {
let r = IMMUTABLES.read().await;
let Some((path, immutable)) = r.first() else {
let r = IMMUTABLES.read().await;
let n = r.len();
let mut paths = Vec::with_capacity(n);
for item in r.iter() {
if paths.len() >= n {
break;
};
let path = path.clone();
// persist entry to local disk
immutable.persist(&path).await?;
drop(r);

// remove entry from IMMUTABLES
IMMUTABLES.write().await.remove(&path);
metrics::INGEST_MEMTABLE_FILES.with_label_values(&[]).dec();

time::sleep(time::Duration::from_millis(10)).await;
}
paths.push(item.0.clone());
}
drop(r);

let mut tasks = Vec::with_capacity(paths.len());
let semaphore = Arc::new(Semaphore::new(CONFIG.limit.file_move_thread_num));
for path in paths {
let permit = semaphore.clone().acquire_owned().await.unwrap();
let task: task::JoinHandle<Result<Option<(PathBuf, i64)>>> = task::spawn(async move {
let r = IMMUTABLES.read().await;
let Some(immutable) = r.get(&path) else {
drop(permit);
return Ok(None);
};
// persist entry to local disk
let ret = immutable.persist(&path).await;
drop(permit);
ret.map(|size| Some((path, size)))
});
tasks.push(task);
}

// remove entry from IMMUTABLES
let tasks = try_join_all(tasks).await.context(TokioJoinSnafu)?;
let mut rw = IMMUTABLES.write().await;
for task in tasks {
if let Some((path, size)) = task? {
log::info!("[INGESTER] persist file: {:?}, size: {}", &path, size);
// remove entry
rw.remove(&path);
// update metrics
metrics::INGEST_MEMTABLE_BYTES
.with_label_values(&[])
.sub(size);
metrics::INGEST_MEMTABLE_FILES.with_label_values(&[]).dec();
}
}
rw.shrink_to_fit();

Ok(())
}
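persist() no longer drains one immutable memtable per 10 ms tick; it snapshots the queued WAL paths, persists them concurrently behind a Semaphore sized by file_move_thread_num, joins all tasks, and only then prunes IMMUTABLES and updates the byte/file gauges. A generic, hedged sketch of that bounded-concurrency shape (tokio only; `persist_one` stands in for Immutable::persist):

    use std::sync::Arc;
    use tokio::{sync::Semaphore, task};

    async fn persist_all(paths: Vec<String>, max_concurrency: usize) -> Vec<i64> {
        let semaphore = Arc::new(Semaphore::new(max_concurrency));
        let mut tasks = Vec::with_capacity(paths.len());
        for path in paths {
            // acquire before spawning so at most `max_concurrency` persists run at once
            let permit = semaphore.clone().acquire_owned().await.unwrap();
            tasks.push(task::spawn(async move {
                let size = persist_one(&path).await;
                drop(permit); // free the slot as soon as this file is done
                size
            }));
        }
        let mut sizes = Vec::with_capacity(tasks.len());
        for t in tasks {
            sizes.push(t.await.unwrap()); // join everything before touching shared state
        }
        sizes
    }

    async fn persist_one(_path: &str) -> i64 {
        0 // stand-in: write the parquet files and return the persisted byte size
    }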

@@ -25,8 +25,7 @@ mod writer;

pub use entry::Entry;
pub use immutable::read_from_immutable;
use tokio::time;
pub use writer::{get_writer, read_from_memtable};
pub use writer::{check_memtable_size, get_writer, read_from_memtable};

pub async fn init() -> errors::Result<()> {
// check uncompleted parquet files, need delete those files
@@ -37,8 +36,10 @@ pub async fn init() -> errors::Result<()> {

// start a job to dump immutable data to disk
tokio::task::spawn(async move {
// immutable persist every 10 seconds
let mut interval = time::interval(time::Duration::from_secs(10));
// immutable persist every 10 (default) seconds
let mut interval = tokio::time::interval(tokio::time::Duration::from_secs(
config::CONFIG.limit.mem_persist_interval,
));
interval.tick().await; // the first tick is immediate
loop {
if let Err(e) = immutable::persist().await {

@@ -60,7 +60,7 @@ impl MemTable {
thread_id: usize,
org_id: &str,
stream_type: &str,
) -> Result<Vec<PathBuf>> {
) -> Result<Vec<(PathBuf, i64)>> {
let mut paths = Vec::new();
let r = self.streams.read().await;
for (stream_name, stream) in r.iter() {

@@ -71,7 +71,7 @@ impl Partition {
org_id: &str,
stream_type: &str,
stream_name: &str,
) -> Result<Vec<PathBuf>> {
) -> Result<Vec<(PathBuf, i64)>> {
let r = self.files.read().await;
let mut paths = Vec::with_capacity(r.len());
let mut path = PathBuf::from(&CONFIG.common.data_wal_dir);
@@ -119,9 +119,6 @@ impl Partition {
.context(WriteFileSnafu { path: path.clone() })?;

// update metrics
metrics::INGEST_MEMTABLE_BYTES
.with_label_values(&[])
.sub(file_meta.original_size);
metrics::INGEST_WAL_USED_BYTES
.with_label_values(&[&org_id, &stream_name, stream_type])
.add(buf_parquet.len() as i64);
@@ -129,7 +126,7 @@ impl Partition {
.with_label_values(&[&org_id, &stream_name, stream_type])
.inc_by(buf_parquet.len() as u64);

paths.push(path);
paths.push((path, file_meta.original_size));
}
Ok(paths)
}

@@ -61,7 +61,7 @@ impl Stream {
org_id: &str,
stream_type: &str,
stream_name: &str,
) -> Result<Vec<PathBuf>> {
) -> Result<Vec<(PathBuf, i64)>> {
let mut paths = Vec::new();
let r = self.partitions.read().await;
for (_, partition) in r.iter() {

@@ -20,7 +20,7 @@ use std::{
sync::Arc,
};

use config::{utils::schema::infer_json_schema_from_iterator, CONFIG};
use config::{utils::schema::infer_json_schema_from_values, CONFIG};
use snafu::ResultExt;

use crate::{errors::*, immutable, memtable, writer::WriterKey};
@@ -34,17 +34,14 @@ use crate::{errors::*, immutable, memtable, writer::WriterKey};
// 5. delete the lock file
//
// so, there are some cases that the process is not completed:
// 1. the process is killed before step 2, so there are some .par files and have
// no lock file, need delete those files
// 2. the process is killed before step 3, so there are some .par files and have
// lock file, the files actually wrote to disk completely, need to continue
// step 3, 4 and 5
// 3. the process is killed before step 4, so there are some .par files and have
// lock file, the files actually wrote to disk completely, need to continue
// step 4 and 5
// 4. the process is killed before step 5, so there are some .parquet files and
// have lock file, the files actually wrote to disk completely, need to
// continue step 5
// 1. the process is killed before step 2, so there are some .par files and have no lock file, need
// delete those files
// 2. the process is killed before step 3, so there are some .par files and have lock file, the
// files actually wrote to disk completely, need to continue step 3, 4 and 5
// 3. the process is killed before step 4, so there are some .par files and have lock file, the
// files actually wrote to disk completely, need to continue step 4 and 5
// 4. the process is killed before step 5, so there are some .parquet files and have lock file, the
// files actually wrote to disk completely, need to continue step 5
pub(crate) async fn check_uncompleted_parquet_files() -> Result<()> {
// 1. get all .lock files
let wal_dir = PathBuf::from(&CONFIG.common.data_wal_dir).join("logs");
@@ -54,8 +51,7 @@ pub(crate) async fn check_uncompleted_parquet_files() -> Result<()> {
})?;
let lock_files = scan_files(wal_dir, "lock");

// 2. check if there is a .wal file with the same name, delete it and rename the
// .par file to .parquet
// 2. check if there is a .wal file with same name, delete it and rename the .par to .parquet
for lock_file in lock_files.iter() {
log::warn!("found uncompleted wal file: {:?}", lock_file);
let wal_file = lock_file.with_extension("wal");
@@ -151,9 +147,8 @@ pub(crate) async fn replay_wal_files() -> Result<()> {
let entry = super::Entry::from_bytes(&entry)?;
i += 1;
total += entry.data.len();
let schema =
infer_json_schema_from_iterator(entry.data.iter().cloned().map(Ok), stream_type)
.context(InferJsonSchemaSnafu)?;
let schema = infer_json_schema_from_values(entry.data.iter().cloned(), stream_type)
.context(InferJsonSchemaSnafu)?;
memtable.write(Arc::new(schema), entry).await?;
}
log::warn!(
@ -23,7 +23,7 @@ use std::{
|
|||
|
||||
use arrow_schema::Schema;
|
||||
use chrono::{Duration, Utc};
|
||||
use config::CONFIG;
|
||||
use config::{metrics, CONFIG};
|
||||
use once_cell::sync::Lazy;
|
||||
use snafu::ResultExt;
|
||||
use tokio::sync::{Mutex, RwLock};
|
||||
|
@ -60,6 +60,16 @@ pub struct Writer {
|
|||
created_at: AtomicI64,
|
||||
}
|
||||
|
||||
// check total memory size
|
||||
pub fn check_memtable_size() -> Result<()> {
|
||||
let total_mem_size = metrics::INGEST_MEMTABLE_BYTES.with_label_values(&[]).get();
|
||||
if total_mem_size >= CONFIG.limit.mem_table_max_size as i64 {
|
||||
Err(Error::MemoryTableOverflowError {})
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a writer for a given org_id and stream_type
|
||||
pub async fn get_writer(thread_id: usize, org_id: &str, stream_type: &str) -> Arc<Writer> {
|
||||
let key = WriterKey::new(org_id, stream_type);
|
||||
|
@ -103,6 +113,13 @@ impl Writer {
let wal_dir = PathBuf::from(&CONFIG.common.data_wal_dir)
.join("logs")
.join(thread_id.to_string());
log::info!(
"[INGESTER] create file: {}/{}/{}/{}.wal",
wal_dir.display().to_string(),
&key.org_id,
&key.stream_type,
wal_id
);
Self {
thread_id,
key: key.clone(),

@ -112,7 +129,7 @@ impl Writer {
&key.org_id,
&key.stream_type,
wal_id,
CONFIG.limit.max_file_size_on_disk,
CONFIG.limit.max_file_size_on_disk as u64,
)
.expect("wal file create error"),
)),
@ -136,10 +153,17 @@ impl Writer {
&self.key.org_id,
&self.key.stream_type,
wal_id,
CONFIG.limit.max_file_size_on_disk,
CONFIG.limit.max_file_size_on_disk as u64,
)
.context(WalSnafu)?;
let old_wal = std::mem::replace(&mut *wal, new_wal);
log::info!(
"[INGESTER] create file: {}/{}/{}/{}.wal",
self.thread_id,
&self.key.org_id,
&self.key.stream_type,
wal_id
);

// rotation memtable
let mut mem = self.memtable.write().await;
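The rotation above leans on `std::mem::replace`: the new WAL is swapped in behind the lock and the old handle comes back out so it can be flushed and closed. The same pattern in isolation, with `String` standing in for the WAL handle:

```rust
use std::mem;

use tokio::sync::Mutex;

// Swap a guarded resource for a fresh one and return the old value so the
// caller can finish it off (flush, close, upload, ...).
async fn rotate(current: &Mutex<String>, next: String) -> String {
    let mut guard = current.lock().await;
    mem::replace(&mut *guard, next)
}
```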
@ -180,9 +204,10 @@ impl Writer {

/// Check if the wal file size is over the threshold or the file is too old
async fn check_threshold(&self, written_size: (usize, usize), data_size: usize) -> bool {
let (compressed_size, _uncompressed_size) = written_size;
let (compressed_size, uncompressed_size) = written_size;
compressed_size > 0
&& (compressed_size + data_size > CONFIG.limit.max_file_size_on_disk as usize
&& (compressed_size + data_size > CONFIG.limit.max_file_size_on_disk
|| uncompressed_size + data_size > CONFIG.limit.mem_file_max_size
|| self.created_at.load(Ordering::Relaxed)
+ Duration::seconds(CONFIG.limit.max_file_retention_time as i64)
.num_microseconds()
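The hunk above is cut off after `.num_microseconds()`, but the shape of the predicate is clear: rotate when something has been written and either the on-disk size, the in-memory size, or the file age exceeds its limit. A self-contained restatement under that assumption, with the limits passed in as plain arguments instead of being read from `CONFIG` and the `created_at` atomic:

```rust
use chrono::{Duration, Utc};

// Assumed reading of check_threshold: the age comparison against "now" is not
// visible in the hunk and is reconstructed here.
fn should_rotate(
    compressed_size: usize,
    uncompressed_size: usize,
    data_size: usize,
    max_file_size_on_disk: usize,
    mem_file_max_size: usize,
    max_file_retention_secs: i64,
    created_at_micros: i64,
) -> bool {
    let retention_micros = Duration::seconds(max_file_retention_secs)
        .num_microseconds()
        .unwrap_or_default();
    compressed_size > 0
        && (compressed_size + data_size > max_file_size_on_disk
            || uncompressed_size + data_size > mem_file_max_size
            || created_at_micros + retention_micros <= Utc::now().timestamp_micros())
}
```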
@ -26,7 +26,7 @@ use config::{
|
|||
metrics,
|
||||
utils::{
|
||||
parquet::new_parquet_writer,
|
||||
schema::{infer_json_schema_from_iterator, infer_json_schema_from_seekable},
|
||||
schema::{infer_json_schema_from_seekable, infer_json_schema_from_values},
|
||||
},
|
||||
CONFIG,
|
||||
};
|
||||
|
@ -36,6 +36,7 @@ use tokio::{sync::Semaphore, task, time};
|
|||
use crate::{
|
||||
common::{
|
||||
infra::{cluster, storage, wal},
|
||||
meta::stream::StreamParams,
|
||||
utils::{file::scan_files, json, stream::populate_file_meta},
|
||||
},
|
||||
service::{
|
||||
|
@ -53,10 +54,10 @@ pub async fn run() -> Result<(), anyhow::Error> {
|
|||
}
|
||||
interval.tick().await;
|
||||
if let Err(e) = move_files_to_storage().await {
|
||||
log::error!("Error moving disk files to remote: {}", e);
|
||||
log::error!("Error moving json files to remote: {}", e);
|
||||
}
|
||||
}
|
||||
log::info!("job::files::disk is stopped");
|
||||
log::info!("job::files::json is stopped");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -84,8 +85,9 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
let columns = file_path.splitn(5, '/').collect::<Vec<&str>>();

// eg: files/default/logs/olympics/0/2023/08/21/08/8b8a5451bbe1c44b/
// 7099303408192061440f3XQ2p.json eg: files/default/traces/default/0/
// 2023/09/04/05/default/service_name=ingester/7104328279989026816guOA4t.json
// 7099303408192061440f3XQ2p.json
// eg: files/default/traces/default/0/023/09/04/05/default/
// service_name=ingester/7104328279989026816guOA4t.json
// let _ = columns[0].to_string(); // files/
let org_id = columns[1].to_string();
let stream_type = StreamType::from(columns[2]);
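The `splitn(5, '/')` call depends on the layout shown in the `eg:` comments: the first four segments are `files`, the org, the stream type, and the stream name, and everything after that stays in one piece. A small check using the example path from the comment above:

```rust
fn main() {
    let file_path = "files/default/logs/olympics/0/2023/08/21/08/8b8a5451bbe1c44b/7099303408192061440f3XQ2p.json";
    let columns = file_path.splitn(5, '/').collect::<Vec<&str>>();
    assert_eq!(columns[0], "files");
    assert_eq!(columns[1], "default"); // org_id
    assert_eq!(columns[2], "logs"); // stream_type
    assert_eq!(columns[3], "olympics"); // stream name
    // the date/hour directories and the file name remain unsplit
    assert_eq!(
        columns[4],
        "0/2023/08/21/08/8b8a5451bbe1c44b/7099303408192061440f3XQ2p.json"
    );
}
```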
@ -98,20 +100,20 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
}
|
||||
|
||||
// check the file is using for write
|
||||
// if wal::check_in_use(
|
||||
// StreamParams::new(&org_id, &stream_name, stream_type),
|
||||
// &file_name,
|
||||
// )
|
||||
// .await
|
||||
// {
|
||||
// // println!("file is using for write, skip, {}", file_name);
|
||||
// continue;
|
||||
// }
|
||||
log::info!("[JOB] convert disk file: {}", file);
|
||||
if wal::check_in_use(
|
||||
StreamParams::new(&org_id, &stream_name, stream_type),
|
||||
&file_name,
|
||||
)
|
||||
.await
|
||||
{
|
||||
// println!("file is using for write, skip, {}", file_name);
|
||||
continue;
|
||||
}
|
||||
// log::info!("[JOB] convert json file: {}", file);
|
||||
|
||||
// check if we are allowed to ingest or just delete the file
|
||||
if db::compact::retention::is_deleting_stream(&org_id, &stream_name, stream_type, None) {
|
||||
log::info!(
|
||||
log::warn!(
|
||||
"[JOB] the stream [{}/{}/{}] is deleting, just delete file: {}",
|
||||
&org_id,
|
||||
stream_type,
|
||||
|
@ -120,7 +122,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
);
|
||||
if let Err(e) = tokio::fs::remove_file(&local_file).await {
|
||||
log::error!(
|
||||
"[JOB] Failed to remove disk file from disk: {}, {}",
|
||||
"[JOB] Failed to remove json file from disk: {}, {}",
|
||||
local_file,
|
||||
e
|
||||
);
|
||||
|
@ -133,7 +135,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
let ret =
|
||||
upload_file(&org_id, &stream_name, stream_type, &local_file, &file_name).await;
|
||||
if let Err(e) = ret {
|
||||
log::error!("[JOB] Error while uploading disk file to storage {}", e);
|
||||
log::error!("[JOB] Error while uploading json file to storage {}", e);
|
||||
drop(permit);
|
||||
return Ok(());
|
||||
}
|
||||
|
@ -142,7 +144,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
let ret = db::file_list::local::set(&key, Some(meta.clone()), false).await;
|
||||
if let Err(e) = ret {
|
||||
log::error!(
|
||||
"[JOB] Failed write disk file meta: {}, error: {}",
|
||||
"[JOB] Failed write json file meta: {}, error: {}",
|
||||
local_file,
|
||||
e.to_string()
|
||||
);
|
||||
|
@ -153,7 +155,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
// check if allowed to delete the file
|
||||
loop {
|
||||
if wal::lock_files_exists(&file_path).await {
|
||||
log::info!(
|
||||
log::warn!(
|
||||
"[JOB] the file is still in use, waiting for a few ms: {}",
|
||||
file_path
|
||||
);
|
||||
|
@ -166,7 +168,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
let ret = tokio::fs::remove_file(&local_file).await;
|
||||
if let Err(e) = ret {
|
||||
log::error!(
|
||||
"[JOB] Failed to remove disk file from disk: {}, {}",
|
||||
"[JOB] Failed to remove json file from disk: {}, {}",
|
||||
local_file,
|
||||
e.to_string()
|
||||
);
|
||||
|
@ -191,7 +193,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
|
||||
for task in tasks {
|
||||
if let Err(e) = task.await {
|
||||
log::error!("[JOB] Error while uploading disk file to storage {}", e);
|
||||
log::error!("[JOB] Error while uploading json file to storage {}", e);
|
||||
};
|
||||
}
|
||||
Ok(())
|
||||
|
@ -207,11 +209,11 @@ async fn upload_file(
|
|||
let mut file = fs::File::open(path_str).unwrap();
|
||||
let file_meta = file.metadata().unwrap();
|
||||
let file_size = file_meta.len();
|
||||
log::info!("[JOB] File upload begin: disk: {}", path_str);
|
||||
log::info!("[JOB] File upload begin: {}", path_str);
|
||||
if file_size == 0 {
|
||||
if let Err(e) = tokio::fs::remove_file(path_str).await {
|
||||
log::error!(
|
||||
"[JOB] Failed to remove disk file from disk: {}, {}",
|
||||
"[JOB] Failed to remove json file from disk: {}, {}",
|
||||
path_str,
|
||||
e
|
||||
);
|
||||
|
@ -261,8 +263,8 @@ async fn upload_file(
|
|||
path_str
|
||||
));
|
||||
}
|
||||
let value_iter = res_records.iter().map(Ok);
|
||||
infer_json_schema_from_iterator(value_iter, stream_type).unwrap()
|
||||
let value_iter = res_records.iter();
|
||||
infer_json_schema_from_values(value_iter, stream_type).unwrap()
|
||||
}
|
||||
};
|
||||
let arrow_schema = Arc::new(inferred_schema);
|
||||
|
@ -347,11 +349,11 @@ async fn upload_file(
|
|||
let file_name = new_file_name.to_owned();
|
||||
match storage::put(&new_file_name, bytes::Bytes::from(buf_parquet)).await {
|
||||
Ok(_) => {
|
||||
log::info!("[JOB] disk file upload succeeded: {}", file_name);
|
||||
log::info!("[JOB] File upload succeeded: {}", file_name);
|
||||
Ok((file_name, file_meta, stream_type))
|
||||
}
|
||||
Err(err) => {
|
||||
log::error!("[JOB] disk file upload error: {:?}", err);
|
||||
log::error!("[JOB] File upload error: {:?}", err);
|
||||
Err(anyhow::anyhow!(err))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
use std::{fs, io::Read, path::Path};
|
||||
use std::{fs, io::Read, path::Path, sync::Arc};
|
||||
|
||||
use config::{
|
||||
meta::stream::{FileMeta, StreamType},
|
||||
|
@ -21,6 +21,7 @@ use config::{
|
|||
utils::parquet::read_metadata,
|
||||
CONFIG,
|
||||
};
|
||||
use parquet::arrow::ParquetRecordBatchStreamBuilder;
|
||||
use tokio::{sync::Semaphore, task, time};
|
||||
|
||||
use crate::{
|
||||
|
@ -28,7 +29,7 @@ use crate::{
|
|||
infra::{cluster, storage, wal},
|
||||
utils::file::scan_files,
|
||||
},
|
||||
service::{db, usage::report_compression_stats},
|
||||
service::{db, schema::schema_evolution, usage::report_compression_stats},
|
||||
};
|
||||
|
||||
pub async fn run() -> Result<(), anyhow::Error> {
|
||||
|
@ -40,10 +41,10 @@ pub async fn run() -> Result<(), anyhow::Error> {
|
|||
}
|
||||
interval.tick().await;
|
||||
if let Err(e) = move_files_to_storage().await {
|
||||
log::error!("Error moving disk files to remote: {}", e);
|
||||
log::error!("Error moving parquet files to remote: {}", e);
|
||||
}
|
||||
}
|
||||
log::info!("job::files::disk is stopped");
|
||||
log::info!("job::files::parquet is stopped");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -71,8 +72,9 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
let columns = file_path.splitn(5, '/').collect::<Vec<&str>>();

// eg: files/default/logs/olympics/0/2023/08/21/08/8b8a5451bbe1c44b/
// 7099303408192061440f3XQ2p.json eg: files/default/traces/default/0/
// 2023/09/04/05/default/service_name=ingester/7104328279989026816guOA4t.json
// 7099303408192061440f3XQ2p.parquet
// eg: files/default/traces/default/0/2023/09/04/05/default/
// service_name=ingester/7104328279989026816guOA4t.parquet
// let _ = columns[0].to_string(); // files/
let org_id = columns[1].to_string();
let stream_type = StreamType::from(columns[2]);
@ -84,21 +86,9 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
file_name = file_name.replace('_', "/");
|
||||
}
|
||||
|
||||
// check the file is using for write
|
||||
// if wal::check_in_use(
|
||||
// StreamParams::new(&org_id, &stream_name, stream_type),
|
||||
// &file_name,
|
||||
// )
|
||||
// .await
|
||||
// {
|
||||
// // println!("file is using for write, skip, {}", file_name);
|
||||
// continue;
|
||||
// }
|
||||
log::info!("[JOB] convert disk file: {}", file);
|
||||
|
||||
// check if we are allowed to ingest or just delete the file
|
||||
if db::compact::retention::is_deleting_stream(&org_id, &stream_name, stream_type, None) {
|
||||
log::info!(
|
||||
log::warn!(
|
||||
"[JOB] the stream [{}/{}/{}] is deleting, just delete file: {}",
|
||||
&org_id,
|
||||
stream_type,
|
||||
|
@ -107,7 +97,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
);
|
||||
if let Err(e) = tokio::fs::remove_file(&local_file).await {
|
||||
log::error!(
|
||||
"[JOB] Failed to remove disk file from disk: {}, {}",
|
||||
"[JOB] Failed to remove parquet file from disk: {}, {}",
|
||||
local_file,
|
||||
e
|
||||
);
|
||||
|
@ -120,7 +110,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
let ret =
|
||||
upload_file(&org_id, &stream_name, stream_type, &local_file, &file_name).await;
|
||||
if let Err(e) = ret {
|
||||
log::error!("[JOB] Error while uploading disk file to storage {}", e);
|
||||
log::error!("[JOB] Error while uploading parquet file to storage {}", e);
|
||||
drop(permit);
|
||||
return Ok(());
|
||||
}
|
||||
|
@ -129,7 +119,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
let ret = db::file_list::local::set(&key, Some(meta.clone()), false).await;
|
||||
if let Err(e) = ret {
|
||||
log::error!(
|
||||
"[JOB] Failed write disk file meta: {}, error: {}",
|
||||
"[JOB] Failed write parquet file meta: {}, error: {}",
|
||||
local_file,
|
||||
e.to_string()
|
||||
);
|
||||
|
@ -140,7 +130,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
// check if allowed to delete the file
|
||||
loop {
|
||||
if wal::lock_files_exists(&file_path).await {
|
||||
log::info!(
|
||||
log::warn!(
|
||||
"[JOB] the file is still in use, waiting for a few ms: {}",
|
||||
file_path
|
||||
);
|
||||
|
@ -153,7 +143,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
let ret = tokio::fs::remove_file(&local_file).await;
|
||||
if let Err(e) = ret {
|
||||
log::error!(
|
||||
"[JOB] Failed to remove disk file from disk: {}, {}",
|
||||
"[JOB] Failed to remove parquet file from disk: {}, {}",
|
||||
local_file,
|
||||
e.to_string()
|
||||
);
|
||||
|
@ -178,7 +168,7 @@ pub async fn move_files_to_storage() -> Result<(), anyhow::Error> {
|
|||
|
||||
for task in tasks {
|
||||
if let Err(e) = task.await {
|
||||
log::error!("[JOB] Error while uploading disk file to storage {}", e);
|
||||
log::error!("[JOB] Error while uploading parquet file to storage {}", e);
|
||||
};
|
||||
}
|
||||
Ok(())
|
||||
|
@ -194,11 +184,11 @@ async fn upload_file(
|
|||
let mut file = fs::File::open(path_str).unwrap();
|
||||
let file_meta = file.metadata().unwrap();
|
||||
let file_size = file_meta.len();
|
||||
log::info!("[JOB] File upload begin: disk: {}", path_str);
|
||||
log::info!("[JOB] File upload begin: {}", path_str);
|
||||
if file_size == 0 {
|
||||
if let Err(e) = tokio::fs::remove_file(path_str).await {
|
||||
log::error!(
|
||||
"[JOB] Failed to remove disk file from disk: {}, {}",
|
||||
"[JOB] Failed to remove parquet file from disk: {}, {}",
|
||||
path_str,
|
||||
e
|
||||
);
|
||||
|
@ -218,15 +208,23 @@ async fn upload_file(
let mut file_meta = read_metadata(&buf_parquet).await?;
file_meta.compressed_size = file_size as i64;

// TODO ?
// schema_evolution(
// org_id,
// stream_name,
// stream_type,
// arrow_schema,
// file_meta.min_ts,
// )
// .await;
// read schema
let schema_reader = std::io::Cursor::new(buf_parquet.clone());
let arrow_reader = ParquetRecordBatchStreamBuilder::new(schema_reader).await?;
let inferred_schema = arrow_reader
.schema()
.as_ref()
.clone()
.with_metadata(std::collections::HashMap::new());

schema_evolution(
org_id,
stream_name,
stream_type,
Arc::new(inferred_schema),
file_meta.min_ts,
)
.await;

let new_file_name =
super::generate_storage_file_name(org_id, stream_type, stream_name, file_name);
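The block above reads the Arrow schema out of the freshly written parquet buffer and strips its metadata before handing it to `schema_evolution`. The same footer read can be expressed with the blocking builder from the `parquet` crate, which may be easier to follow; this is a sketch and not the code the job uses (the diff itself goes through the async `ParquetRecordBatchStreamBuilder`):

```rust
use std::collections::HashMap;

use arrow_schema::Schema;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

// Read the Arrow schema embedded in the parquet footer and drop its metadata,
// mirroring the `.with_metadata(HashMap::new())` step above.
fn schema_from_parquet_bytes(buf: bytes::Bytes) -> parquet::errors::Result<Schema> {
    let builder = ParquetRecordBatchReaderBuilder::try_new(buf)?;
    Ok(builder.schema().as_ref().clone().with_metadata(HashMap::new()))
}
```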
@ -234,11 +232,11 @@ async fn upload_file(
|
|||
let file_name = new_file_name.to_owned();
|
||||
match storage::put(&new_file_name, buf_parquet).await {
|
||||
Ok(_) => {
|
||||
log::info!("[JOB] disk file upload succeeded: {}", file_name);
|
||||
log::info!("[JOB] File upload succeeded: {}", file_name);
|
||||
Ok((file_name, file_meta, stream_type))
|
||||
}
|
||||
Err(err) => {
|
||||
log::error!("[JOB] disk file upload error: {:?}", err);
|
||||
log::error!("[JOB] File upload error: {:?}", err);
|
||||
Err(anyhow::anyhow!(err))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,7 @@ use std::collections::{HashMap, HashSet};
|
|||
use actix_web::http;
|
||||
use arrow_schema::DataType;
|
||||
use chrono::{Duration, Local, TimeZone, Utc};
|
||||
use config::{meta::stream::StreamType, utils::schema_ext::SchemaExt, CONFIG};
|
||||
use config::{meta::stream::StreamType, CONFIG};
|
||||
|
||||
use crate::{
|
||||
common::{
|
||||
|
@ -80,8 +80,7 @@ pub async fn save(
|
|||
|
||||
// before saving alert check column type to decide numeric condition
|
||||
let schema = db::schema::get(org_id, stream_name, stream_type).await?;
|
||||
let fields = schema.to_cloned_fields();
|
||||
if stream_name.is_empty() || fields.is_empty() {
|
||||
if stream_name.is_empty() || schema.fields().is_empty() {
|
||||
return Err(anyhow::anyhow!("Stream {stream_name} not found"));
|
||||
}
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ use tokio::{sync::Semaphore, task::JoinHandle};
|
|||
use crate::{
|
||||
common::{
|
||||
infra::{cache, file_list as infra_file_list, storage},
|
||||
meta::stream::StreamStats,
|
||||
meta::stream::{PartitionTimeLevel, StreamStats},
|
||||
utils::json,
|
||||
},
|
||||
service::{db, file_list, search::datafusion, stream},
|
||||
|
@ -84,6 +84,17 @@ pub async fn merge_by_stream(
)
.unwrap()
.timestamp_micros();
let offset_time_day = Utc
.with_ymd_and_hms(
offset_time.year(),
offset_time.month(),
offset_time.day(),
0,
0,
0,
)
.unwrap()
.timestamp_micros();

// check offset
let time_now: DateTime<Utc> = Utc::now();

@ -98,10 +109,8 @@ pub async fn merge_by_stream(
)
.unwrap()
.timestamp_micros();
// 1. if step_secs less than 1 hour, must wait for at least
// max_file_retention_time
// 2. if step_secs greater than 1 hour, must wait for at least 3 *
// max_file_retention_time
// 1. if step_secs less than 1 hour, must wait for at least max_file_retention_time
// 2. if step_secs greater than 1 hour, must wait for at least 3 * max_file_retention_time
// -- first period: the last hour local file upload to storage, write file list
// -- second period, the last hour file list upload to storage
// -- third period, we can do the merge, so, at least 3 times of

@ -122,12 +131,19 @@ pub async fn merge_by_stream(
return Ok(()); // the time is future, just wait
}

// get current hour all files
let (partition_offset_start, partition_offset_end) = (
offset_time_hour,
offset_time_hour + Duration::hours(1).num_microseconds().unwrap()
- Duration::seconds(1).num_microseconds().unwrap(),
);
// get current hour(day) all files
let (partition_offset_start, partition_offset_end) =
if partition_time_level == PartitionTimeLevel::Daily {
(
offset_time_day,
offset_time_day + Duration::hours(24).num_microseconds().unwrap() - 1,
)
} else {
(
offset_time_hour,
offset_time_hour + Duration::hours(1).num_microseconds().unwrap() - 1,
)
};
let files = file_list::query(
org_id,
stream_name,
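The window selection above is plain interval arithmetic in microseconds: the end of the merge window is one microsecond before the next hour, or before the next day for daily partitions. Restated on its own, assuming the two offsets are already truncated to the hour and the day:

```rust
use chrono::Duration;

// Inclusive [start, end] range, in epoch microseconds, of the partition to merge.
fn partition_window(offset_time_hour: i64, offset_time_day: i64, daily: bool) -> (i64, i64) {
    if daily {
        (
            offset_time_day,
            offset_time_day + Duration::hours(24).num_microseconds().unwrap() - 1,
        )
    } else {
        (
            offset_time_hour,
            offset_time_hour + Duration::hours(1).num_microseconds().unwrap() - 1,
        )
    }
}
```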
@ -264,8 +264,7 @@ pub async fn run_merge() -> Result<(), anyhow::Error> {
}

/// compactor delete files run steps:
/// 1. get pending deleted files from file_list_deleted table, created_at > 2
/// hours
/// 1. get pending deleted files from file_list_deleted table, created_at > 2 hours
/// 2. delete files from storage
pub async fn run_delete_files() -> Result<(), anyhow::Error> {
let now = Utc::now();
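Step 1 only picks up deletions that have been pending for more than two hours; the cutoff is ordinary chrono arithmetic. A tiny illustration of that filter (the `file_list_deleted` access itself is not shown in this hunk, so it is elided here, and the exact comparison used by the compactor is assumed):

```rust
use chrono::{DateTime, Duration, Utc};

// A pending deletion qualifies once its created_at is at least two hours old.
fn ready_for_delete(created_at: DateTime<Utc>, now: DateTime<Utc>) -> bool {
    created_at <= now - Duration::hours(2)
}
```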
|
|
@ -160,7 +160,7 @@ impl DistinctValues {
|
|||
data,
|
||||
Some(&schema_key),
|
||||
);
|
||||
let data = json::Value::Object(data.to_owned());
|
||||
let data = json::Value::Object(data.clone());
|
||||
let data_size = json::to_vec(&data).unwrap_or_default().len();
|
||||
|
||||
let hour_buf = buf.entry(hour_key).or_insert_with(|| SchemaRecords {
|
||||
|
@ -172,7 +172,7 @@ impl DistinctValues {
|
|||
hour_buf.records.push(Arc::new(data));
|
||||
hour_buf.records_size += data_size;
|
||||
}
|
||||
_ = ingestion::write_file(buf, 0, &stream_params, None).await;
|
||||
_ = ingestion::write_file(buf, 0, &stream_params).await;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -196,7 +196,6 @@ pub async fn save_enrichment_data(
|
|||
buf,
|
||||
thread_id,
|
||||
&StreamParams::new(org_id, stream_name, StreamType::EnrichmentTables),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
req_stats.response_time = start.elapsed().as_secs_f64();
|
||||
|
|
|
@ -18,7 +18,7 @@ use opentelemetry_proto::tonic::{
|
|||
metrics::v1::{exemplar, number_data_point},
|
||||
};
|
||||
|
||||
use crate::{common::utils::json, service::ingestion::get_value};
|
||||
use crate::common::utils::json;
|
||||
|
||||
pub fn get_val(attr_val: &Option<&AnyValue>) -> json::Value {
|
||||
match attr_val {
|
||||
|
@ -108,7 +108,7 @@ pub fn get_exemplar_val(attr_val: &Option<exemplar::Value>) -> json::Value {
|
|||
pub fn get_val_for_attr(attr_val: json::Value) -> json::Value {
|
||||
let local_val = attr_val.as_object().unwrap();
|
||||
if let Some((_key, value)) = local_val.into_iter().next() {
|
||||
return serde_json::Value::String(get_value(value));
|
||||
return serde_json::Value::String(super::get_string_value(value));
|
||||
};
|
||||
().into()
|
||||
}
|
||||
|
|
|
@ -40,14 +40,13 @@ use crate::{
|
|||
utils::{
|
||||
flatten,
|
||||
functions::get_vrl_compiler_config,
|
||||
json::{Map, Value},
|
||||
json::{self, Map, Value},
|
||||
},
|
||||
},
|
||||
service::{db, format_partition_key, stream::stream_settings},
|
||||
};
|
||||
|
||||
pub mod grpc;
|
||||
pub mod otlp_json;
|
||||
|
||||
pub type TriggerAlertData = Option<Vec<(Alert, Vec<Map<String, Value>>)>>;
|
||||
|
||||
|
@ -261,14 +260,13 @@ pub fn register_stream_transforms(
|
|||
(local_trans, stream_vrl_map)
|
||||
}
|
||||
|
||||
pub fn apply_stream_transform<'a>(
|
||||
local_trans: &Vec<StreamTransform>,
|
||||
value: &'a Value,
|
||||
stream_vrl_map: &'a AHashMap<String, VRLResultResolver>,
|
||||
pub fn apply_stream_transform(
|
||||
local_trans: &[StreamTransform],
|
||||
mut value: Value,
|
||||
stream_vrl_map: &AHashMap<String, VRLResultResolver>,
|
||||
stream_name: &str,
|
||||
runtime: &mut Runtime,
|
||||
) -> Result<Value, anyhow::Error> {
|
||||
let mut value = value.clone();
|
||||
for trans in local_trans {
|
||||
let func_key = format!("{stream_name}/{}", trans.transform.name);
|
||||
if stream_vrl_map.contains_key(&func_key) && !value.is_null() {
|
||||
|
@ -276,7 +274,7 @@ pub fn apply_stream_transform<'a>(
|
|||
value = apply_vrl_fn(runtime, vrl_runtime, &value);
|
||||
}
|
||||
}
|
||||
flatten::flatten(&value)
|
||||
flatten::flatten(value)
|
||||
}
|
||||
|
||||
pub async fn chk_schema_by_record(
|
||||
|
@ -324,7 +322,6 @@ pub async fn write_file(
|
|||
buf: AHashMap<String, SchemaRecords>,
|
||||
thread_id: usize,
|
||||
stream: &StreamParams,
|
||||
_partition_time_level: Option<PartitionTimeLevel>,
|
||||
) -> RequestStats {
|
||||
let mut req_stats = RequestStats::default();
|
||||
for (hour_key, entry) in buf {
|
||||
|
@ -332,8 +329,6 @@ pub async fn write_file(
|
|||
continue;
|
||||
}
|
||||
let entry_records = entry.records.len();
|
||||
|
||||
// -- call new ingester
|
||||
let writer =
|
||||
ingester::get_writer(thread_id, &stream.org_id, &stream.stream_type.to_string()).await;
|
||||
writer
|
||||
|
@ -349,7 +344,6 @@ pub async fn write_file(
|
|||
)
|
||||
.await
|
||||
.unwrap();
|
||||
// -- end call new ingester
|
||||
|
||||
req_stats.size += entry.records_size as f64 / SIZE_IN_MB;
|
||||
req_stats.records += entry_records as i64;
|
||||
|
@ -357,22 +351,6 @@ pub async fn write_file(
|
|||
req_stats
|
||||
}
|
||||
|
||||
pub fn get_value(value: &Value) -> String {
|
||||
if value.is_boolean() {
|
||||
value.as_bool().unwrap().to_string()
|
||||
} else if value.is_f64() {
|
||||
value.as_f64().unwrap().to_string()
|
||||
} else if value.is_i64() {
|
||||
value.as_i64().unwrap().to_string()
|
||||
} else if value.is_u64() {
|
||||
value.as_u64().unwrap().to_string()
|
||||
} else if value.is_string() {
|
||||
value.as_str().unwrap().to_string()
|
||||
} else {
|
||||
value.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_ingestion_allowed(org_id: &str, stream_name: Option<&str>) -> Option<anyhow::Error> {
|
||||
if !cluster::is_ingester(&cluster::LOCAL_NODE_ROLE) {
|
||||
return Some(anyhow::anyhow!("not an ingester"));
|
||||
|
@ -391,6 +369,119 @@ pub fn is_ingestion_allowed(org_id: &str, stream_name: Option<&str>) -> Option<a
None
}

pub fn get_float_value(val: &Value) -> f64 {
match val {
Value::String(v) => v.parse::<f64>().unwrap_or(0.0),
Value::Number(v) => v.as_f64().unwrap_or(0.0),
_ => 0.0,
}
}

pub fn get_int_value(val: &Value) -> i64 {
match val {
Value::String(v) => v.parse::<i64>().unwrap_or(0),
Value::Number(v) => v.as_i64().unwrap_or(0),
_ => 0,
}
}

pub fn get_string_value(value: &Value) -> String {
if value.is_boolean() {
value.as_bool().unwrap_or_default().to_string()
} else if value.is_i64() {
value.as_i64().unwrap_or_default().to_string()
} else if value.is_u64() {
value.as_u64().unwrap_or_default().to_string()
} else if value.is_f64() {
value.as_f64().unwrap_or_default().to_string()
} else if value.is_string() {
value.as_str().unwrap_or_default().to_string()
} else {
value.to_string()
}
}

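These helpers coerce heterogeneous JSON scalars to the primitive the caller needs, falling back to `0`, `0.0`, or the raw string instead of returning an error. A few example conversions, assuming the crate's `json::Value` is the usual serde_json value type:

```rust
use serde_json::json;

fn main() {
    // numbers and numeric strings both coerce
    assert_eq!(get_int_value(&json!(42)), 42);
    assert_eq!(get_int_value(&json!("42")), 42);
    assert_eq!(get_float_value(&json!("3.5")), 3.5);
    // non-numeric input falls back to the zero value instead of failing
    assert_eq!(get_int_value(&json!(true)), 0);
    // strings come back unquoted; other scalars go through to_string()
    assert_eq!(get_string_value(&json!("hello")), "hello");
    assert_eq!(get_string_value(&json!(true)), "true");
}
```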
pub fn get_val_for_attr(attr_val: &Value) -> Value {
|
||||
let local_val = attr_val.as_object().unwrap();
|
||||
if let Some((key, value)) = local_val.into_iter().next() {
|
||||
match key.as_str() {
|
||||
"stringValue" | "string_value" => {
|
||||
return json::json!(get_string_value(value));
|
||||
}
|
||||
"boolValue" | "bool_value" => {
|
||||
return json::json!(value.as_bool().unwrap_or(false).to_string());
|
||||
}
|
||||
"intValue" | "int_value" => {
|
||||
return json::json!(get_int_value(value).to_string());
|
||||
}
|
||||
"doubleValue" | "double_value" => {
|
||||
return json::json!(get_float_value(value).to_string());
|
||||
}
|
||||
|
||||
"bytesValue" | "bytes_value" => {
|
||||
return json::json!(value.as_str().unwrap_or("").to_string());
|
||||
}
|
||||
|
||||
"arrayValue" | "array_value" => {
|
||||
let mut vals = vec![];
|
||||
for item in value
|
||||
.get("values")
|
||||
.unwrap()
|
||||
.as_array()
|
||||
.unwrap_or(&vec![])
|
||||
.iter()
|
||||
{
|
||||
vals.push(get_val_for_attr(item));
|
||||
}
|
||||
return json::json!(vals);
|
||||
}
|
||||
|
||||
"kvlistValue" | "kvlist_value" => {
|
||||
let mut vals = json::Map::new();
|
||||
for item in value
|
||||
.get("values")
|
||||
.unwrap()
|
||||
.as_array()
|
||||
.unwrap_or(&vec![])
|
||||
.iter()
|
||||
{
|
||||
let mut key = item.get("key").unwrap().as_str().unwrap_or("").to_string();
|
||||
flatten::format_key(&mut key);
|
||||
let value = item.get("value").unwrap().clone();
|
||||
vals.insert(key, get_val_for_attr(&value));
|
||||
}
|
||||
return json::json!(vals);
|
||||
}
|
||||
|
||||
_ => {
|
||||
return json::json!(get_string_value(value));
|
||||
}
|
||||
}
|
||||
};
|
||||
attr_val.clone()
|
||||
}
|
||||
|
||||
pub fn get_val_with_type_retained(val: &Value) -> Value {
|
||||
match val {
|
||||
Value::String(val) => {
|
||||
json::json!(val)
|
||||
}
|
||||
Value::Bool(val) => {
|
||||
json::json!(val)
|
||||
}
|
||||
Value::Number(val) => {
|
||||
json::json!(val)
|
||||
}
|
||||
Value::Array(val) => {
|
||||
json::json!(val)
|
||||
}
|
||||
Value::Object(val) => {
|
||||
json::json!(val)
|
||||
}
|
||||
Value::Null => Value::Null,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::HashMap;
|
||||
|
|
|
@ -1,104 +0,0 @@
|
|||
use crate::common::utils::{flatten::format_key, json};
|
||||
|
||||
pub fn get_float_value(val: &json::Value) -> f64 {
|
||||
match val {
|
||||
json::Value::String(v) => v.parse::<f64>().unwrap_or(0.0),
|
||||
json::Value::Number(v) => v.as_f64().unwrap_or(0.0),
|
||||
_ => 0.0,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_int_value(val: &json::Value) -> i64 {
|
||||
match val {
|
||||
json::Value::String(v) => v.parse::<i64>().unwrap_or(0),
|
||||
json::Value::Number(v) => v.as_i64().unwrap_or(0),
|
||||
_ => 0,
|
||||
}
|
||||
}
|
||||
pub fn get_string_value(val: &json::Value) -> String {
|
||||
match val {
|
||||
json::Value::String(v) => v.to_string(),
|
||||
json::Value::Number(v) => v.as_i64().unwrap_or(0).to_string(),
|
||||
_ => "".to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_val_for_attr(attr_val: &json::Value) -> json::Value {
|
||||
let local_val = attr_val.as_object().unwrap();
|
||||
if let Some((key, value)) = local_val.into_iter().next() {
|
||||
match key.as_str() {
|
||||
"stringValue" | "string_value" => {
|
||||
return json::json!(get_string_value(value));
|
||||
}
|
||||
"boolValue" | "bool_value" => {
|
||||
return json::json!(value.as_bool().unwrap_or(false).to_string());
|
||||
}
|
||||
"intValue" | "int_value" => {
|
||||
return json::json!(get_int_value(value).to_string());
|
||||
}
|
||||
"doubleValue" | "double_value" => {
|
||||
return json::json!(get_float_value(value).to_string());
|
||||
}
|
||||
|
||||
"bytesValue" | "bytes_value" => {
|
||||
return json::json!(value.as_str().unwrap_or("").to_string());
|
||||
}
|
||||
|
||||
"arrayValue" | "array_value" => {
|
||||
let mut vals = vec![];
|
||||
for item in value
|
||||
.get("values")
|
||||
.unwrap()
|
||||
.as_array()
|
||||
.unwrap_or(&vec![])
|
||||
.iter()
|
||||
{
|
||||
vals.push(get_val_for_attr(item));
|
||||
}
|
||||
return json::json!(vals);
|
||||
}
|
||||
|
||||
"kvlistValue" | "kvlist_value" => {
|
||||
let mut vals = json::Map::new();
|
||||
for item in value
|
||||
.get("values")
|
||||
.unwrap()
|
||||
.as_array()
|
||||
.unwrap_or(&vec![])
|
||||
.iter()
|
||||
{
|
||||
let key = item.get("key").unwrap().as_str().unwrap_or("").to_string();
|
||||
let value = item.get("value").unwrap().clone();
|
||||
vals.insert(format_key(&key), get_val_for_attr(&value));
|
||||
}
|
||||
return json::json!(vals);
|
||||
}
|
||||
|
||||
_ => {
|
||||
return json::json!(get_string_value(value));
|
||||
}
|
||||
}
|
||||
};
|
||||
attr_val.clone()
|
||||
}
|
||||
|
||||
pub fn get_val_with_type_retained(val: &json::Value) -> json::Value {
|
||||
match val {
|
||||
json::Value::String(val) => {
|
||||
json::json!(val)
|
||||
}
|
||||
json::Value::Bool(val) => {
|
||||
json::json!(val)
|
||||
}
|
||||
json::Value::Number(val) => {
|
||||
json::json!(val)
|
||||
}
|
||||
json::Value::Array(val) => {
|
||||
json::json!(val)
|
||||
}
|
||||
json::Value::Object(val) => {
|
||||
json::json!(val)
|
||||
}
|
||||
json::Value::Null => json::Value::Null,
|
||||
}
|
||||
}
|
|
@ -40,7 +40,7 @@ use crate::{
|
|||
service::{
|
||||
db, distinct_values,
|
||||
ingestion::{evaluate_trigger, write_file, TriggerAlertData},
|
||||
schema::stream_schema_exists,
|
||||
schema::{get_upto_discard_error, stream_schema_exists},
|
||||
usage::report_request_usage_stats,
|
||||
},
|
||||
};
|
||||
|
@ -63,6 +63,11 @@ pub async fn ingest(
|
|||
return Err(anyhow::anyhow!("Quota exceeded for this organization"));
|
||||
}
|
||||
|
||||
// check memtable
|
||||
if let Err(e) = ingester::check_memtable_size() {
|
||||
return Err(anyhow::Error::msg(e.to_string()));
|
||||
}
|
||||
|
||||
// let mut errors = false;
|
||||
let mut bulk_res = BulkResponse {
|
||||
took: 0,
|
||||
|
@ -70,7 +75,7 @@ pub async fn ingest(
|
|||
items: vec![],
|
||||
};
|
||||
|
||||
let mut min_ts =
|
||||
let min_ts =
|
||||
(Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto)).timestamp_micros();
|
||||
|
||||
let mut runtime = crate::service::ingestion::init_functions_runtime();
|
||||
|
@ -164,13 +169,13 @@ pub async fn ingest(
|
|||
let key = format!("{org_id}/{}/{stream_name}", StreamType::Logs);
|
||||
|
||||
// JSON Flattening
|
||||
let mut value = flatten::flatten(&value)?;
|
||||
let mut value = flatten::flatten(value)?;
|
||||
|
||||
if let Some(transforms) = stream_transform_map.get(&key) {
|
||||
let mut ret_value = value.clone();
|
||||
ret_value = crate::service::ingestion::apply_stream_transform(
|
||||
transforms,
|
||||
&ret_value,
|
||||
ret_value,
|
||||
&stream_vrl_map,
|
||||
&stream_name,
|
||||
&mut runtime,
|
||||
|
@ -222,10 +227,9 @@ pub async fn ingest(
|
|||
None => Utc::now().timestamp_micros(),
|
||||
};
|
||||
// check ingestion time
|
||||
let earliest_time = Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto);
|
||||
if timestamp < earliest_time.timestamp_micros() {
|
||||
if timestamp < min_ts {
|
||||
bulk_res.errors = true;
|
||||
let failure_reason = Some(super::get_upto_discard_error());
|
||||
let failure_reason = Some(get_upto_discard_error().to_string());
|
||||
add_record_status(
|
||||
stream_name.clone(),
|
||||
doc_id.clone(),
|
||||
|
@ -237,9 +241,6 @@ pub async fn ingest(
|
|||
);
|
||||
continue;
|
||||
}
|
||||
if timestamp < min_ts {
|
||||
min_ts = timestamp;
|
||||
}
|
||||
local_val.insert(
|
||||
CONFIG.common.column_timestamp.clone(),
|
||||
json::Value::Number(timestamp.into()),
|
||||
|
@ -257,7 +258,7 @@ pub async fn ingest(
|
|||
let mut status = RecordStatus::default();
|
||||
let need_trigger = !stream_trigger_map.contains_key(&stream_name);
|
||||
|
||||
let local_trigger = super::add_valid_record_arrow(
|
||||
let local_trigger = match super::add_valid_record(
|
||||
&StreamMeta {
|
||||
org_id: org_id.to_string(),
|
||||
stream_name: stream_name.clone(),
|
||||
|
@ -271,7 +272,23 @@ pub async fn ingest(
|
|||
local_val,
|
||||
need_trigger,
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
bulk_res.errors = true;
|
||||
add_record_status(
|
||||
stream_name.clone(),
|
||||
doc_id.clone(),
|
||||
action.clone(),
|
||||
value,
|
||||
&mut bulk_res,
|
||||
Some(TS_PARSE_FAILED.to_string()),
|
||||
Some(e.to_string()),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if local_trigger.is_some() {
|
||||
stream_trigger_map.insert(stream_name.clone(), local_trigger);
|
||||
}
|
||||
|
@ -330,7 +347,6 @@ pub async fn ingest(
|
|||
stream_data.data,
|
||||
thread_id,
|
||||
&StreamParams::new(org_id, &stream_name, StreamType::Logs),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
req_stats.response_time += time;
|
||||
|
|
|
@ -17,7 +17,6 @@ use std::io::{BufRead, Read};
|
|||
|
||||
use actix_web::http;
|
||||
use ahash::AHashMap;
|
||||
use bytes::Bytes;
|
||||
use chrono::{Duration, Utc};
|
||||
use config::{meta::stream::StreamType, metrics, CONFIG, DISTINCT_FIELDS};
|
||||
use datafusion::arrow::datatypes::Schema;
|
||||
|
@ -43,6 +42,7 @@ use crate::{
|
|||
distinct_values, get_formatted_stream_name,
|
||||
ingestion::{evaluate_trigger, is_ingestion_allowed, write_file, TriggerAlertData},
|
||||
logs::StreamMeta,
|
||||
schema::get_upto_discard_error,
|
||||
usage::report_request_usage_stats,
|
||||
},
|
||||
};
|
||||
|
@ -54,7 +54,7 @@ pub async fn ingest(
|
|||
thread_id: usize,
|
||||
) -> Result<IngestionResponse, anyhow::Error> {
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
// check stream
|
||||
let mut stream_schema_map: AHashMap<String, Schema> = AHashMap::new();
|
||||
let mut stream_params = StreamParams::new(org_id, in_stream_name, StreamType::Logs);
|
||||
let stream_name = &get_formatted_stream_name(&mut stream_params, &mut stream_schema_map).await;
|
||||
|
@ -62,20 +62,20 @@ pub async fn ingest(
|
|||
return Err(value);
|
||||
}
|
||||
|
||||
let mut min_ts =
|
||||
// check memtable
|
||||
if let Err(e) = ingester::check_memtable_size() {
|
||||
return Ok(IngestionResponse {
|
||||
code: http::StatusCode::SERVICE_UNAVAILABLE.into(),
|
||||
status: vec![],
|
||||
error: Some(e.to_string()),
|
||||
});
|
||||
}
|
||||
|
||||
let min_ts =
|
||||
(Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto)).timestamp_micros();
|
||||
|
||||
let mut runtime = crate::service::ingestion::init_functions_runtime();
|
||||
|
||||
let mut stream_alerts_map: AHashMap<String, Vec<Alert>> = AHashMap::new();
|
||||
let mut stream_status = StreamStatus::new(stream_name);
|
||||
let mut distinct_values = Vec::with_capacity(16);
|
||||
let mut trigger: TriggerAlertData = None;
|
||||
|
||||
let multi_req: &Bytes;
|
||||
let reader: Vec<json::Value>;
|
||||
|
||||
// Start Register Transforms for stream
|
||||
let mut runtime = crate::service::ingestion::init_functions_runtime();
|
||||
let (local_trans, stream_vrl_map) = crate::service::ingestion::register_stream_transforms(
|
||||
org_id,
|
||||
StreamType::Logs,
|
||||
|
@ -84,6 +84,7 @@ pub async fn ingest(
|
|||
// End Register Transforms for stream
|
||||
|
||||
// Start get stream alerts
|
||||
let mut stream_alerts_map: AHashMap<String, Vec<Alert>> = AHashMap::new();
|
||||
crate::service::ingestion::get_stream_alerts(
|
||||
org_id,
|
||||
StreamType::Logs,
|
||||
|
@ -91,113 +92,113 @@ pub async fn ingest(
|
|||
&mut stream_alerts_map,
|
||||
)
|
||||
.await;
|
||||
// End get stream alert
|
||||
// End get stream alerts
|
||||
|
||||
let mut stream_status = StreamStatus::new(stream_name);
|
||||
let mut distinct_values = Vec::with_capacity(16);
|
||||
let mut trigger: TriggerAlertData = None;
|
||||
|
||||
let partition_det =
|
||||
crate::service::ingestion::get_stream_partition_keys(stream_name, &stream_schema_map).await;
|
||||
let partition_keys = partition_det.partition_keys;
|
||||
let partition_time_level = partition_det.partition_time_level;
|
||||
|
||||
let mut buf: AHashMap<String, SchemaRecords> = AHashMap::new();
|
||||
let mut write_buf: AHashMap<String, SchemaRecords> = AHashMap::new();
|
||||
|
||||
let ep: &str;
|
||||
|
||||
let data = match in_req {
|
||||
let json_req: Vec<json::Value>; // to hold json request because of borrow checker
|
||||
let (ep, data) = match in_req {
|
||||
IngestionRequest::JSON(req) => {
|
||||
reader = json::from_slice(req).unwrap_or({
|
||||
json_req = json::from_slice(req).unwrap_or({
|
||||
let val: json::Value = json::from_slice(req)?;
|
||||
vec![val]
|
||||
});
|
||||
ep = "/api/org/ingest/logs/_json";
|
||||
IngestionData::JSON(&reader)
|
||||
}
|
||||
IngestionRequest::GCP(req) => {
|
||||
ep = "/api/org/ingest/logs/_gcs";
|
||||
IngestionData::GCP(req)
|
||||
}
|
||||
IngestionRequest::Multi(req) => {
|
||||
multi_req = req;
|
||||
ep = "/api/org/ingest/logs/_multi";
|
||||
IngestionData::Multi(multi_req)
|
||||
}
|
||||
IngestionRequest::KinesisFH(req) => {
|
||||
ep = "/api/org/ingest/logs/_kinesis";
|
||||
IngestionData::KinesisFH(req)
|
||||
("/api/org/ingest/logs/_json", IngestionData::JSON(&json_req))
|
||||
}
|
||||
IngestionRequest::GCP(req) => ("/api/org/ingest/logs/_gcs", IngestionData::GCP(req)),
|
||||
IngestionRequest::Multi(req) => ("/api/org/ingest/logs/_multi", IngestionData::Multi(req)),
|
||||
IngestionRequest::KinesisFH(req) => (
|
||||
"/api/org/ingest/logs/_kinesis",
|
||||
IngestionData::KinesisFH(req),
|
||||
),
|
||||
};
|
||||
|
||||
for rec in data.iter() {
|
||||
match rec {
|
||||
Ok(item) => {
|
||||
match apply_functions(
|
||||
&item,
|
||||
&local_trans,
|
||||
&stream_vrl_map,
|
||||
stream_name,
|
||||
&mut runtime,
|
||||
) {
|
||||
Ok(mut res) => {
|
||||
let local_val = res.as_object_mut().unwrap();
|
||||
|
||||
match handle_ts(local_val, min_ts) {
|
||||
Ok(t) => min_ts = t,
|
||||
Err(e) => {
|
||||
stream_status.status.failed += 1;
|
||||
stream_status.status.error = e.to_string();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
let local_trigger = super::add_valid_record_arrow(
|
||||
&StreamMeta {
|
||||
org_id: org_id.to_string(),
|
||||
stream_name: stream_name.to_string(),
|
||||
partition_keys: &partition_keys,
|
||||
partition_time_level: &partition_time_level,
|
||||
stream_alerts_map: &stream_alerts_map,
|
||||
},
|
||||
&mut stream_schema_map,
|
||||
&mut stream_status.status,
|
||||
&mut buf,
|
||||
local_val,
|
||||
trigger.is_none(),
|
||||
)
|
||||
.await;
|
||||
if local_trigger.is_some() {
|
||||
trigger = local_trigger;
|
||||
}
|
||||
|
||||
// get distinct_value item
|
||||
for field in DISTINCT_FIELDS.iter() {
|
||||
if let Some(val) = local_val.get(field) {
|
||||
if !val.is_null() {
|
||||
distinct_values.push(distinct_values::DvItem {
|
||||
stream_type: StreamType::Logs,
|
||||
stream_name: stream_name.to_string(),
|
||||
field_name: field.to_string(),
|
||||
field_value: val.as_str().unwrap().to_string(),
|
||||
filter_name: "".to_string(),
|
||||
filter_value: "".to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
stream_status.status.failed += 1;
|
||||
stream_status.status.error = e.to_string();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
}
|
||||
for ret in data.iter() {
|
||||
let item = match ret {
|
||||
Ok(item) => item,
|
||||
Err(e) => {
|
||||
log::error!("Error: {:?}", e);
|
||||
return Err(anyhow::Error::msg("Failed processing"));
|
||||
log::error!("IngestionError: {:?}", e);
|
||||
return Err(anyhow::anyhow!("Failed processing: {:?}", e));
|
||||
}
|
||||
};
|
||||
|
||||
let mut res = match apply_functions(
|
||||
item,
|
||||
&local_trans,
|
||||
&stream_vrl_map,
|
||||
stream_name,
|
||||
&mut runtime,
|
||||
) {
|
||||
Ok(res) => res,
|
||||
Err(e) => {
|
||||
stream_status.status.failed += 1;
|
||||
stream_status.status.error = e.to_string();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let local_val = res.as_object_mut().unwrap();
|
||||
if let Err(e) = handle_timestamp(local_val, min_ts) {
|
||||
stream_status.status.failed += 1;
|
||||
stream_status.status.error = e.to_string();
|
||||
continue;
|
||||
}
|
||||
|
||||
let local_trigger = match super::add_valid_record(
|
||||
&StreamMeta {
|
||||
org_id: org_id.to_string(),
|
||||
stream_name: stream_name.to_string(),
|
||||
partition_keys: &partition_keys,
|
||||
partition_time_level: &partition_time_level,
|
||||
stream_alerts_map: &stream_alerts_map,
|
||||
},
|
||||
&mut stream_schema_map,
|
||||
&mut stream_status.status,
|
||||
&mut write_buf,
|
||||
local_val,
|
||||
trigger.is_none(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
stream_status.status.failed += 1;
|
||||
stream_status.status.error = e.to_string();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if local_trigger.is_some() {
|
||||
trigger = local_trigger;
|
||||
}
|
||||
|
||||
// get distinct_value item
|
||||
for field in DISTINCT_FIELDS.iter() {
|
||||
if let Some(val) = local_val.get(field) {
|
||||
if !val.is_null() {
|
||||
distinct_values.push(distinct_values::DvItem {
|
||||
stream_type: StreamType::Logs,
|
||||
stream_name: stream_name.to_string(),
|
||||
field_name: field.to_string(),
|
||||
field_value: val.as_str().unwrap().to_string(),
|
||||
filter_name: "".to_string(),
|
||||
filter_value: "".to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// write to file
|
||||
let mut req_stats = write_file(buf, thread_id, &stream_params, None).await;
|
||||
let mut req_stats = write_file(write_buf, thread_id, &stream_params).await;
|
||||
|
||||
// send distinct_values
|
||||
if !distinct_values.is_empty() {
|
||||
|
@ -206,7 +207,7 @@ pub async fn ingest(
|
|||
}
|
||||
}
|
||||
|
||||
// only one trigger per request, as it updates etcd
|
||||
// only one trigger per request
|
||||
evaluate_trigger(trigger).await;
|
||||
|
||||
// update ingestion metrics
|
||||
|
@ -229,7 +230,6 @@ pub async fn ingest(
|
|||
StreamType::Logs.to_string().as_str(),
|
||||
])
|
||||
.inc();
|
||||
|
||||
req_stats.response_time = start.elapsed().as_secs_f64();
|
||||
|
||||
// report data usage
|
||||
|
@ -243,6 +243,7 @@ pub async fn ingest(
|
|||
)
|
||||
.await;
|
||||
|
||||
// drop variables
|
||||
drop(runtime);
|
||||
drop(stream_schema_map);
|
||||
drop(stream_vrl_map);
|
||||
|
@ -256,8 +257,8 @@ pub async fn ingest(
|
|||
}
|
||||
|
||||
pub fn apply_functions<'a>(
|
||||
item: &'a json::Value,
|
||||
local_trans: &Vec<StreamTransform>,
|
||||
item: json::Value,
|
||||
local_trans: &[StreamTransform],
|
||||
stream_vrl_map: &'a AHashMap<String, VRLResultResolver>,
|
||||
stream_name: &'a str,
|
||||
runtime: &mut Runtime,
|
||||
|
@ -267,7 +268,7 @@ pub fn apply_functions<'a>(
|
|||
if !local_trans.is_empty() {
|
||||
value = crate::service::ingestion::apply_stream_transform(
|
||||
local_trans,
|
||||
&value,
|
||||
value,
|
||||
stream_vrl_map,
|
||||
stream_name,
|
||||
runtime,
|
||||
|
@ -281,10 +282,10 @@ pub fn apply_functions<'a>(
}
}

pub fn handle_ts(
pub fn handle_timestamp(
local_val: &mut json::Map<String, json::Value>,
mut min_ts: i64,
) -> Result<i64, anyhow::Error> {
min_ts: i64,
) -> Result<(), anyhow::Error> {
// handle timestamp
let timestamp = match local_val.get(&CONFIG.common.column_timestamp) {
Some(v) => match parse_timestamp_micro_from_value(v) {

@ -294,18 +295,14 @@ pub fn handle_ts(
None => Utc::now().timestamp_micros(),
};
// check ingestion time
let earliest_time = Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto);
if timestamp < earliest_time.timestamp_micros() {
return Err(anyhow::Error::msg(super::get_upto_discard_error()));
}
if timestamp < min_ts {
min_ts = timestamp;
return Err(get_upto_discard_error());
}
local_val.insert(
CONFIG.common.column_timestamp.clone(),
json::Value::Number(timestamp.into()),
);
Ok(min_ts)
Ok(())
}
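After this change `handle_timestamp` no longer threads a running minimum back to the caller: `min_ts` is the ingest-allowed cutoff computed once per request, anything older is rejected with `get_upto_discard_error()`, and the `_timestamp` column is normalized in place. A hypothetical caller, with the cutoff computed the way the ingest functions in this diff do (the record type is written out as plain serde_json here for illustration):

```rust
// Illustrative only: `record` is one flattened log record and
// `ingest_allowed_upto` stands in for CONFIG.limit.ingest_allowed_upto.
fn process_record(
    record: &mut serde_json::Map<String, serde_json::Value>,
    ingest_allowed_upto: i64,
) -> Result<(), anyhow::Error> {
    let min_ts = (chrono::Utc::now() - chrono::Duration::hours(ingest_allowed_upto))
        .timestamp_micros();
    // records older than the cutoff are rejected instead of shifting min_ts
    handle_timestamp(record, min_ts)?;
    Ok(())
}
```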
impl<'a> Iterator for IngestionDataIter<'a> {
|
||||
|
|
|
@ -17,10 +17,10 @@ use std::{collections::HashMap, sync::Arc};
|
|||
|
||||
use ahash::AHashMap;
|
||||
use arrow_schema::{DataType, Field};
|
||||
use config::{meta::stream::StreamType, utils::hasher::get_fields_key_xxh3, CONFIG};
|
||||
use config::{meta::stream::StreamType, utils::schema_ext::SchemaExt, CONFIG};
|
||||
use datafusion::arrow::datatypes::Schema;
|
||||
|
||||
use super::ingestion::TriggerAlertData;
|
||||
use super::ingestion::{get_string_value, TriggerAlertData};
|
||||
use crate::{
|
||||
common::{
|
||||
meta::{
|
||||
|
@ -28,15 +28,10 @@ use crate::{
|
|||
ingestion::RecordStatus,
|
||||
stream::{PartitionTimeLevel, SchemaRecords},
|
||||
},
|
||||
utils::{
|
||||
self,
|
||||
json::{Map, Value},
|
||||
},
|
||||
utils::json::{self, Map, Value},
|
||||
},
|
||||
service::{
|
||||
ingestion::{get_value, get_wal_time_key},
|
||||
schema::check_for_schema,
|
||||
stream::unwrap_partition_time_level,
|
||||
ingestion::get_wal_time_key, schema::check_for_schema, stream::unwrap_partition_time_level,
|
||||
},
|
||||
};
|
||||
|
||||
|
@ -49,13 +44,6 @@ pub mod syslog;
|
|||
|
||||
static BULK_OPERATORS: [&str; 3] = ["create", "index", "update"];
|
||||
|
||||
pub(crate) fn get_upto_discard_error() -> String {
|
||||
format!(
|
||||
"Too old data, only last {} hours data can be ingested. Data discarded. You can adjust ingestion max time by setting the environment variable ZO_INGEST_ALLOWED_UPTO=<max_hours>",
|
||||
CONFIG.limit.ingest_allowed_upto
|
||||
)
|
||||
}
|
||||
|
||||
fn parse_bulk_index(v: &Value) -> Option<(String, String, String)> {
|
||||
let local_val = v.as_object().unwrap();
|
||||
for action in BULK_OPERATORS {
|
||||
|
@ -75,339 +63,155 @@ fn parse_bulk_index(v: &Value) -> Option<(String, String, String)> {
|
|||
None
|
||||
}
|
||||
|
||||
pub fn cast_to_type(mut value: Value, delta: Vec<Field>) -> (Option<String>, Option<String>) {
|
||||
let local_map = value.as_object_mut().unwrap();
|
||||
// let mut error_msg = String::new();
|
||||
pub fn cast_to_type(
|
||||
value: &mut Map<String, Value>,
|
||||
delta: Vec<Field>,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let mut parse_error = String::new();
|
||||
for field in delta {
|
||||
let field_map = local_map.get(field.name());
|
||||
if let Some(val) = field_map {
|
||||
if val.is_null() {
|
||||
local_map.insert(field.name().clone(), val.clone());
|
||||
continue;
|
||||
}
|
||||
let local_val = get_value(val);
|
||||
match field.data_type() {
|
||||
DataType::Boolean => {
|
||||
match local_val.parse::<bool>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Int8 => {
|
||||
match local_val.parse::<i8>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Int16 => {
|
||||
match local_val.parse::<i16>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Int32 => {
|
||||
match local_val.parse::<i32>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Int64 => {
|
||||
match local_val.parse::<i64>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::UInt8 => {
|
||||
match local_val.parse::<u8>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::UInt16 => {
|
||||
match local_val.parse::<u16>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::UInt32 => {
|
||||
match local_val.parse::<u32>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::UInt64 => {
|
||||
match local_val.parse::<u64>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Float16 => {
|
||||
match local_val.parse::<f32>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Float32 => {
|
||||
match local_val.parse::<f32>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Float64 => {
|
||||
match local_val.parse::<f64>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Utf8 => {
|
||||
match local_val.parse::<String>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
_ => println!("{local_val:?}"),
|
||||
};
|
||||
let field_name = field.name().clone();
|
||||
let Some(val) = value.get(&field_name) else {
|
||||
continue;
|
||||
};
|
||||
if val.is_null() {
|
||||
value.insert(field_name, Value::Null);
|
||||
continue;
|
||||
}
|
||||
match field.data_type() {
|
||||
DataType::Utf8 => {
|
||||
if val.is_string() {
|
||||
continue;
|
||||
}
|
||||
value.insert(field_name, Value::String(get_string_value(val)));
|
||||
}
|
||||
DataType::Int64 | DataType::Int32 | DataType::Int16 | DataType::Int8 => {
|
||||
if val.is_i64() {
|
||||
continue;
|
||||
}
|
||||
let val = get_string_value(val);
|
||||
match val.parse::<i64>() {
|
||||
Ok(val) => {
|
||||
value.insert(field_name, Value::Number(val.into()));
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::UInt64 | DataType::UInt32 | DataType::UInt16 | DataType::UInt8 => {
|
||||
if val.is_u64() {
|
||||
continue;
|
||||
}
|
||||
let val = get_string_value(val);
|
||||
match val.parse::<u64>() {
|
||||
Ok(val) => {
|
||||
value.insert(field_name, Value::Number(val.into()));
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Float64 | DataType::Float32 | DataType::Float16 => {
|
||||
if val.is_f64() {
|
||||
continue;
|
||||
}
|
||||
let val = get_string_value(val);
|
||||
match val.parse::<f64>() {
|
||||
Ok(val) => {
|
||||
value.insert(
|
||||
field_name,
|
||||
Value::Number(serde_json::Number::from_f64(val).unwrap()),
|
||||
);
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Boolean => {
|
||||
if val.is_boolean() {
|
||||
continue;
|
||||
}
|
||||
let val = get_string_value(val);
|
||||
match val.parse::<bool>() {
|
||||
Ok(val) => {
|
||||
value.insert(field_name, Value::Bool(val));
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
_ => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
if parse_error.is_empty() {
|
||||
(Some(utils::json::to_string(&local_map).unwrap()), None)
|
||||
if !parse_error.is_empty() {
|
||||
Err(anyhow::Error::msg(parse_error))
|
||||
} else {
|
||||
(None, Some(parse_error))
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cast_to_type_arrow(mut value: Value, delta: Vec<Field>) -> (Option<String>, Option<String>) {
|
||||
let local_map = value.as_object_mut().unwrap();
|
||||
// let mut error_msg = String::new();
|
||||
let mut parse_error = String::new();
|
||||
for field in delta {
|
||||
let field_map = local_map.get(field.name());
|
||||
if let Some(val) = field_map {
|
||||
if val.is_null() {
|
||||
local_map.insert(field.name().clone(), val.clone());
|
||||
continue;
|
||||
}
|
||||
let local_val = get_value(val);
|
||||
match field.data_type() {
|
||||
DataType::Boolean => {
|
||||
match local_val.parse::<bool>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Int8 => {
|
||||
match local_val.parse::<i8>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Int16 => {
|
||||
match local_val.parse::<i16>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Int32 => {
|
||||
match local_val.parse::<i32>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Int64 => {
|
||||
match local_val.parse::<i64>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::UInt8 => {
|
||||
match local_val.parse::<u8>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::UInt16 => {
|
||||
match local_val.parse::<u16>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::UInt32 => {
|
||||
match local_val.parse::<u32>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::UInt64 => {
|
||||
match local_val.parse::<u64>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Float16 => {
|
||||
match local_val.parse::<f32>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Float32 => {
|
||||
match local_val.parse::<f32>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Float64 => {
|
||||
match local_val.parse::<f64>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
DataType::Utf8 => {
|
||||
match local_val.parse::<String>() {
|
||||
Ok(val) => {
|
||||
local_map.insert(field.name().clone(), val.into());
|
||||
}
|
||||
Err(_) => set_parsing_error(&mut parse_error, &field),
|
||||
};
|
||||
}
|
||||
_ => println!("{local_val:?}"),
|
||||
};
|
||||
}
|
||||
}
|
||||
if parse_error.is_empty() {
|
||||
// Convert the Map to a Vec of (String, Value) pairs
|
||||
let mut entries: Vec<_> = local_map.clone().into_iter().collect();
|
||||
entries.sort_by(|a, b| a.0.cmp(&b.0));
|
||||
|
||||
// Convert it back to a Map
|
||||
let sorted_map: Map<String, Value> = entries.into_iter().collect();
|
||||
(Some(utils::json::to_string(&sorted_map).unwrap()), None)
|
||||
} else {
|
||||
(None, Some(parse_error))
|
||||
}
|
||||
}
|
||||
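The function above re-parses each value of the record against the Arrow type recorded in the schema delta and accumulates a parse-error string instead of failing fast. A condensed sketch of that loop, assuming only `serde_json` and `arrow_schema`; the helper name `cast_fields` and the subset of types handled are illustrative, not the project's API:

use arrow_schema::{DataType, Field};
use serde_json::{Map, Value};

// Re-parse each delta field's value into its target type; collect failures.
fn cast_fields(map: &mut Map<String, Value>, delta: &[Field]) -> Result<(), String> {
    let mut parse_error = String::new();
    for field in delta {
        let Some(val) = map.get(field.name()) else { continue };
        if val.is_null() {
            continue; // nulls are passed through unchanged
        }
        // Stringify the current value, then parse it into the target type.
        let s = match val {
            Value::String(s) => s.clone(),
            other => other.to_string(),
        };
        let casted = match field.data_type() {
            DataType::Boolean => s.parse::<bool>().ok().map(Value::from),
            DataType::Int64 => s.parse::<i64>().ok().map(Value::from),
            DataType::UInt64 => s.parse::<u64>().ok().map(Value::from),
            DataType::Float64 => s.parse::<f64>().ok().map(Value::from),
            DataType::Utf8 => Some(Value::String(s)),
            _ => Some(val.clone()), // other types left untouched in this sketch
        };
        match casted {
            Some(v) => {
                map.insert(field.name().clone(), v);
            }
            None => parse_error.push_str(&format!(
                "failed to cast field [{}] to type [{:?}]; ",
                field.name(),
                field.data_type()
            )),
        }
    }
    if parse_error.is_empty() { Ok(()) } else { Err(parse_error) }
}

The real function also covers the narrower integer widths and Float16, and hands the combined error string back to the caller as the record's failure reason.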
|
||||
async fn add_valid_record_arrow(
|
||||
async fn add_valid_record(
|
||||
stream_meta: &StreamMeta<'_>,
|
||||
stream_schema_map: &mut AHashMap<String, Schema>,
|
||||
status: &mut RecordStatus,
|
||||
buf: &mut AHashMap<String, SchemaRecords>,
|
||||
local_val: &mut Map<String, Value>,
|
||||
write_buf: &mut AHashMap<String, SchemaRecords>,
|
||||
record_val: &mut Map<String, Value>,
|
||||
need_trigger: bool,
|
||||
) -> TriggerAlertData {
|
||||
) -> Result<TriggerAlertData, anyhow::Error> {
|
||||
let mut trigger: Vec<(Alert, Vec<Map<String, Value>>)> = Vec::new();
|
||||
let timestamp: i64 = local_val
|
||||
let timestamp: i64 = record_val
|
||||
.get(&CONFIG.common.column_timestamp)
|
||||
.unwrap()
|
||||
.as_i64()
|
||||
.unwrap();
|
||||
|
||||
let mut value_str = utils::json::to_string(&local_val).unwrap();
|
||||
// check schema
|
||||
let schema_evolution = check_for_schema(
|
||||
&stream_meta.org_id,
|
||||
&stream_meta.stream_name,
|
||||
StreamType::Logs,
|
||||
&value_str,
|
||||
stream_schema_map,
|
||||
&Value::Object(record_val.clone()),
|
||||
timestamp,
|
||||
true,
|
||||
)
|
||||
.await;
|
||||
.await?;
|
||||
|
||||
// get hour key
|
||||
let schema_key = get_fields_key_xxh3(&schema_evolution.schema_fields);
|
||||
let rec_schema = stream_schema_map.get(&stream_meta.stream_name).unwrap();
|
||||
let schema_key = rec_schema.hash_key();
|
||||
let hour_key = get_wal_time_key(
|
||||
timestamp,
|
||||
stream_meta.partition_keys,
|
||||
unwrap_partition_time_level(*stream_meta.partition_time_level, StreamType::Logs),
|
||||
local_val,
|
||||
record_val,
|
||||
Some(&schema_key),
|
||||
);
|
||||
|
||||
let rec_schema = stream_schema_map.get(&stream_meta.stream_name).unwrap();
|
||||
|
||||
if schema_evolution.schema_compatible {
|
||||
let valid_record = if schema_evolution.types_delta.is_some() {
|
||||
let delta = schema_evolution.types_delta.unwrap();
|
||||
let loc_value: Value = utils::json::from_slice(value_str.as_bytes()).unwrap();
|
||||
let (ret_val, error) = if !CONFIG.common.widening_schema_evolution {
|
||||
cast_to_type_arrow(loc_value, delta)
|
||||
let ret_val = if !CONFIG.common.widening_schema_evolution {
|
||||
cast_to_type(record_val, delta)
|
||||
} else if schema_evolution.is_schema_changed {
|
||||
let local_delta = delta
|
||||
.into_iter()
|
||||
.filter(|x| x.metadata().contains_key("zo_cast"))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
if local_delta.is_empty() {
|
||||
(Some(value_str.clone()), None)
|
||||
if !local_delta.is_empty() {
|
||||
cast_to_type(record_val, local_delta)
|
||||
} else {
|
||||
cast_to_type_arrow(loc_value, local_delta)
|
||||
Ok(())
|
||||
}
|
||||
} else {
|
||||
cast_to_type_arrow(loc_value, delta)
|
||||
cast_to_type(record_val, delta)
|
||||
};
|
||||
if ret_val.is_some() {
|
||||
value_str = ret_val.unwrap();
|
||||
true
|
||||
} else {
|
||||
status.failed += 1;
|
||||
status.error = error.unwrap();
|
||||
false
|
||||
match ret_val {
|
||||
Ok(_) => true,
|
||||
Err(e) => {
|
||||
status.failed += 1;
|
||||
status.error = e.to_string();
|
||||
false
|
||||
}
|
||||
}
|
||||
} else {
|
||||
true
|
||||
};
|
||||
|
||||
if valid_record {
|
||||
if need_trigger && !stream_meta.stream_alerts_map.is_empty() {
|
||||
// Start check for alert trigger
|
||||
|
@ -419,17 +223,16 @@ async fn add_valid_record_arrow(
|
|||
);
|
||||
if let Some(alerts) = stream_meta.stream_alerts_map.get(&key) {
|
||||
for alert in alerts {
|
||||
if let Ok(Some(v)) = alert.evaluate(Some(local_val)).await {
|
||||
if let Ok(Some(v)) = alert.evaluate(Some(record_val)).await {
|
||||
trigger.push((alert.clone(), v));
|
||||
}
|
||||
}
|
||||
}
|
||||
// End check for alert trigger
|
||||
}
|
||||
let loc_value: Value = utils::json::from_slice(value_str.as_bytes()).unwrap();
|
||||
let hour_buf = buf.entry(hour_key).or_insert_with(|| {
|
||||
let schema_key = get_fields_key_xxh3(&schema_evolution.schema_fields);
|
||||
let hour_buf = write_buf.entry(hour_key).or_insert_with(|| {
|
||||
let schema = Arc::new(rec_schema.clone().with_metadata(HashMap::new()));
|
||||
let schema_key = schema.hash_key();
|
||||
SchemaRecords {
|
||||
schema_key,
|
||||
schema,
|
||||
|
@ -437,17 +240,19 @@ async fn add_valid_record_arrow(
|
|||
records_size: 0,
|
||||
}
|
||||
});
|
||||
hour_buf.records.push(Arc::new(loc_value));
|
||||
hour_buf.records_size += value_str.len();
|
||||
let record_val = Value::Object(record_val.clone());
|
||||
let record_size = json::to_vec(&record_val).unwrap_or_default().len();
|
||||
hour_buf.records.push(Arc::new(record_val));
|
||||
hour_buf.records_size += record_size;
|
||||
status.successful += 1;
|
||||
};
|
||||
} else {
|
||||
status.failed += 1;
|
||||
}
|
||||
if trigger.is_empty() {
|
||||
None
|
||||
Ok(None)
|
||||
} else {
|
||||
Some(trigger)
|
||||
Ok(Some(trigger))
|
||||
}
|
||||
}
|
||||
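One visible effect of the rewrite above is that records are buffered as `Arc<serde_json::Value>` and accounted for by their serialized size, rather than via an intermediate JSON string. A minimal sketch of that bookkeeping; the `SchemaRecords` struct here is a stripped-down stand-in for the project's type:

use std::sync::Arc;
use serde_json::{json, Value};

struct SchemaRecords {
    records: Vec<Arc<Value>>,
    records_size: usize,
}

// Size the record by its serialized length, then buffer it behind an Arc.
fn push_record(hour_buf: &mut SchemaRecords, record_val: Value) {
    let record_size = serde_json::to_vec(&record_val).unwrap_or_default().len();
    hour_buf.records.push(Arc::new(record_val));
    hour_buf.records_size += record_size;
}

fn main() {
    let mut buf = SchemaRecords { records: vec![], records_size: 0 };
    push_record(&mut buf, json!({"_timestamp": 1_700_000_000_000_000i64, "msg": "ok"}));
    assert_eq!(buf.records.len(), 1);
    assert!(buf.records_size > 0);
}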
|
||||
|
@ -483,8 +288,7 @@ mod tests {
|
|||
let mut local_val = Map::new();
|
||||
local_val.insert("test".to_string(), Value::from("test13212"));
|
||||
let delta = vec![Field::new("test", DataType::Utf8, true)];
|
||||
let (ret_val, error) = cast_to_type(Value::from(local_val), delta);
|
||||
assert!(ret_val.is_some());
|
||||
assert!(error.is_none());
|
||||
let ret_val = cast_to_type(&mut local_val, delta);
|
||||
assert!(ret_val.is_ok());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -35,6 +35,7 @@ use crate::{
|
|||
distinct_values, get_formatted_stream_name,
|
||||
ingestion::{evaluate_trigger, is_ingestion_allowed, write_file, TriggerAlertData},
|
||||
logs::StreamMeta,
|
||||
schema::get_upto_discard_error,
|
||||
usage::report_request_usage_stats,
|
||||
},
|
||||
};
|
||||
|
@ -45,8 +46,7 @@ use crate::{
|
|||
/// - org_id: org id to ingest data in
|
||||
/// - in_stream_name: stream to write data in
|
||||
/// - body: incoming payload
|
||||
/// - extend_json: a hashmap of string -> string values which should be extended
|
||||
/// in each json row
|
||||
/// - extend_json: a hashmap of string -> string values which should be extended in each json row
|
||||
/// - thread_id: a unique thread-id associated with this process
|
||||
pub async fn ingest_with_keys(
|
||||
org_id: &str,
|
||||
|
@ -77,7 +77,7 @@ async fn ingest_inner(
|
|||
}
|
||||
let mut runtime = crate::service::ingestion::init_functions_runtime();
|
||||
|
||||
let mut min_ts =
|
||||
let min_ts =
|
||||
(Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto)).timestamp_micros();
|
||||
|
||||
let mut stream_alerts_map: AHashMap<String, Vec<Alert>> = AHashMap::new();
|
||||
|
@ -122,13 +122,13 @@ async fn ingest_inner(
|
|||
}
|
||||
|
||||
// JSON Flattening
|
||||
value = flatten::flatten(&value)?;
|
||||
value = flatten::flatten(value)?;
|
||||
// Start row based transform
|
||||
|
||||
if !local_trans.is_empty() {
|
||||
value = crate::service::ingestion::apply_stream_transform(
|
||||
&local_trans,
|
||||
&value,
|
||||
value,
|
||||
&stream_vrl_map,
|
||||
stream_name,
|
||||
&mut runtime,
|
||||
|
@ -157,14 +157,10 @@ async fn ingest_inner(
|
|||
None => Utc::now().timestamp_micros(),
|
||||
};
|
||||
// check ingestion time
|
||||
let earliest_time = Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto);
|
||||
if timestamp < earliest_time.timestamp_micros() {
|
||||
stream_status.status.failed += 1; // to old data, just discard
|
||||
stream_status.status.error = super::get_upto_discard_error();
|
||||
continue;
|
||||
}
|
||||
if timestamp < min_ts {
|
||||
min_ts = timestamp;
|
||||
stream_status.status.failed += 1; // data too old, just discard

|
||||
stream_status.status.error = get_upto_discard_error().to_string();
|
||||
continue;
|
||||
}
|
||||
local_val.insert(
|
||||
CONFIG.common.column_timestamp.clone(),
|
||||
|
@ -172,7 +168,7 @@ async fn ingest_inner(
|
|||
);
|
||||
|
||||
// write data
|
||||
let local_trigger = super::add_valid_record_arrow(
|
||||
let local_trigger = match super::add_valid_record(
|
||||
&StreamMeta {
|
||||
org_id: org_id.to_string(),
|
||||
stream_name: stream_name.to_string(),
|
||||
|
@ -186,7 +182,15 @@ async fn ingest_inner(
|
|||
local_val,
|
||||
trigger.is_none(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
stream_status.status.failed += 1;
|
||||
stream_status.status.error = e.to_string();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if local_trigger.is_some() {
|
||||
trigger = local_trigger;
|
||||
}
|
||||
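Because add_valid_record now returns a `Result`, each ingestion loop folds a per-record failure into the stream-status counters and moves on instead of aborting the whole batch. A self-contained sketch of that call-site pattern, using `anyhow`; `write_record` and `RecordStatus` are illustrative stand-ins for the project's function and struct:

#[derive(Default)]
struct RecordStatus {
    successful: u32,
    failed: u32,
    error: String,
}

fn write_record(ok: bool) -> Result<Option<&'static str>, anyhow::Error> {
    if ok { Ok(Some("trigger")) } else { Err(anyhow::anyhow!("cast failed")) }
}

fn main() {
    let mut status = RecordStatus::default();
    let mut trigger = None;
    for ok in [true, false, true] {
        // A failing record bumps the counters and the loop continues.
        let local_trigger = match write_record(ok) {
            Ok(v) => v,
            Err(e) => {
                status.failed += 1;
                status.error = e.to_string();
                continue;
            }
        };
        if local_trigger.is_some() {
            trigger = local_trigger;
        }
        status.successful += 1;
    }
    assert_eq!(status.failed, 1);
    assert!(trigger.is_some());
}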
|
@ -209,7 +213,7 @@ async fn ingest_inner(
|
|||
}
|
||||
|
||||
// write to file
|
||||
let mut req_stats = write_file(buf, thread_id, &stream_params, partition_time_level).await;
|
||||
let mut req_stats = write_file(buf, thread_id, &stream_params).await;
|
||||
|
||||
// only one trigger per request, as it updates etcd
|
||||
evaluate_trigger(trigger).await;
|
||||
|
|
|
@ -45,7 +45,7 @@ use crate::{
|
|||
grpc::{get_val, get_val_with_type_retained},
|
||||
write_file, TriggerAlertData,
|
||||
},
|
||||
schema::stream_schema_exists,
|
||||
schema::{get_upto_discard_error, stream_schema_exists},
|
||||
usage::report_request_usage_stats,
|
||||
},
|
||||
};
|
||||
|
@ -77,7 +77,7 @@ pub async fn usage_ingest(
|
|||
return Err(anyhow::anyhow!("stream [{stream_name}] is being deleted"));
|
||||
}
|
||||
|
||||
let mut min_ts =
|
||||
let min_ts =
|
||||
(Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto)).timestamp_micros();
|
||||
|
||||
let mut stream_alerts_map: AHashMap<String, Vec<Alert>> = AHashMap::new();
|
||||
|
@ -102,7 +102,7 @@ pub async fn usage_ingest(
|
|||
|
||||
let mut buf: AHashMap<String, SchemaRecords> = AHashMap::new();
|
||||
let reader: Vec<json::Value> = json::from_slice(&body)?;
|
||||
for item in reader.iter() {
|
||||
for item in reader.into_iter() {
|
||||
// JSON Flattening
|
||||
let mut value = flatten::flatten(item)?;
|
||||
|
||||
|
@ -122,21 +122,17 @@ pub async fn usage_ingest(
|
|||
None => Utc::now().timestamp_micros(),
|
||||
};
|
||||
// check ingestion time
|
||||
let earlest_time = Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto);
|
||||
if timestamp < earlest_time.timestamp_micros() {
|
||||
stream_status.status.failed += 1; // to old data, just discard
|
||||
stream_status.status.error = super::get_upto_discard_error();
|
||||
continue;
|
||||
}
|
||||
if timestamp < min_ts {
|
||||
min_ts = timestamp;
|
||||
stream_status.status.failed += 1; // data too old, just discard
|
||||
stream_status.status.error = get_upto_discard_error().to_string();
|
||||
continue;
|
||||
}
|
||||
local_val.insert(
|
||||
CONFIG.common.column_timestamp.clone(),
|
||||
json::Value::Number(timestamp.into()),
|
||||
);
|
||||
|
||||
let local_trigger = super::add_valid_record_arrow(
|
||||
let local_trigger = match super::add_valid_record(
|
||||
&StreamMeta {
|
||||
org_id: org_id.to_string(),
|
||||
stream_name: stream_name.to_string(),
|
||||
|
@ -150,7 +146,15 @@ pub async fn usage_ingest(
|
|||
local_val,
|
||||
trigger.is_none(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
stream_status.status.failed += 1;
|
||||
stream_status.status.error = e.to_string();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if local_trigger.is_some() {
|
||||
trigger = local_trigger;
|
||||
}
|
||||
|
@ -177,7 +181,6 @@ pub async fn usage_ingest(
|
|||
buf,
|
||||
thread_id,
|
||||
&StreamParams::new(org_id, stream_name, StreamType::Logs),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
|
@ -239,6 +242,17 @@ pub async fn handle_grpc_request(
|
|||
"Quota exceeded for this organization".to_string(),
|
||||
)));
|
||||
}
|
||||
|
||||
// check memtable
|
||||
if let Err(e) = ingester::check_memtable_size() {
|
||||
return Ok(
|
||||
HttpResponse::ServiceUnavailable().json(MetaHttpResponse::error(
|
||||
http::StatusCode::SERVICE_UNAVAILABLE.into(),
|
||||
e.to_string(),
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
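The new memtable guard is a plain back-pressure check: if the ingester reports that its in-memory table has no room, the handler returns 503 so the client retries after a flush. A hedged sketch with the ingester call modeled as a simple function; the 64 MB limit and the tuple error type are made up for the example:

// Stand-in for ingester::check_memtable_size.
fn check_memtable_size(current_bytes: usize, max_bytes: usize) -> Result<(), String> {
    if current_bytes >= max_bytes {
        Err(format!("memtable full: {current_bytes} bytes used, limit {max_bytes} bytes"))
    } else {
        Ok(())
    }
}

fn handle_request(current_bytes: usize) -> Result<&'static str, (u16, String)> {
    // 503 tells the client to retry once the memtable has been flushed.
    if let Err(e) = check_memtable_size(current_bytes, 64 * 1024 * 1024) {
        return Err((503, e));
    }
    Ok("accepted")
}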
let start = std::time::Instant::now();
|
||||
let mut stream_schema_map: AHashMap<String, Schema> = AHashMap::new();
|
||||
let stream_name = match in_stream_name {
|
||||
|
@ -332,7 +346,7 @@ pub async fn handle_grpc_request(
|
|||
|
||||
if ts < earlest_time.timestamp_micros().try_into().unwrap() {
|
||||
stream_status.status.failed += 1; // data too old, just discard
|
||||
stream_status.status.error = super::get_upto_discard_error();
|
||||
stream_status.status.error = get_upto_discard_error().to_string();
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -373,12 +387,12 @@ pub async fn handle_grpc_request(
|
|||
};
|
||||
|
||||
// flattening
|
||||
rec = flatten::flatten(&rec)?;
|
||||
rec = flatten::flatten(rec)?;
|
||||
|
||||
if !local_trans.is_empty() {
|
||||
rec = crate::service::ingestion::apply_stream_transform(
|
||||
&local_trans,
|
||||
&rec,
|
||||
rec,
|
||||
&stream_vrl_map,
|
||||
stream_name,
|
||||
&mut runtime,
|
||||
|
@ -387,7 +401,7 @@ pub async fn handle_grpc_request(
|
|||
// get json object
|
||||
let local_val = rec.as_object_mut().unwrap();
|
||||
|
||||
let local_trigger = super::add_valid_record_arrow(
|
||||
let local_trigger = match super::add_valid_record(
|
||||
&StreamMeta {
|
||||
org_id: org_id.to_string(),
|
||||
stream_name: stream_name.to_string(),
|
||||
|
@ -401,7 +415,15 @@ pub async fn handle_grpc_request(
|
|||
local_val,
|
||||
trigger.is_none(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
stream_status.status.failed += 1;
|
||||
stream_status.status.error = e.to_string();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if local_trigger.is_some() {
|
||||
trigger = local_trigger;
|
||||
}
|
||||
|
@ -430,7 +452,6 @@ pub async fn handle_grpc_request(
|
|||
data_buf,
|
||||
thread_id,
|
||||
&StreamParams::new(org_id, stream_name, StreamType::Logs),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
|
|
|
@ -42,11 +42,9 @@ use crate::{
|
|||
service::{
|
||||
db, distinct_values, get_formatted_stream_name,
|
||||
ingestion::{
|
||||
evaluate_trigger,
|
||||
otlp_json::{get_int_value, get_val_for_attr},
|
||||
write_file, TriggerAlertData,
|
||||
evaluate_trigger, get_int_value, get_val_for_attr, write_file, TriggerAlertData,
|
||||
},
|
||||
schema::stream_schema_exists,
|
||||
schema::{get_upto_discard_error, stream_schema_exists},
|
||||
usage::report_request_usage_stats,
|
||||
},
|
||||
};
|
||||
|
@ -101,6 +99,16 @@ pub async fn logs_json_handler(
|
|||
)));
|
||||
}
|
||||
|
||||
// check memtable
|
||||
if let Err(e) = ingester::check_memtable_size() {
|
||||
return Ok(
|
||||
HttpResponse::ServiceUnavailable().json(MetaHttpResponse::error(
|
||||
http::StatusCode::SERVICE_UNAVAILABLE.into(),
|
||||
e.to_string(),
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
let mut stream_schema_map: AHashMap<String, Schema> = AHashMap::new();
|
||||
let stream_name = match in_stream_name {
|
||||
|
@ -127,7 +135,7 @@ pub async fn logs_json_handler(
|
|||
let mut stream_status = StreamStatus::new(stream_name);
|
||||
let mut trigger: TriggerAlertData = None;
|
||||
|
||||
let mut min_ts =
|
||||
let min_ts =
|
||||
(Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto)).timestamp_micros();
|
||||
|
||||
let partition_det =
|
||||
|
@ -262,11 +270,9 @@ pub async fn logs_json_handler(
|
|||
let attributes = log.get("attributes").unwrap().as_array().unwrap();
|
||||
for res_attr in attributes {
|
||||
let local_attr = res_attr.as_object().unwrap();
|
||||
|
||||
local_val.insert(
|
||||
flatten::format_key(local_attr.get("key").unwrap().as_str().unwrap()),
|
||||
get_val_for_attr(local_attr.get("value").unwrap()),
|
||||
);
|
||||
let mut key = local_attr.get("key").unwrap().as_str().unwrap().to_string();
|
||||
flatten::format_key(&mut key);
|
||||
local_val.insert(key, get_val_for_attr(local_attr.get("value").unwrap()));
|
||||
}
|
||||
}
|
||||
// remove attributes after adding
|
||||
|
@ -306,14 +312,10 @@ pub async fn logs_json_handler(
|
|||
}
|
||||
|
||||
// check ingestion time
|
||||
let earliest_time = Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto);
|
||||
if timestamp < earliest_time.timestamp_micros() {
|
||||
stream_status.status.failed += 1; // to old data, just discard
|
||||
stream_status.status.error = super::get_upto_discard_error();
|
||||
continue;
|
||||
}
|
||||
if timestamp < min_ts {
|
||||
min_ts = timestamp;
|
||||
stream_status.status.failed += 1; // data too old, just discard
|
||||
stream_status.status.error = get_upto_discard_error().to_string();
|
||||
continue;
|
||||
}
|
||||
|
||||
local_val.insert(
|
||||
|
@ -323,25 +325,25 @@ pub async fn logs_json_handler(
|
|||
|
||||
local_val.append(&mut service_att_map.clone());
|
||||
|
||||
value = json::to_value(local_val).unwrap();
|
||||
value = json::to_value(local_val)?;
|
||||
|
||||
// JSON Flattening
|
||||
value = flatten::flatten(&value).unwrap();
|
||||
value = flatten::flatten(value).unwrap();
|
||||
|
||||
if !local_trans.is_empty() {
|
||||
value = crate::service::ingestion::apply_stream_transform(
|
||||
&local_trans,
|
||||
&value,
|
||||
value,
|
||||
&stream_vrl_map,
|
||||
stream_name,
|
||||
&mut runtime,
|
||||
)
|
||||
.unwrap_or(value);
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
local_val = value.as_object_mut().unwrap();
|
||||
|
||||
let local_trigger = super::add_valid_record_arrow(
|
||||
let local_trigger = match super::add_valid_record(
|
||||
&StreamMeta {
|
||||
org_id: org_id.to_string(),
|
||||
stream_name: stream_name.to_string(),
|
||||
|
@ -355,8 +357,15 @@ pub async fn logs_json_handler(
|
|||
local_val,
|
||||
trigger.is_none(),
|
||||
)
|
||||
.await;
|
||||
|
||||
.await
|
||||
{
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
stream_status.status.failed += 1;
|
||||
stream_status.status.error = e.to_string();
|
||||
continue;
|
||||
}
|
||||
};
|
||||
if local_trigger.is_some() {
|
||||
trigger = local_trigger;
|
||||
}
|
||||
|
@ -385,7 +394,6 @@ pub async fn logs_json_handler(
|
|||
buf,
|
||||
thread_id,
|
||||
&StreamParams::new(org_id, stream_name, StreamType::Logs),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
|
||||
|
|
|
@ -38,6 +38,7 @@ use crate::{
|
|||
service::{
|
||||
db, distinct_values, get_formatted_stream_name,
|
||||
ingestion::{evaluate_trigger, write_file, TriggerAlertData},
|
||||
schema::get_upto_discard_error,
|
||||
},
|
||||
};
|
||||
|
||||
|
@ -118,12 +119,12 @@ pub async fn ingest(msg: &str, addr: SocketAddr) -> Result<HttpResponse, anyhow:
|
|||
|
||||
let parsed_msg = syslog_loose::parse_message(msg);
|
||||
let mut value = message_to_value(parsed_msg);
|
||||
value = flatten::flatten(&value).unwrap();
|
||||
value = flatten::flatten(value).unwrap();
|
||||
|
||||
if !local_trans.is_empty() {
|
||||
value = crate::service::ingestion::apply_stream_transform(
|
||||
&local_trans,
|
||||
&value,
|
||||
value,
|
||||
&stream_vrl_map,
|
||||
stream_name,
|
||||
&mut runtime,
|
||||
|
@ -149,7 +150,7 @@ pub async fn ingest(msg: &str, addr: SocketAddr) -> Result<HttpResponse, anyhow:
|
|||
let earlest_time = Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto);
|
||||
if timestamp < earlest_time.timestamp_micros() {
|
||||
stream_status.status.failed += 1; // data too old, just discard
|
||||
stream_status.status.error = super::get_upto_discard_error();
|
||||
stream_status.status.error = get_upto_discard_error().to_string();
|
||||
}
|
||||
|
||||
local_val.insert(
|
||||
|
@ -157,7 +158,7 @@ pub async fn ingest(msg: &str, addr: SocketAddr) -> Result<HttpResponse, anyhow:
|
|||
json::Value::Number(timestamp.into()),
|
||||
);
|
||||
|
||||
let local_trigger = super::add_valid_record_arrow(
|
||||
let local_trigger = match super::add_valid_record(
|
||||
&StreamMeta {
|
||||
org_id: org_id.to_string(),
|
||||
stream_name: stream_name.to_string(),
|
||||
|
@ -171,7 +172,15 @@ pub async fn ingest(msg: &str, addr: SocketAddr) -> Result<HttpResponse, anyhow:
|
|||
local_val,
|
||||
trigger.is_none(),
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
{
|
||||
Ok(v) => v,
|
||||
Err(e) => {
|
||||
stream_status.status.failed += 1;
|
||||
stream_status.status.error = e.to_string();
|
||||
None
|
||||
}
|
||||
};
|
||||
if local_trigger.is_some() {
|
||||
trigger = local_trigger;
|
||||
}
|
||||
|
@ -192,7 +201,7 @@ pub async fn ingest(msg: &str, addr: SocketAddr) -> Result<HttpResponse, anyhow:
|
|||
}
|
||||
}
|
||||
|
||||
write_file(buf, thread_id, &stream_params, None).await;
|
||||
write_file(buf, thread_id, &stream_params).await;
|
||||
|
||||
// only one trigger per request, as it updates etcd
|
||||
evaluate_trigger(trigger).await;
|
||||
|
|
|
@ -58,6 +58,15 @@ pub async fn ingest(org_id: &str, body: web::Bytes, thread_id: usize) -> Result<
|
|||
return Err(anyhow::anyhow!("Quota exceeded for this organization"));
|
||||
}
|
||||
|
||||
// check memtable
|
||||
if let Err(e) = ingester::check_memtable_size() {
|
||||
return Ok(IngestionResponse {
|
||||
code: http::StatusCode::SERVICE_UNAVAILABLE.into(),
|
||||
status: vec![],
|
||||
error: Some(e.to_string()),
|
||||
});
|
||||
}
|
||||
|
||||
let mut runtime = crate::service::ingestion::init_functions_runtime();
|
||||
let mut stream_schema_map: AHashMap<String, Schema> = AHashMap::new();
|
||||
let mut stream_status_map: AHashMap<String, StreamStatus> = AHashMap::new();
|
||||
|
@ -65,7 +74,7 @@ pub async fn ingest(org_id: &str, body: web::Bytes, thread_id: usize) -> Result<
|
|||
let mut stream_partitioning_map: AHashMap<String, PartitioningDetails> = AHashMap::new();
|
||||
|
||||
let reader: Vec<json::Value> = json::from_slice(&body)?;
|
||||
for record in reader.iter() {
|
||||
for record in reader.into_iter() {
|
||||
// JSON Flattening
|
||||
let mut record = flatten::flatten(record)?;
|
||||
// check data type
|
||||
|
@ -117,8 +126,8 @@ pub async fn ingest(org_id: &str, body: web::Bytes, thread_id: usize) -> Result<
|
|||
}
|
||||
|
||||
// apply functions
|
||||
let mut record = json::Value::Object(record.to_owned());
|
||||
apply_func(&mut runtime, org_id, &stream_name, &mut record)?;
|
||||
let record = json::Value::Object(record.to_owned());
|
||||
let mut record = apply_func(&mut runtime, org_id, &stream_name, record)?;
|
||||
|
||||
let record = record.as_object_mut().unwrap();
|
||||
|
||||
|
@ -263,17 +272,10 @@ pub async fn ingest(org_id: &str, body: web::Bytes, thread_id: usize) -> Result<
|
|||
continue;
|
||||
}
|
||||
|
||||
let time_level = if let Some(details) = stream_partitioning_map.get(&stream_name) {
|
||||
details.partition_time_level
|
||||
} else {
|
||||
Some(CONFIG.limit.metrics_file_retention.as_str().into())
|
||||
};
|
||||
|
||||
let mut req_stats = write_file(
|
||||
stream_data,
|
||||
thread_id,
|
||||
&StreamParams::new(org_id, &stream_name, StreamType::Metrics),
|
||||
time_level,
|
||||
)
|
||||
.await;
|
||||
req_stats.response_time = time;
|
||||
|
@ -319,21 +321,19 @@ fn apply_func(
|
|||
runtime: &mut Runtime,
|
||||
org_id: &str,
|
||||
metric_name: &str,
|
||||
value: &mut json::Value,
|
||||
) -> Result<()> {
|
||||
value: json::Value,
|
||||
) -> Result<json::Value> {
|
||||
let (local_tans, stream_vrl_map) = crate::service::ingestion::register_stream_transforms(
|
||||
org_id,
|
||||
StreamType::Metrics,
|
||||
metric_name,
|
||||
);
|
||||
|
||||
*value = crate::service::ingestion::apply_stream_transform(
|
||||
crate::service::ingestion::apply_stream_transform(
|
||||
&local_tans,
|
||||
value,
|
||||
&stream_vrl_map,
|
||||
metric_name,
|
||||
runtime,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
)
|
||||
}
|
||||
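apply_func (and apply_stream_transform behind it) now takes the JSON record by value and hands the transformed value back, so callers rebind the variable instead of mutating through a `&mut` reference. A small sketch of that style; `apply_transform` is an illustrative stand-in, not the project's function:

use serde_json::{json, Value};

// Move the value in, return the (possibly rewritten) value.
fn apply_transform(mut value: Value) -> anyhow::Result<Value> {
    if let Some(obj) = value.as_object_mut() {
        obj.insert("transformed".to_string(), Value::Bool(true));
    }
    Ok(value)
}

fn main() -> anyhow::Result<()> {
    let mut record = json!({"metric": "cpu", "value": 0.5});
    record = apply_transform(record)?; // rebind instead of mutating in place
    assert_eq!(record["transformed"], Value::Bool(true));
    Ok(())
}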
|
|
|
@ -75,6 +75,17 @@ pub async fn handle_grpc_request(
|
|||
"Quota exceeded for this organisation".to_string(),
|
||||
)));
|
||||
}
|
||||
|
||||
// check memtable
|
||||
if let Err(e) = ingester::check_memtable_size() {
|
||||
return Ok(
|
||||
HttpResponse::ServiceUnavailable().json(MetaHttpResponse::error(
|
||||
http::StatusCode::SERVICE_UNAVAILABLE.into(),
|
||||
e.to_string(),
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
let mut runtime = crate::service::ingestion::init_functions_runtime();
|
||||
let mut metric_data_map: AHashMap<String, AHashMap<String, SchemaRecords>> = AHashMap::new();
|
||||
|
@ -186,7 +197,7 @@ pub async fn handle_grpc_request(
|
|||
|
||||
for mut rec in records {
|
||||
// flattening
|
||||
rec = flatten::flatten(&rec)?;
|
||||
rec = flatten::flatten(rec)?;
|
||||
|
||||
let local_metric_name =
|
||||
&format_stream_name(rec.get(NAME_LABEL).unwrap().as_str().unwrap());
|
||||
|
@ -253,7 +264,7 @@ pub async fn handle_grpc_request(
|
|||
if !local_trans.is_empty() {
|
||||
rec = crate::service::ingestion::apply_stream_transform(
|
||||
&local_trans,
|
||||
&rec,
|
||||
rec,
|
||||
&stream_vrl_map,
|
||||
local_metric_name,
|
||||
&mut runtime,
|
||||
|
@ -353,18 +364,11 @@ pub async fn handle_grpc_request(
|
|||
continue;
|
||||
}
|
||||
|
||||
let time_level = if let Some(details) = stream_partitioning_map.get(&stream_name) {
|
||||
details.partition_time_level
|
||||
} else {
|
||||
Some(CONFIG.limit.metrics_file_retention.as_str().into())
|
||||
};
|
||||
|
||||
// write to file
|
||||
let mut req_stats = write_file(
|
||||
stream_data,
|
||||
thread_id,
|
||||
&StreamParams::new(org_id, &stream_name, StreamType::Metrics),
|
||||
time_level,
|
||||
)
|
||||
.await;
|
||||
|
||||
|
|
|
@ -45,9 +45,8 @@ use crate::{
|
|||
service::{
|
||||
db, format_stream_name,
|
||||
ingestion::{
|
||||
chk_schema_by_record, evaluate_trigger,
|
||||
otlp_json::{get_float_value, get_int_value, get_string_value, get_val_for_attr},
|
||||
write_file, TriggerAlertData,
|
||||
chk_schema_by_record, evaluate_trigger, get_float_value, get_int_value,
|
||||
get_string_value, get_val_for_attr, write_file, TriggerAlertData,
|
||||
},
|
||||
metrics::{format_label_name, get_exclude_labels, otlp_grpc::handle_grpc_request},
|
||||
schema::{set_schema_metadata, stream_schema_exists},
|
||||
|
@ -99,6 +98,16 @@ pub async fn metrics_json_handler(
|
|||
)));
|
||||
}
|
||||
|
||||
// check memtable
|
||||
if let Err(e) = ingester::check_memtable_size() {
|
||||
return Ok(
|
||||
HttpResponse::ServiceUnavailable().json(MetaHttpResponse::error(
|
||||
http::StatusCode::SERVICE_UNAVAILABLE.into(),
|
||||
e.to_string(),
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
let mut runtime = crate::service::ingestion::init_functions_runtime();
|
||||
let mut metric_data_map: AHashMap<String, AHashMap<String, SchemaRecords>> = AHashMap::new();
|
||||
|
@ -281,7 +290,7 @@ pub async fn metrics_json_handler(
|
|||
|
||||
for mut rec in records {
|
||||
// flattening
|
||||
rec = flatten::flatten(&rec).expect("failed to flatten");
|
||||
rec = flatten::flatten(rec).expect("failed to flatten");
|
||||
// get json object
|
||||
|
||||
let local_metric_name =
|
||||
|
@ -349,12 +358,12 @@ pub async fn metrics_json_handler(
|
|||
if !local_trans.is_empty() {
|
||||
rec = crate::service::ingestion::apply_stream_transform(
|
||||
&local_trans,
|
||||
&rec,
|
||||
rec,
|
||||
&stream_vrl_map,
|
||||
local_metric_name,
|
||||
&mut runtime,
|
||||
)
|
||||
.unwrap_or(rec);
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
let val_map: &mut serde_json::Map<String, serde_json::Value> =
|
||||
|
@ -455,18 +464,11 @@ pub async fn metrics_json_handler(
|
|||
continue;
|
||||
}
|
||||
|
||||
let time_level = if let Some(details) = stream_partitioning_map.get(&stream_name) {
|
||||
details.partition_time_level
|
||||
} else {
|
||||
Some(CONFIG.limit.metrics_file_retention.as_str().into())
|
||||
};
|
||||
|
||||
// write to file
|
||||
let mut req_stats = write_file(
|
||||
stream_data,
|
||||
thread_id,
|
||||
&StreamParams::new(org_id, &stream_name, StreamType::Metrics),
|
||||
time_level,
|
||||
)
|
||||
.await;
|
||||
|
||||
|
|
|
@ -17,7 +17,7 @@ use std::{collections::HashMap, sync::Arc};
|
|||
|
||||
use actix_web::web;
|
||||
use ahash::AHashMap;
|
||||
use chrono::{Duration, TimeZone, Utc};
|
||||
use chrono::{TimeZone, Utc};
|
||||
use config::{meta::stream::StreamType, metrics, utils::schema_ext::SchemaExt, FxIndexMap, CONFIG};
|
||||
use datafusion::arrow::datatypes::Schema;
|
||||
use promql_parser::{label::MatchOp, parser};
|
||||
|
@ -70,8 +70,13 @@ pub async fn remote_write(
|
|||
return Err(anyhow::anyhow!("Quota exceeded for this organization"));
|
||||
}
|
||||
|
||||
let mut min_ts =
|
||||
(Utc::now() - Duration::hours(CONFIG.limit.ingest_allowed_upto)).timestamp_micros();
|
||||
// check memtable
|
||||
if let Err(e) = ingester::check_memtable_size() {
|
||||
return Err(anyhow::Error::msg(e.to_string()));
|
||||
}
|
||||
|
||||
// let min_ts = (Utc::now() -
|
||||
// Duration::hours(CONFIG.limit.ingest_allowed_upto)).timestamp_micros();
|
||||
let dedup_enabled = CONFIG.common.metrics_dedup_enabled;
|
||||
let election_interval = CONFIG.limit.metrics_leader_election_interval * 1000000;
|
||||
let mut last_received: i64 = 0;
|
||||
|
@ -169,9 +174,6 @@ pub async fn remote_write(
|
|||
};
|
||||
|
||||
let timestamp = parse_i64_to_timestamp_micros(sample.timestamp);
|
||||
if timestamp < min_ts {
|
||||
min_ts = timestamp;
|
||||
}
|
||||
|
||||
if first_line && dedup_enabled && !cluster_name.is_empty() {
|
||||
let lock = METRIC_CLUSTER_LEADER.read().await;
|
||||
|
@ -261,7 +263,7 @@ pub async fn remote_write(
|
|||
|
||||
value = crate::service::ingestion::apply_stream_transform(
|
||||
&local_trans,
|
||||
&value,
|
||||
value,
|
||||
&stream_vrl_map,
|
||||
&metric_name,
|
||||
&mut runtime,
|
||||
|
@ -357,18 +359,11 @@ pub async fn remote_write(
|
|||
continue;
|
||||
}
|
||||
|
||||
let time_level = if let Some(details) = stream_partitioning_map.get(&stream_name) {
|
||||
details.partition_time_level
|
||||
} else {
|
||||
Some(CONFIG.limit.metrics_file_retention.as_str().into())
|
||||
};
|
||||
|
||||
// write to file
|
||||
let mut req_stats = write_file(
|
||||
stream_data,
|
||||
thread_id,
|
||||
&StreamParams::new(org_id, &stream_name, StreamType::Metrics),
|
||||
time_level,
|
||||
)
|
||||
.await;
|
||||
|
||||
|
|
|
@ -23,7 +23,10 @@ use std::{
|
|||
use ahash::AHashMap;
|
||||
use config::{
|
||||
meta::stream::StreamType,
|
||||
utils::{schema::infer_json_schema, schema_ext::SchemaExt},
|
||||
utils::{
|
||||
schema::{infer_json_schema, infer_json_schema_from_values},
|
||||
schema_ext::SchemaExt,
|
||||
},
|
||||
CONFIG,
|
||||
};
|
||||
use datafusion::arrow::{
|
||||
|
@ -41,6 +44,20 @@ use crate::{
|
|||
service::{db, search::server_internal_error},
|
||||
};
|
||||
|
||||
pub(crate) fn get_upto_discard_error() -> anyhow::Error {
|
||||
anyhow::anyhow!(
|
||||
"Too old data, only last {} hours data can be ingested. Data discarded. You can adjust ingestion max time by setting the environment variable ZO_INGEST_ALLOWED_UPTO=<max_hours>",
|
||||
CONFIG.limit.ingest_allowed_upto
|
||||
)
|
||||
}
|
||||
|
||||
pub(crate) fn get_rquest_columns_limit_error() -> anyhow::Error {
|
||||
anyhow::anyhow!(
|
||||
"Too many cloumns, only {} columns accept. Data discarded. You can adjust ingestion columns limit by setting the environment variable ZO_COLS_PER_RECORD_LIMIT=<max_cloumns>",
|
||||
CONFIG.limit.req_cols_per_record_limit
|
||||
)
|
||||
}
|
||||
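get_upto_discard_error above is what a record receives when its timestamp falls outside the allowed ingestion window, which is derived from the configured hour limit. A minimal sketch of that check, assuming `chrono`; the parameter stands in for CONFIG.limit.ingest_allowed_upto:

use chrono::{Duration, Utc};

/// Returns true when the record is older than the allowed ingestion window.
fn is_too_old(timestamp_micros: i64, ingest_allowed_upto_hours: i64) -> bool {
    let min_ts = (Utc::now() - Duration::hours(ingest_allowed_upto_hours)).timestamp_micros();
    timestamp_micros < min_ts
}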
|
||||
#[tracing::instrument(name = "service:schema:schema_evolution", skip(inferred_schema))]
|
||||
pub async fn schema_evolution(
|
||||
org_id: &str,
|
||||
|
@ -262,11 +279,10 @@ pub async fn check_for_schema(
|
|||
org_id: &str,
|
||||
stream_name: &str,
|
||||
stream_type: StreamType,
|
||||
val_str: &str,
|
||||
stream_schema_map: &mut AHashMap<String, Schema>,
|
||||
record_val: &json::Value,
|
||||
record_ts: i64,
|
||||
is_arrow: bool,
|
||||
) -> SchemaEvolution {
|
||||
) -> Result<SchemaEvolution, anyhow::Error> {
|
||||
let mut schema = if stream_schema_map.contains_key(stream_name) {
|
||||
stream_schema_map.get(stream_name).unwrap().clone()
|
||||
} else {
|
||||
|
@ -278,38 +294,27 @@ pub async fn check_for_schema(
|
|||
};
|
||||
|
||||
if !schema.fields().is_empty() && CONFIG.common.skip_schema_validation {
|
||||
return SchemaEvolution {
|
||||
return Ok(SchemaEvolution {
|
||||
schema_compatible: true,
|
||||
types_delta: None,
|
||||
schema_fields: schema.to_cloned_fields(),
|
||||
is_schema_changed: false,
|
||||
record_schema: schema,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
let mut schema_reader = BufReader::new(val_str.as_bytes());
|
||||
let inferred_schema = infer_json_schema(&mut schema_reader, None, stream_type).unwrap();
|
||||
let value_iter = [record_val].into_iter();
|
||||
let inferred_schema = infer_json_schema_from_values(value_iter, stream_type).unwrap();
|
||||
|
||||
if schema.fields.eq(&inferred_schema.fields) {
|
||||
// return (true, None, schema.fields().to_vec());
|
||||
return SchemaEvolution {
|
||||
return Ok(SchemaEvolution {
|
||||
schema_compatible: true,
|
||||
types_delta: None,
|
||||
schema_fields: schema.to_cloned_fields(),
|
||||
is_schema_changed: false,
|
||||
record_schema: schema,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
if inferred_schema.fields.len() > CONFIG.limit.req_cols_per_record_limit {
|
||||
// return (false, None, inferred_schema.fields().to_vec());
|
||||
return SchemaEvolution {
|
||||
schema_compatible: false,
|
||||
types_delta: None,
|
||||
schema_fields: inferred_schema.to_cloned_fields(),
|
||||
is_schema_changed: false,
|
||||
record_schema: schema,
|
||||
};
|
||||
return Err(get_rquest_columns_limit_error());
|
||||
}
|
||||
|
||||
if schema.fields().is_empty() {
|
||||
|
@ -324,12 +329,12 @@ pub async fn check_for_schema(
|
|||
)
|
||||
.await
|
||||
{
|
||||
return value;
|
||||
return Ok(value);
|
||||
}
|
||||
};
|
||||
|
||||
let (field_datatype_delta, is_schema_changed, final_fields, record_schema) =
|
||||
get_schema_changes(&schema, &inferred_schema, is_arrow);
|
||||
let (is_schema_changed, field_datatype_delta, _) =
|
||||
get_schema_changes(&schema, &inferred_schema);
|
||||
|
||||
if is_schema_changed {
|
||||
if let Some(value) = handle_existing_schema(
|
||||
|
@ -339,28 +344,23 @@ pub async fn check_for_schema(
|
|||
&inferred_schema,
|
||||
record_ts,
|
||||
stream_schema_map,
|
||||
is_arrow,
|
||||
)
|
||||
.await
|
||||
{
|
||||
value
|
||||
Ok(value)
|
||||
} else {
|
||||
SchemaEvolution {
|
||||
Ok(SchemaEvolution {
|
||||
schema_compatible: true,
|
||||
types_delta: Some(field_datatype_delta),
|
||||
schema_fields: schema.to_cloned_fields(),
|
||||
is_schema_changed: false,
|
||||
record_schema,
|
||||
}
|
||||
})
|
||||
}
|
||||
} else {
|
||||
SchemaEvolution {
|
||||
Ok(SchemaEvolution {
|
||||
schema_compatible: true,
|
||||
types_delta: Some(field_datatype_delta),
|
||||
schema_fields: final_fields,
|
||||
is_schema_changed,
|
||||
record_schema,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -371,7 +371,6 @@ async fn handle_existing_schema(
|
|||
inferred_schema: &Schema,
|
||||
record_ts: i64,
|
||||
stream_schema_map: &mut AHashMap<String, Schema>,
|
||||
is_arrow: bool,
|
||||
) -> Option<SchemaEvolution> {
|
||||
if !CONFIG.common.local_mode {
|
||||
let mut lock = etcd::Locker::new(&format!("schema/{org_id}/{stream_type}/{stream_name}"));
|
||||
|
@ -379,8 +378,8 @@ async fn handle_existing_schema(
|
|||
let schema = db::schema::get_from_db(org_id, stream_name, stream_type)
|
||||
.await
|
||||
.unwrap();
|
||||
let (field_datatype_delta, is_schema_changed, final_fields, _) =
|
||||
get_schema_changes(&schema, inferred_schema, is_arrow);
|
||||
let (is_schema_changed, field_datatype_delta, final_fields) =
|
||||
get_schema_changes(&schema, inferred_schema);
|
||||
let is_field_delta = !field_datatype_delta.is_empty();
|
||||
let mut metadata = schema.metadata().clone();
|
||||
if !metadata.contains_key("created_at") {
|
||||
|
@ -390,7 +389,7 @@ async fn handle_existing_schema(
|
|||
);
|
||||
}
|
||||
metadata.extend(inferred_schema.metadata().to_owned());
|
||||
let final_schema = Schema::new(final_fields.clone()).with_metadata(metadata);
|
||||
let final_schema = Schema::new(final_fields).with_metadata(metadata);
|
||||
if is_schema_changed {
|
||||
log::info!(
|
||||
"Acquired lock for cluster stream {} to update schema",
|
||||
|
@ -415,9 +414,7 @@ async fn handle_existing_schema(
|
|||
Some(SchemaEvolution {
|
||||
schema_compatible: true,
|
||||
types_delta: Some(field_datatype_delta),
|
||||
schema_fields: final_fields,
|
||||
is_schema_changed,
|
||||
record_schema: final_schema,
|
||||
})
|
||||
} else {
|
||||
let key = format!(
|
||||
|
@ -436,8 +433,8 @@ async fn handle_existing_schema(
|
|||
let schema = db::schema::get_from_db(org_id, stream_name, stream_type)
|
||||
.await
|
||||
.unwrap();
|
||||
let (field_datatype_delta, is_schema_changed, final_fields, _) =
|
||||
get_schema_changes(&schema, inferred_schema, is_arrow);
|
||||
let (is_schema_changed, field_datatype_delta, final_fields) =
|
||||
get_schema_changes(&schema, inferred_schema);
|
||||
let is_field_delta = !field_datatype_delta.is_empty();
|
||||
let mut metadata = schema.metadata().clone();
|
||||
if !metadata.contains_key("created_at") {
|
||||
|
@ -447,7 +444,7 @@ async fn handle_existing_schema(
|
|||
);
|
||||
}
|
||||
metadata.extend(inferred_schema.metadata().to_owned());
|
||||
let final_schema = Schema::new(final_fields.clone()).with_metadata(metadata);
|
||||
let final_schema = Schema::new(final_fields).with_metadata(metadata);
|
||||
if is_schema_changed {
|
||||
log::info!(
|
||||
"Acquired lock for local stream {} to update schema",
|
||||
|
@ -473,26 +470,21 @@ async fn handle_existing_schema(
|
|||
Some(SchemaEvolution {
|
||||
schema_compatible: true,
|
||||
types_delta: Some(field_datatype_delta),
|
||||
schema_fields: final_fields,
|
||||
is_schema_changed,
|
||||
record_schema: final_schema,
|
||||
})
|
||||
} else {
|
||||
// Some other request has already acquired the lock.
|
||||
let schema = db::schema::get_from_db(org_id, stream_name, stream_type)
|
||||
.await
|
||||
.unwrap();
|
||||
let (field_datatype_delta, _is_schema_changed, final_fields, _) =
|
||||
get_schema_changes(&schema, inferred_schema, is_arrow);
|
||||
let (_, field_datatype_delta, _) = get_schema_changes(&schema, inferred_schema);
|
||||
stream_schema_map.insert(stream_name.to_string(), schema.clone());
|
||||
log::info!("Schema exists for stream {} ", stream_name);
|
||||
drop(lock_acquired); // release lock
|
||||
Some(SchemaEvolution {
|
||||
schema_compatible: true,
|
||||
types_delta: Some(field_datatype_delta),
|
||||
schema_fields: final_fields,
|
||||
is_schema_changed: false,
|
||||
record_schema: schema,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -555,9 +547,7 @@ async fn handle_new_schema(
|
|||
return Some(SchemaEvolution {
|
||||
schema_compatible: true,
|
||||
types_delta: None,
|
||||
schema_fields: final_schema.to_cloned_fields(),
|
||||
is_schema_changed: true,
|
||||
record_schema: final_schema,
|
||||
});
|
||||
} else {
|
||||
stream_schema_map.insert(stream_name.to_string(), chk_schema.clone());
|
||||
|
@ -610,9 +600,7 @@ async fn handle_new_schema(
|
|||
return Some(SchemaEvolution {
|
||||
schema_compatible: true,
|
||||
types_delta: None,
|
||||
schema_fields: final_schema.to_cloned_fields(),
|
||||
is_schema_changed: true,
|
||||
record_schema: final_schema,
|
||||
});
|
||||
} else {
|
||||
// No schema change
|
||||
|
@ -639,21 +627,16 @@ async fn handle_new_schema(
|
|||
None
|
||||
}
|
||||
|
||||
fn get_schema_changes(
|
||||
schema: &Schema,
|
||||
inferred_schema: &Schema,
|
||||
_is_arrow: bool,
|
||||
) -> (Vec<Field>, bool, Vec<Field>, Schema) {
|
||||
fn get_schema_changes(schema: &Schema, inferred_schema: &Schema) -> (bool, Vec<Field>, Vec<Field>) {
|
||||
let mut is_schema_changed = false;
|
||||
let mut field_datatype_delta: Vec<_> = vec![];
|
||||
let mut new_field_delta: Vec<_> = vec![];
|
||||
|
||||
let mut merged_fields = schema
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|f| f.as_ref().to_owned())
|
||||
.collect::<Vec<_>>();
|
||||
let mut merged_fields_chk: AHashMap<String, usize> = AHashMap::new();
|
||||
let mut merged_fields_chk = hashbrown::HashMap::with_capacity(merged_fields.len());
|
||||
for (i, f) in merged_fields.iter().enumerate() {
|
||||
merged_fields_chk.insert(f.name().to_string(), i);
|
||||
}
|
||||
|
@ -663,41 +646,31 @@ fn get_schema_changes(
|
|||
let item_data_type = item.data_type();
|
||||
|
||||
match merged_fields_chk.get(item_name) {
|
||||
None => {
|
||||
is_schema_changed = true;
|
||||
merged_fields.push((**item).clone());
|
||||
merged_fields_chk.insert(item_name.to_string(), merged_fields.len() - 1);
|
||||
}
|
||||
Some(idx) => {
|
||||
let existing_field = &merged_fields[*idx];
|
||||
if existing_field.data_type() != item_data_type {
|
||||
if !CONFIG.common.widening_schema_evolution {
|
||||
field_datatype_delta.push(existing_field.clone());
|
||||
} else if is_widening_conversion(existing_field.data_type(), item_data_type) {
|
||||
is_schema_changed = true;
|
||||
field_datatype_delta.push((**item).clone());
|
||||
merged_fields[*idx] = (**item).clone();
|
||||
} else {
|
||||
let allowed =
|
||||
is_widening_conversion(existing_field.data_type(), item_data_type);
|
||||
if allowed {
|
||||
is_schema_changed = true;
|
||||
field_datatype_delta.push((**item).clone());
|
||||
merged_fields[*idx] = (**item).clone();
|
||||
} else {
|
||||
let mut meta = existing_field.metadata().clone();
|
||||
meta.insert("zo_cast".to_owned(), true.to_string());
|
||||
field_datatype_delta.push(existing_field.clone().with_metadata(meta));
|
||||
}
|
||||
let mut meta = existing_field.metadata().clone();
|
||||
meta.insert("zo_cast".to_owned(), true.to_string());
|
||||
field_datatype_delta.push(existing_field.clone().with_metadata(meta));
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
is_schema_changed = true;
|
||||
new_field_delta.push(item);
|
||||
merged_fields.push((**item).clone());
|
||||
merged_fields_chk.insert(item_name.to_string(), merged_fields.len() - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(
|
||||
field_datatype_delta,
|
||||
is_schema_changed,
|
||||
merged_fields,
|
||||
Schema::empty(),
|
||||
)
|
||||
(is_schema_changed, field_datatype_delta, merged_fields)
|
||||
}
|
||||
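get_schema_changes now returns `(is_schema_changed, field_datatype_delta, merged_fields)` and, when it finds a non-widening type conflict, keeps the stored type but tags the field with `zo_cast` metadata so the record gets cast at write time. A condensed sketch of that merge, assuming `arrow_schema` and a toy widening rule; the real code also consults CONFIG.common.widening_schema_evolution and a fuller conversion table:

use std::collections::HashMap;
use arrow_schema::{DataType, Field, Schema};

// Toy widening rule for the sketch only.
fn is_widening(from: &DataType, to: &DataType) -> bool {
    matches!(
        (from, to),
        (&DataType::Int64, &DataType::Float64) | (&DataType::Int32, &DataType::Int64)
    )
}

fn schema_changes(schema: &Schema, inferred: &Schema) -> (bool, Vec<Field>, Vec<Field>) {
    let mut changed = false;
    let mut delta = Vec::new();
    let mut merged: Vec<Field> = schema.fields().iter().map(|f| f.as_ref().clone()).collect();
    let mut index: HashMap<String, usize> =
        merged.iter().enumerate().map(|(i, f)| (f.name().clone(), i)).collect();

    for item in inferred.fields().iter() {
        match index.get(item.name()) {
            None => {
                // brand-new field: append it to the merged schema
                changed = true;
                index.insert(item.name().clone(), merged.len());
                merged.push(item.as_ref().clone());
            }
            Some(&i) => {
                let existing = &merged[i];
                if existing.data_type() != item.data_type() {
                    if is_widening(existing.data_type(), item.data_type()) {
                        // widen the stored type to the inferred one
                        changed = true;
                        delta.push(item.as_ref().clone());
                        merged[i] = item.as_ref().clone();
                    } else {
                        // keep the stored type; tag the field so the record is cast
                        let mut meta = existing.metadata().clone();
                        meta.insert("zo_cast".to_string(), "true".to_string());
                        delta.push(existing.clone().with_metadata(meta));
                    }
                }
            }
        }
    }
    (changed, delta, merged)
}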
|
||||
pub async fn stream_schema_exists(
|
||||
|
@ -859,7 +832,8 @@ mod tests {
|
|||
async fn test_check_for_schema() {
|
||||
let stream_name = "Sample";
|
||||
let org_name = "nexus";
|
||||
let record = r#"{"Year": 1896, "City": "Athens", "_timestamp": 1234234234234}"#;
|
||||
let record =
|
||||
json::json!(r#"{"Year": 1896, "City": "Athens", "_timestamp": 1234234234234}"#);
|
||||
|
||||
let schema = Schema::new(vec![
|
||||
Field::new("Year", DataType::Int64, false),
|
||||
|
@ -872,12 +846,12 @@ mod tests {
|
|||
org_name,
|
||||
stream_name,
|
||||
StreamType::Logs,
|
||||
record,
|
||||
&mut map,
|
||||
&record,
|
||||
1234234234234,
|
||||
false,
|
||||
)
|
||||
.await;
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(result.schema_compatible);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -56,8 +56,8 @@ pub fn date_format_expr_impl() -> ScalarFunctionImplementation {
|
|||
)));
|
||||
}
|
||||
|
||||
// 1. cast both arguments to Union. These casts MUST be aligned with the
|
||||
// signature or this function panics!
|
||||
// 1. cast both arguments to Union. These casts MUST be aligned with the signature or this
|
||||
// function panics!
|
||||
let timestamp = &args[0]
|
||||
.as_any()
|
||||
.downcast_ref::<Int64Array>()
|
||||
|
|
|
@ -18,7 +18,7 @@ use std::{str::FromStr, sync::Arc};
|
|||
use ahash::AHashMap as HashMap;
|
||||
use config::{
|
||||
meta::stream::{FileKey, FileMeta, StreamType},
|
||||
utils::{parquet::new_parquet_writer, schema::infer_json_schema_from_iterator},
|
||||
utils::{parquet::new_parquet_writer, schema::infer_json_schema_from_values},
|
||||
CONFIG, PARQUET_BATCH_SIZE,
|
||||
};
|
||||
use datafusion::{
|
||||
|
@ -126,6 +126,7 @@ pub async fn sql(
|
|||
)
|
||||
.await?,
|
||||
);
|
||||
let mut spend_time = start.elapsed().as_secs_f64();
|
||||
|
||||
// get alias from context query for agg sql
|
||||
let meta_sql = sql::Sql::new(&sql.query_context);
|
||||
|
@ -182,10 +183,13 @@ pub async fn sql(
|
|||
}
|
||||
let batches = df.collect().await?;
|
||||
result.insert(format!("agg_{name}"), batches);
|
||||
|
||||
let q_time = start.elapsed().as_secs_f64();
|
||||
log::info!(
|
||||
"[session_id {session_id}] Query agg:{name} took {:.3} seconds.",
|
||||
start.elapsed().as_secs_f64()
|
||||
q_time - spend_time
|
||||
);
|
||||
spend_time = q_time;
|
||||
}
|
||||
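The timing change above logs how long each aggregation query took on its own, by keeping a running `spend_time` and reporting the delta rather than the cumulative elapsed time. A tiny standalone sketch of the pattern:

use std::time::Instant;

fn main() {
    let start = Instant::now();
    let mut spend_time = start.elapsed().as_secs_f64();
    for name in ["histogram", "top_n"] {
        // ... run the aggregation query for `name` here ...
        let q_time = start.elapsed().as_secs_f64();
        // log only the time spent on this aggregation, not the running total
        println!("Query agg:{name} took {:.3} seconds.", q_time - spend_time);
        spend_time = q_time;
    }
}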
|
||||
// drop table
|
||||
|
@ -1023,6 +1027,9 @@ pub fn create_session_config(search_type: &SearchType) -> Result<SessionConfig>
|
|||
if CONFIG.common.bloom_filter_enabled {
|
||||
config = config.set_bool("datafusion.execution.parquet.bloom_filter_enabled", true);
|
||||
}
|
||||
if CONFIG.common.bloom_filter_force_disabled {
|
||||
config = config.set_bool("datafusion.execution.parquet.bloom_filter_enabled", false);
|
||||
}
|
||||
Ok(config)
|
||||
}
|
||||
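The new `bloom_filter_force_disabled` branch is applied after the enable branch, so a forced disable always wins. A sketch of that precedence using DataFusion's `SessionConfig` as in the hunk; the two booleans stand in for the CONFIG flags:

use datafusion::prelude::SessionConfig;

fn parquet_bloom_filter_config(enabled: bool, force_disabled: bool) -> SessionConfig {
    let mut config = SessionConfig::new();
    if enabled {
        config = config.set_bool("datafusion.execution.parquet.bloom_filter_enabled", true);
    }
    if force_disabled {
        // applied last, so it overrides the enable flag above
        config = config.set_bool("datafusion.execution.parquet.bloom_filter_enabled", false);
    }
    config
}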
|
||||
|
@ -1184,12 +1191,12 @@ fn apply_query_fn(
|
|||
},
|
||||
&json::Value::Object(hit.clone()),
|
||||
);
|
||||
(!ret_val.is_null()).then_some(flatten::flatten(&ret_val).unwrap_or(ret_val))
|
||||
(!ret_val.is_null()).then_some(flatten::flatten(ret_val).unwrap())
|
||||
})
|
||||
.collect();
|
||||
|
||||
let value_iter = rows_val.iter().map(Ok);
|
||||
let inferred_schema = infer_json_schema_from_iterator(value_iter, stream_type).unwrap();
|
||||
let value_iter = rows_val.iter();
|
||||
let inferred_schema = infer_json_schema_from_values(value_iter, stream_type).unwrap();
|
||||
let mut decoder =
|
||||
arrow::json::ReaderBuilder::new(Arc::new(inferred_schema)).build_decoder()?;
|
||||
|
||||
|
|
|
@ -65,8 +65,8 @@ pub fn match_expr_impl(case_insensitive: bool) -> ScalarFunctionImplementation {
|
|||
)));
|
||||
}
|
||||
|
||||
// 1. cast both arguments to string. These casts MUST be aligned with the
|
||||
// signature or this function panics!
|
||||
// 1. cast both arguments to string. These casts MUST be aligned with the signature or this
|
||||
// function panics!
|
||||
let haystack = &args[0]
|
||||
.as_any()
|
||||
.downcast_ref::<StringArray>()
|
||||
|
|
|
@ -55,8 +55,8 @@ pub fn time_range_expr_impl() -> ScalarFunctionImplementation {
|
|||
)));
|
||||
}
|
||||
|
||||
// 1. cast both arguments to Union. These casts MUST be aligned with the
|
||||
// signature or this function panics!
|
||||
// 1. cast both arguments to Union. These casts MUST be aligned with the signature or this
|
||||
// function panics!
|
||||
let base = &args[0]
|
||||
.as_any()
|
||||
.downcast_ref::<Int64Array>()
|
||||
|
|
|
@ -30,7 +30,7 @@ use vrl::compiler::{runtime::Runtime, CompilationResult, Program, TargetValueRef
|
|||
|
||||
use crate::{
|
||||
common::{infra::config::QUERY_FUNCTIONS, utils::json},
|
||||
service::ingestion::{compile_vrl_function, get_value},
|
||||
service::ingestion::{compile_vrl_function, get_string_value},
|
||||
};
|
||||
|
||||
fn create_user_df(
|
||||
|
@ -147,13 +147,14 @@ fn get_udf_vrl(
|
|||
for col in res.fields {
|
||||
let field_builder = col_val_map.entry(col.to_string()).or_default();
|
||||
if res_map.contains_key(&col) {
|
||||
field_builder.insert(i, get_value(res_map.get(&col).unwrap()));
|
||||
field_builder
|
||||
.insert(i, get_string_value(res_map.get(&col).unwrap()));
|
||||
} else {
|
||||
field_builder.insert(i, "".to_string());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
res_data_vec.insert(i, get_value(&result));
|
||||
res_data_vec.insert(i, get_string_value(&result));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -280,23 +280,31 @@ fn get_key_from_error(err: &str, pos: usize) -> Option<String> {
|
|||
None
|
||||
}
|
||||
|
||||
fn check_memory_circuit_breaker(scan_stats: &ScanStats) -> Result<(), Error> {
|
||||
fn check_memory_circuit_breaker(session_id: &str, scan_stats: &ScanStats) -> Result<(), Error> {
|
||||
let scan_size = if scan_stats.compressed_size > 0 {
|
||||
scan_stats.compressed_size
|
||||
} else {
|
||||
scan_stats.original_size
|
||||
};
|
||||
if let Some(cur_memory) = memory_stats::memory_stats() {
|
||||
if cur_memory.physical_mem as i64 + scan_size
|
||||
> (CONFIG.limit.mem_total * CONFIG.common.memory_circuit_breaker_ratio / 100) as i64
|
||||
// left memory < datafusion * breaker_ratio and scan_size >= left memory
|
||||
let left_mem = CONFIG.limit.mem_total - cur_memory.physical_mem;
|
||||
if (left_mem
|
||||
< (CONFIG.memory_cache.datafusion_max_size
|
||||
* CONFIG.common.memory_circuit_breaker_ratio
|
||||
/ 100))
|
||||
&& (scan_size >= left_mem as i64)
|
||||
{
|
||||
let err = format!(
|
||||
"fire memory_circuit_breaker, try to alloc {} bytes, now current memory usage is {} bytes, larger than limit of [{} bytes] ",
|
||||
"fire memory_circuit_breaker, try to alloc {} bytes, now current memory usage is {} bytes, left memory {} bytes, left memory more than limit of [{} bytes] or scan_size more than left memory , please submit a new query with a short time range",
|
||||
scan_size,
|
||||
cur_memory.physical_mem,
|
||||
CONFIG.limit.mem_total * CONFIG.common.memory_circuit_breaker_ratio / 100
|
||||
left_mem,
|
||||
CONFIG.memory_cache.datafusion_max_size
|
||||
* CONFIG.common.memory_circuit_breaker_ratio
|
||||
/ 100
|
||||
);
|
||||
log::warn!("{}", err);
|
||||
log::warn!("[{session_id}] {}", err);
|
||||
return Err(Error::Message(err.to_string()));
|
||||
}
|
||||
}
|
||||
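The rewritten breaker trips on the memory actually left to the process rather than on total usage: it fires only when the remaining memory falls below the DataFusion cache budget's breaker share and the query's scan size would not fit into what is left. A plain-function sketch of that condition; the parameter names mirror the config fields used above but are ordinary arguments here:

fn should_trip(
    physical_mem: usize,
    mem_total: usize,
    datafusion_max_size: usize,
    breaker_ratio: usize,
    scan_size: i64,
) -> bool {
    // remaining memory available to the process
    let left_mem = mem_total.saturating_sub(physical_mem);
    // trip only if the remainder is under the breaker share AND the scan won't fit
    left_mem < datafusion_max_size * breaker_ratio / 100 && scan_size >= left_mem as i64
}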
|
|
|
@ -153,7 +153,7 @@ pub async fn search(
|
|||
);
|
||||
|
||||
if CONFIG.common.memory_circuit_breaker_enable {
|
||||
super::check_memory_circuit_breaker(&scan_stats)?;
|
||||
super::check_memory_circuit_breaker(session_id, &scan_stats)?;
|
||||
}
|
||||
|
||||
// load files to local cache
|
||||
|
|
|
@ -122,10 +122,6 @@ pub async fn search_parquet(
|
|||
return Ok((HashMap::new(), scan_stats));
|
||||
}
|
||||
|
||||
if CONFIG.common.memory_circuit_breaker_enable {
|
||||
super::check_memory_circuit_breaker(&scan_stats)?;
|
||||
}
|
||||
|
||||
// fetch all schema versions, get latest schema
|
||||
let schema_latest = Arc::new(
|
||||
schema_latest
|
||||
|
@ -351,7 +347,7 @@ pub async fn search_memtable(
|
|||
);
|
||||
|
||||
if CONFIG.common.memory_circuit_breaker_enable {
|
||||
super::check_memory_circuit_breaker(&scan_stats)?;
|
||||
super::check_memory_circuit_breaker(session_id, &scan_stats)?;
|
||||
}
|
||||
|
||||
// fetch all schema versions, get latest schema
|
||||
|
|
|
@ -424,8 +424,7 @@ async fn search_in_cluster(mut req: cluster_rpc::SearchRequest) -> Result<search
|
|||
},
|
||||
&json::Value::Object(hit.clone()),
|
||||
);
|
||||
(!ret_val.is_null())
|
||||
.then_some(flatten::flatten(&ret_val).unwrap_or(ret_val))
|
||||
(!ret_val.is_null()).then_some(flatten::flatten(ret_val).unwrap())
|
||||
})
|
||||
.collect(),
|
||||
None => json_rows
|
||||
|
@ -444,7 +443,8 @@ async fn search_in_cluster(mut req: cluster_rpc::SearchRequest) -> Result<search
|
|||
|
||||
if sql.uses_zo_fn {
|
||||
for source in sources {
|
||||
result.add_hit(&flatten::flatten(&source).unwrap());
|
||||
result
|
||||
.add_hit(&flatten::flatten(source).map_err(|e| Error::Message(e.to_string()))?);
|
||||
}
|
||||
} else {
|
||||
for source in sources {
|
||||
|
|
|
@ -20,7 +20,7 @@ use ahash::AHashMap;
|
|||
use bytes::BytesMut;
|
||||
use chrono::{Duration, Utc};
|
||||
use config::{
|
||||
meta::stream::StreamType, metrics, utils::hasher::get_fields_key_xxh3, CONFIG, DISTINCT_FIELDS,
|
||||
meta::stream::StreamType, metrics, utils::schema_ext::SchemaExt, CONFIG, DISTINCT_FIELDS,
|
||||
};
|
||||
use datafusion::arrow::datatypes::Schema;
|
||||
use opentelemetry::trace::{SpanId, TraceId};
|
||||
|
@ -42,7 +42,7 @@ use crate::{
|
|||
traces::{Event, Span, SpanRefType},
|
||||
usage::UsageType,
|
||||
},
|
||||
utils::{self, flatten, json},
|
||||
utils::{flatten, json},
|
||||
},
|
||||
service::{
|
||||
db, distinct_values, format_partition_key, format_stream_name,
|
||||
|
@ -83,6 +83,17 @@ pub async fn handle_trace_request(
|
|||
"Quota exceeded for this organization".to_string(),
|
||||
)));
|
||||
}
|
||||
|
||||
// check memtable
|
||||
if let Err(e) = ingester::check_memtable_size() {
|
||||
return Ok(
|
||||
HttpResponse::ServiceUnavailable().json(MetaHttpResponse::error(
|
||||
http::StatusCode::SERVICE_UNAVAILABLE.into(),
|
||||
e.to_string(),
|
||||
)),
|
||||
);
|
||||
}
|
||||
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
let traces_stream_name = match in_stream_name {
|
||||
|
@ -241,30 +252,34 @@ pub async fn handle_trace_request(
|
|||
let value: json::Value = json::to_value(local_val).unwrap();
|
||||
|
||||
// JSON Flattening
|
||||
let mut value = flatten::flatten(&value).unwrap();
|
||||
let mut value = flatten::flatten(value).map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
||||
})?;
|
||||
|
||||
if !local_trans.is_empty() {
|
||||
value = crate::service::ingestion::apply_stream_transform(
|
||||
&local_trans,
|
||||
&value,
|
||||
value,
|
||||
&stream_vrl_map,
|
||||
traces_stream_name,
|
||||
&mut runtime,
|
||||
)
|
||||
.unwrap_or(value);
|
||||
.map_err(|e| {
|
||||
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
|
||||
})?;
|
||||
}
|
||||
// End row based transform
|
||||
// get json object
|
||||
let val_map = value.as_object_mut().unwrap();
|
||||
let record_val = value.as_object_mut().unwrap();
|
||||
|
||||
val_map.insert(
|
||||
record_val.insert(
|
||||
CONFIG.common.column_timestamp.clone(),
|
||||
json::Value::Number(timestamp.into()),
|
||||
);
|
||||
|
||||
// get distinct_value item
|
||||
for field in DISTINCT_FIELDS.iter() {
|
||||
if let Some(val) = val_map.get(field) {
|
||||
if let Some(val) = record_val.get(field) {
|
||||
if !val.is_null() {
|
||||
let (filter_name, filter_value) = if field == "operation_name" {
|
||||
("service_name".to_string(), service_name.clone())
|
||||
|
@ -283,30 +298,17 @@ pub async fn handle_trace_request(
|
|||
}
|
||||
}
|
||||
|
||||
let value_str = crate::common::utils::json::to_string(&val_map).unwrap();
|
||||
|
||||
// check schema
|
||||
let schema_evolution = check_for_schema(
|
||||
let _ = check_for_schema(
|
||||
org_id,
|
||||
traces_stream_name,
|
||||
StreamType::Traces,
|
||||
&value_str,
|
||||
&mut traces_schema_map,
|
||||
&json::Value::Object(record_val.clone()),
|
||||
timestamp.try_into().unwrap(),
|
||||
true,
|
||||
)
|
||||
.await;
|
||||
|
||||
let schema_key = get_fields_key_xxh3(&schema_evolution.schema_fields);
|
||||
// get hour key
|
||||
let mut hour_key = super::ingestion::get_wal_time_key(
|
||||
timestamp.try_into().unwrap(),
|
||||
&partition_keys,
|
||||
partition_time_level,
|
||||
val_map,
|
||||
Some(&schema_key),
|
||||
);
|
||||
|
||||
if trigger.is_none() && !stream_alerts_map.is_empty() {
|
||||
// Start check for alert trigger
|
||||
let key = format!("{}/{}/{}", &org_id, StreamType::Traces, traces_stream_name);
|
||||
|
@ -314,7 +316,7 @@ pub async fn handle_trace_request(
|
|||
let mut trigger_alerts: Vec<(Alert, Vec<json::Map<String, json::Value>>)> =
|
||||
Vec::new();
|
||||
for alert in alerts {
|
||||
if let Ok(Some(v)) = alert.evaluate(Some(val_map)).await {
|
||||
if let Ok(Some(v)) = alert.evaluate(Some(record_val)).await {
|
||||
trigger_alerts.push((alert.clone(), v));
|
||||
}
|
||||
}
|
||||
|
@@ -323,28 +325,37 @@ pub async fn handle_trace_request(
// End check for alert trigger
}

// get hour key
let rec_schema = traces_schema_map
.get(traces_stream_name)
.unwrap()
.clone()
.with_metadata(HashMap::new());
let schema_key = rec_schema.hash_key();
let mut hour_key = super::ingestion::get_wal_time_key(
timestamp.try_into().unwrap(),
&partition_keys,
partition_time_level,
record_val,
Some(&schema_key),
);

if partition_keys.is_empty() {
let partition_key = format!("service_name={}", service_name);
hour_key.push_str(&format!("/{}", format_partition_key(&partition_key)));
}

let hour_buf = data_buf.entry(hour_key).or_insert_with(|| {
let schema = traces_schema_map
.get(traces_stream_name)
.unwrap()
.clone()
.with_metadata(HashMap::new());
SchemaRecords {
schema_key,
schema: Arc::new(schema),
records: vec![],
records_size: 0,
}
let hour_buf = data_buf.entry(hour_key).or_insert_with(|| SchemaRecords {
schema_key,
schema: Arc::new(rec_schema),
records: vec![],
records_size: 0,
});
let loc_value: utils::json::Value =
utils::json::from_slice(value_str.as_bytes()).unwrap();
hour_buf.records.push(Arc::new(loc_value));
hour_buf.records_size += value_str.len();
let record_val = record_val.to_owned();
let record_val = json::Value::Object(record_val);
let record_size = json::to_vec(&record_val).unwrap_or_default().len();
hour_buf.records.push(Arc::new(record_val));
hour_buf.records_size += record_size;

if timestamp < min_ts.try_into().unwrap() {
partial_success.rejected_spans += 1;
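
This hunk derives the schema key from the cached Arrow schema itself (`rec_schema.hash_key()` via the `SchemaExt` trait imported further down) rather than from the xxh3 hash of the evolved field list, and reuses that schema for the write buffer. A rough sketch of the idea, keying a metadata-stripped schema by its fields; the `SchemaKey` trait and `DefaultHasher` here are stand-ins, not the project's actual `SchemaExt::hash_key`:

```rust
use std::{
    collections::HashMap,
    hash::{Hash, Hasher},
};

use arrow_schema::{DataType, Field, Schema};

// Hypothetical stand-in for the project's SchemaExt::hash_key; the real trait
// lives in the config crate and may hash the fields differently.
trait SchemaKey {
    fn hash_key(&self) -> String;
}

impl SchemaKey for Schema {
    fn hash_key(&self) -> String {
        let mut hasher = std::collections::hash_map::DefaultHasher::new();
        for field in self.fields().iter() {
            field.name().hash(&mut hasher);
            // Hash the type via its Debug form; good enough for a sketch.
            format!("{:?}", field.data_type()).hash(&mut hasher);
        }
        format!("{:x}", hasher.finish())
    }
}

fn main() {
    let schema = Schema::new(vec![
        Field::new("_timestamp", DataType::Int64, false),
        Field::new("service_name", DataType::Utf8, true),
    ])
    // Metadata is cleared before hashing, mirroring `.with_metadata(HashMap::new())`.
    .with_metadata(HashMap::new());

    println!("schema_key = {}", schema.hash_key());
}
```
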
@@ -359,7 +370,6 @@ pub async fn handle_trace_request(
data_buf,
thread_id,
&StreamParams::new(org_id, traces_stream_name, StreamType::Traces),
None,
)
.await;
let time = start.elapsed().as_secs_f64();
@@ -19,7 +19,7 @@ use actix_web::{http, web, HttpResponse};
use ahash::AHashMap;
use chrono::{Duration, Utc};
use config::{
meta::stream::StreamType, metrics, utils::hasher::get_fields_key_xxh3, CONFIG, DISTINCT_FIELDS,
meta::stream::StreamType, metrics, utils::schema_ext::SchemaExt, CONFIG, DISTINCT_FIELDS,
};
use datafusion::arrow::datatypes::Schema;
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;

@@ -37,7 +37,6 @@ use crate::{
},
usage::UsageType,
},
utils,
utils::{flatten, json},
},
service::{
@@ -87,6 +86,16 @@ pub async fn traces_json(
)));
}

// check memtable
if let Err(e) = ingester::check_memtable_size() {
return Ok(
HttpResponse::ServiceUnavailable().json(MetaHttpResponse::error(
http::StatusCode::SERVICE_UNAVAILABLE.into(),
e.to_string(),
)),
);
}

let start = std::time::Instant::now();
let traces_stream_name = match in_stream_name {
Some(name) => format_stream_name(name),
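
The new guard makes `traces_json` fail fast with 503 when the ingester's memtable is over its configured limit, before any spans are parsed. A simplified sketch of that back-pressure check with actix-web; `check_memtable_size` is a dummy stand-in for `ingester::check_memtable_size`, and the JSON body only approximates `MetaHttpResponse::error`:

```rust
use actix_web::{http::StatusCode, HttpResponse};
use serde_json::json;

// Dummy stand-in: the real check compares the in-memory table size against a
// configured limit and returns Ok(()) when there is room.
fn check_memtable_size() -> Result<(), String> {
    Err("memtable full".to_string())
}

// Back-pressure pattern from the diff: reject ingestion early with 503 when
// the memtable is full, instead of accepting data that cannot be buffered.
fn guard_memtable() -> Result<(), HttpResponse> {
    if let Err(e) = check_memtable_size() {
        return Err(HttpResponse::ServiceUnavailable().json(json!({
            "code": StatusCode::SERVICE_UNAVAILABLE.as_u16(),
            "message": e,
        })));
    }
    Ok(())
}

fn main() {
    match guard_memtable() {
        Ok(()) => println!("accept request"),
        Err(resp) => println!("reject with status {}", resp.status()),
    }
}
```
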
@@ -304,30 +313,34 @@ pub async fn traces_json(
let mut value: json::Value = json::to_value(local_val).unwrap();

// JSON Flattening
value = flatten::flatten(&value).unwrap();
value = flatten::flatten(value).map_err(|e| {
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
})?;

if !local_trans.is_empty() {
value = crate::service::ingestion::apply_stream_transform(
&local_trans,
&value,
value,
&stream_vrl_map,
traces_stream_name,
&mut runtime,
)
.unwrap_or(value);
.map_err(|e| {
std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string())
})?;
}
// End row based transform */
// get json object
let val_map = value.as_object_mut().unwrap();
let record_val = value.as_object_mut().unwrap();

val_map.insert(
record_val.insert(
CONFIG.common.column_timestamp.clone(),
json::Value::Number(timestamp.into()),
);

// get distinct_value item
for field in DISTINCT_FIELDS.iter() {
if let Some(val) = val_map.get(field) {
if let Some(val) = record_val.get(field) {
if !val.is_null() {
let (filter_name, filter_value) = if field == "operation_name" {
("service_name".to_string(), service_name.clone())
@@ -346,30 +359,17 @@ pub async fn traces_json(
}
}

let value_str = crate::common::utils::json::to_string(&val_map).unwrap();

// check schema
let schema_evolution = check_for_schema(
let _ = check_for_schema(
org_id,
traces_stream_name,
StreamType::Traces,
&value_str,
&mut traces_schema_map,
&json::Value::Object(record_val.clone()),
timestamp.try_into().unwrap(),
true,
)
.await;

// get hour key
let schema_key = get_fields_key_xxh3(&schema_evolution.schema_fields);
let mut hour_key = crate::service::ingestion::get_wal_time_key(
timestamp.try_into().unwrap(),
&partition_keys,
partition_time_level,
val_map,
Some(&schema_key),
);

if trigger.is_none() && !stream_alerts_map.is_empty() {
// Start check for alert trigger
let key =
@@ -380,7 +380,7 @@ pub async fn traces_json(
Vec<json::Map<String, json::Value>>,
)> = Vec::new();
for alert in alerts {
if let Ok(Some(v)) = alert.evaluate(Some(val_map)).await {
if let Ok(Some(v)) = alert.evaluate(Some(record_val)).await {
trigger_alerts.push((alert.clone(), v));
}
}
@@ -389,28 +389,37 @@ pub async fn traces_json(
// End check for alert trigger
}

// get hour key
let rec_schema = traces_schema_map
.get(traces_stream_name)
.unwrap()
.clone()
.with_metadata(HashMap::new());
let schema_key = rec_schema.hash_key();
let mut hour_key = crate::service::ingestion::get_wal_time_key(
timestamp.try_into().unwrap(),
&partition_keys,
partition_time_level,
record_val,
Some(&schema_key),
);

if partition_keys.is_empty() {
let partition_key = format!("service_name={}", service_name);
hour_key.push_str(&format!("/{}", format_partition_key(&partition_key)));
}

let hour_buf = data_buf.entry(hour_key).or_insert_with(|| {
let schema = traces_schema_map
.get(traces_stream_name)
.unwrap()
.clone()
.with_metadata(HashMap::new());
SchemaRecords {
schema_key,
schema: Arc::new(schema),
records: vec![],
records_size: 0,
}
let hour_buf = data_buf.entry(hour_key).or_insert_with(|| SchemaRecords {
schema_key,
schema: Arc::new(rec_schema),
records: vec![],
records_size: 0,
});
let loc_value: utils::json::Value =
utils::json::from_slice(value_str.as_bytes()).unwrap();
hour_buf.records.push(Arc::new(loc_value));
hour_buf.records_size += value_str.len();
let record_val = record_val.to_owned();
let record_val = json::Value::Object(record_val);
let record_size = json::to_vec(&record_val).unwrap_or_default().len();
hour_buf.records.push(Arc::new(record_val));
hour_buf.records_size += record_size;
}
}
}
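
As in the gRPC handler, the buffered record is now the parsed JSON object itself, sized with `json::to_vec`, and the `or_insert_with` closure shrinks to a plain `SchemaRecords` literal built from the already-resolved schema. A self-contained sketch of that per-hour-key buffering, with a simplified `SchemaRecords` (no schema field) and made-up key values:

```rust
use std::{collections::HashMap, sync::Arc};

use serde_json::{json, Value};

// Simplified stand-in for the project's SchemaRecords buffer (the real struct
// also carries an Arc<Schema>); field names follow the diff.
struct SchemaRecords {
    schema_key: String,
    records: Vec<Arc<Value>>,
    records_size: usize,
}

fn main() {
    let mut data_buf: HashMap<String, SchemaRecords> = HashMap::new();

    // Made-up key values for illustration only.
    let hour_key = "2024/01/01/00/service_name=api".to_string();
    let schema_key = "abc123".to_string();
    let record_val = json!({"_timestamp": 1_700_000_000_000_i64, "service_name": "api"});

    // One buffer per hour key, created lazily the first time the key is seen.
    let hour_buf = data_buf.entry(hour_key).or_insert_with(|| SchemaRecords {
        schema_key,
        records: vec![],
        records_size: 0,
    });

    // Size the record from its serialized form, as the new code path does,
    // then keep the parsed Value instead of a re-parsed string copy.
    let record_size = serde_json::to_vec(&record_val).unwrap_or_default().len();
    hour_buf.records.push(Arc::new(record_val));
    hour_buf.records_size += record_size;

    for (key, buf) in &data_buf {
        println!(
            "{key}: schema {}, {} records, {} bytes",
            buf.schema_key,
            buf.records.len(),
            buf.records_size
        );
    }
}
```
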
@@ -421,7 +430,6 @@ pub async fn traces_json(
data_buf,
thread_id,
&StreamParams::new(org_id, traces_stream_name, StreamType::Traces),
None,
)
.await;
let time = start.elapsed().as_secs_f64();
@@ -22,7 +22,6 @@ use std::{
use byteorder::{BigEndian, ReadBytesExt};
use crc32fast::Hasher;
use snafu::{ensure, ResultExt};
use snap::read::FrameDecoder;

use crate::errors::*;

@@ -84,7 +83,7 @@ where
let compressed_read = self.f.by_ref().take(expected_len);
let hashing_read = CrcReader::new(compressed_read);
let mut decompressing_read = FrameDecoder::new(hashing_read);
let mut decompressing_read = snap::read::FrameDecoder::new(hashing_read);

let mut data = Vec::with_capacity(100);
decompressing_read
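
The WAL reader change only spells out `snap::read::FrameDecoder` at the call site instead of importing `FrameDecoder` directly; the decode-while-checksumming structure stays the same. A rough, self-contained sketch of that structure: `CrcReader` below is a hypothetical re-implementation of the pass-through reader (its CRC state is shared through an `Rc<RefCell<_>>` so it stays reachable after the decoder takes ownership), not the actual type from the crate.

```rust
use std::{
    cell::RefCell,
    io::{Cursor, Read},
    rc::Rc,
};

use crc32fast::Hasher;

// Pass-through reader: forwards reads to the inner reader and feeds every byte
// that goes by into a shared CRC32 hasher.
struct CrcReader<R> {
    inner: R,
    hasher: Rc<RefCell<Hasher>>,
}

impl<R: Read> Read for CrcReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let n = self.inner.read(buf)?;
        self.hasher.borrow_mut().update(&buf[..n]);
        Ok(n)
    }
}

fn main() -> std::io::Result<()> {
    // Build a snappy frame stream to stand in for a compressed WAL entry body.
    let mut compressed = Vec::new();
    snap::read::FrameEncoder::new(&b"hello wal entry"[..]).read_to_end(&mut compressed)?;

    // Decode through the fully qualified FrameDecoder, as in the diff, while the
    // CRC of the compressed bytes accumulates underneath.
    let hasher = Rc::new(RefCell::new(Hasher::new()));
    let hashing_read = CrcReader {
        inner: Cursor::new(compressed),
        hasher: Rc::clone(&hasher),
    };
    let mut decompressing_read = snap::read::FrameDecoder::new(hashing_read);

    let mut data = Vec::with_capacity(100);
    decompressing_read.read_to_end(&mut data)?;

    let crc = hasher.borrow().clone().finalize();
    println!("decoded {:?}, crc32 = {crc:#010x}", String::from_utf8_lossy(&data));
    Ok(())
}
```
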