Fix dataset count slowness (#818)

Fixes #811
Dilshod Tadjibaev 2023-09-21 07:55:13 -05:00 committed by GitHub
parent aacf191161
commit 393d86e99d
2 changed files with 10 additions and 2 deletions


@@ -6,7 +6,7 @@ mod iterator;
 #[cfg(any(feature = "sqlite", feature = "sqlite-bundled"))]
 mod sqlite;
-#[cfg(feature = "fake")]
+#[cfg(any(test, feature = "fake"))]
 pub use self::fake::*;
 pub use base::*;
 pub use in_memory::*;

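The widened cfg gate above is what makes the fake data source visible to unit tests without requiring the `fake` feature flag. A minimal sketch of the pattern, with hypothetical module and type names (not Burn's actual API):

#[cfg(any(test, feature = "fake"))]
mod fake {
    // Stand-in for the real fake-data module; compiled when the `fake`
    // feature is enabled OR when building tests (cargo test sets cfg(test)).
    pub struct FakeDataset;
}

#[cfg(test)]
mod tests {
    #[test]
    fn fake_is_available_in_tests() {
        // Compiles because cfg(test) satisfies the any(...) predicate,
        // even with the `fake` feature disabled.
        let _ = super::fake::FakeDataset;
    }
}

With the old gate, `#[cfg(feature = "fake")]`, this test would fail to compile unless the feature was explicitly turned on.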

@@ -247,7 +247,15 @@ fn fetch_columns_and_len(
     let columns = columns_from_statement(&statement);
     // Count the number of rows and save it as len
-    let mut statement = connection.prepare(format!("select count(*) from {split}").as_str())?;
+    //
+    // NOTE: Using coalesce(max(row_id), 0) instead of count(*) because count(*) is very slow for large tables.
+    // coalesce(max(row_id), 0) returns 0 if the table is empty; otherwise it returns the max row_id,
+    // which corresponds to the number of rows in the table.
+    // The main assumption is that row_id is monotonically increasing with no gaps. This holds for all
+    // the datasets we use; otherwise, row_id would not correspond to the index.
+    let mut statement =
+        connection.prepare(format!("select coalesce(max(row_id), 0) from {split}").as_str())?;
     let len = statement.query_row([], |row| {
         let len: usize = row.get(0)?;
         Ok(len)
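
To see why the change matters, here is a minimal, self-contained sketch using the rusqlite crate (whose API the diff above matches), with a hypothetical `train` split table. count(*) must scan the whole table, while SQLite optimizes max() on an integer primary key into a single b-tree seek; the two agree whenever row_id is dense:

use rusqlite::{Connection, Result};

fn main() -> Result<()> {
    let connection = Connection::open_in_memory()?;
    // Hypothetical split table with a dense row_id (1..=n, no gaps),
    // mirroring the layout the committed comment assumes.
    connection.execute_batch(
        "create table train (row_id integer primary key, item blob);
         insert into train (row_id, item) values (1, x'00'), (2, x'01'), (3, x'02');",
    )?;

    // Full table scan: cost grows with the number of rows.
    let slow: usize = connection.query_row("select count(*) from train", [], |row| row.get(0))?;

    // Single seek on the integer primary key.
    let fast: usize =
        connection.query_row("select coalesce(max(row_id), 0) from train", [], |row| row.get(0))?;

    assert_eq!(slow, fast); // equal only under the no-gaps assumption
    Ok(())
}

If rows were ever deleted, max(row_id) would overcount, which is exactly why the comment in the diff calls out the no-gaps assumption.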