Update heed to the latest version (0.20.1)

This commit is contained in:
Tamo 2024-05-16 16:10:55 +02:00
parent 7e251b43d4
commit c9ac7f2e7e
13 changed files with 43 additions and 42 deletions

26
Cargo.lock generated
View File

@ -378,9 +378,9 @@ dependencies = [
[[package]] [[package]]
name = "arroy" name = "arroy"
version = "0.2.0" version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "efddeb1e7c32a551cc07ef4c3e181e3cd5478fdaf4f0bd799983171c1f6efe57" checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9"
dependencies = [ dependencies = [
"bytemuck", "bytemuck",
"byteorder", "byteorder",
@ -1536,9 +1536,9 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10"
[[package]] [[package]]
name = "doxygen-rs" name = "doxygen-rs"
version = "0.2.2" version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bff670ea0c9bbb8414e7efa6e23ebde2b8f520a7eef78273a3918cf1903e7505" checksum = "415b6ec780d34dcf624666747194393603d0373b7141eef01d12ee58881507d9"
dependencies = [ dependencies = [
"phf", "phf",
] ]
@ -2262,12 +2262,11 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]] [[package]]
name = "heed" name = "heed"
version = "0.20.0-alpha.9" version = "0.20.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9648a50991c86df7d00c56c268c27754fcf4c80be2ba57fc4a00dc928c6fe934" checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd"
dependencies = [ dependencies = [
"bitflags 2.5.0", "bitflags 2.5.0",
"bytemuck",
"byteorder", "byteorder",
"heed-traits", "heed-traits",
"heed-types", "heed-types",
@ -2281,15 +2280,15 @@ dependencies = [
[[package]] [[package]]
name = "heed-traits" name = "heed-traits"
version = "0.20.0-alpha.9" version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ab0b7d9cde969ad36dde692e487dc89d97f7168bf6a7bd3b894ad4bf7278298" checksum = "eb3130048d404c57ce5a1ac61a903696e8fcde7e8c2991e9fcfc1f27c3ef74ff"
[[package]] [[package]]
name = "heed-types" name = "heed-types"
version = "0.20.0-alpha.9" version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0cb3567a7363f28b597bf6e9897b9466397951dd0e52df2c8196dd8a71af44a" checksum = "3cb0d6ba3700c9a57e83c013693e3eddb68a6d9b6781cacafc62a0d992e8ddb3"
dependencies = [ dependencies = [
"bincode", "bincode",
"byteorder", "byteorder",
@ -3189,14 +3188,13 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da"
[[package]] [[package]]
name = "lmdb-master-sys" name = "lmdb-master-sys"
version = "0.1.0" version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "629c123f5321b48fa4f8f4d3b868165b748d9ba79c7103fb58e3a94f736bcedd" checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a"
dependencies = [ dependencies = [
"cc", "cc",
"doxygen-rs", "doxygen-rs",
"libc", "libc",
"pkg-config",
] ]
[[package]] [[package]]

View File

@ -785,10 +785,12 @@ impl IndexScheduler {
let dst = temp_snapshot_dir.path().join("auth"); let dst = temp_snapshot_dir.path().join("auth");
fs::create_dir_all(&dst)?; fs::create_dir_all(&dst)?;
// TODO We can't use the open_auth_store_env function here but we should // TODO We can't use the open_auth_store_env function here but we should
let auth = milli::heed::EnvOpenOptions::new() let auth = unsafe {
.map_size(1024 * 1024 * 1024) // 1 GiB milli::heed::EnvOpenOptions::new()
.max_dbs(2) .map_size(1024 * 1024 * 1024) // 1 GiB
.open(&self.auth_path)?; .max_dbs(2)
.open(&self.auth_path)
}?;
auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?; auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
// 5. Copy and tarball the flat snapshot // 5. Copy and tarball the flat snapshot

View File

@ -453,10 +453,12 @@ impl IndexScheduler {
) )
}; };
let env = heed::EnvOpenOptions::new() let env = unsafe {
.max_dbs(11) heed::EnvOpenOptions::new()
.map_size(budget.task_db_size) .max_dbs(11)
.open(options.tasks_path)?; .map_size(budget.task_db_size)
.open(options.tasks_path)
}?;
let features = features::FeatureData::new(&env, options.instance_features)?; let features = features::FeatureData::new(&env, options.instance_features)?;
@ -585,9 +587,9 @@ impl IndexScheduler {
} }
fn is_good_heed(tasks_path: &Path, map_size: usize) -> bool { fn is_good_heed(tasks_path: &Path, map_size: usize) -> bool {
if let Ok(env) = if let Ok(env) = unsafe {
heed::EnvOpenOptions::new().map_size(clamp_to_page_size(map_size)).open(tasks_path) heed::EnvOpenOptions::new().map_size(clamp_to_page_size(map_size)).open(tasks_path)
{ } {
env.prepare_for_closing().wait(); env.prepare_for_closing().wait();
true true
} else { } else {

View File

@ -49,7 +49,7 @@ pub fn open_auth_store_env(path: &Path) -> milli::heed::Result<milli::heed::Env>
let mut options = EnvOpenOptions::new(); let mut options = EnvOpenOptions::new();
options.map_size(AUTH_STORE_SIZE); // 1GB options.map_size(AUTH_STORE_SIZE); // 1GB
options.max_dbs(2); options.max_dbs(2);
options.open(path) unsafe { options.open(path) }
} }
impl HeedAuthStore { impl HeedAuthStore {

View File

@ -423,7 +423,6 @@ impl ErrorCode for HeedError {
HeedError::Mdb(_) HeedError::Mdb(_)
| HeedError::Encoding(_) | HeedError::Encoding(_)
| HeedError::Decoding(_) | HeedError::Decoding(_)
| HeedError::InvalidDatabaseTyping
| HeedError::DatabaseClosing | HeedError::DatabaseClosing
| HeedError::BadOpenOptions { .. } => Code::Internal, | HeedError::BadOpenOptions { .. } => Code::Internal,
} }

View File

@ -80,9 +80,7 @@ fn main() -> anyhow::Result<()> {
/// Clears the task queue located at `db_path`. /// Clears the task queue located at `db_path`.
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
let path = db_path.join("tasks"); let path = db_path.join("tasks");
let env = EnvOpenOptions::new() let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&path) }
.max_dbs(100)
.open(&path)
.with_context(|| format!("While trying to open {:?}", path.display()))?; .with_context(|| format!("While trying to open {:?}", path.display()))?;
eprintln!("Deleting tasks from the database..."); eprintln!("Deleting tasks from the database...");
@ -193,9 +191,7 @@ fn export_a_dump(
FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?; FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?;
let index_scheduler_path = db_path.join("tasks"); let index_scheduler_path = db_path.join("tasks");
let env = EnvOpenOptions::new() let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) }
.max_dbs(100)
.open(&index_scheduler_path)
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
eprintln!("Dumping the keys..."); eprintln!("Dumping the keys...");

View File

@ -30,7 +30,7 @@ grenad = { version = "0.4.6", default-features = false, features = [
"rayon", "rayon",
"tempfile", "tempfile",
] } ] }
heed = { version = "0.20.0-alpha.9", default-features = false, features = [ heed = { version = "0.20.1", default-features = false, features = [
"serde-json", "serde-json",
"serde-bincode", "serde-bincode",
"read-txn-no-tls", "read-txn-no-tls",
@ -82,7 +82,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls",
] } ] }
tiktoken-rs = "0.5.8" tiktoken-rs = "0.5.8"
liquid = "0.26.4" liquid = "0.26.4"
arroy = "0.2.0" arroy = "0.3.1"
rand = "0.8.5" rand = "0.8.5"
tracing = "0.1.40" tracing = "0.1.40"
ureq = { version = "2.9.7", features = ["json"] } ureq = { version = "2.9.7", features = ["json"] }

3
milli/fuzz/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
target
corpus
artifacts

View File

@ -48,8 +48,6 @@ pub enum InternalError {
GrenadInvalidFormatVersion, GrenadInvalidFormatVersion,
#[error("Invalid merge while processing {process}")] #[error("Invalid merge while processing {process}")]
IndexingMergingKeys { process: &'static str }, IndexingMergingKeys { process: &'static str },
#[error("{}", HeedError::InvalidDatabaseTyping)]
InvalidDatabaseTyping,
#[error(transparent)] #[error(transparent)]
RayonThreadPool(#[from] ThreadPoolBuildError), RayonThreadPool(#[from] ThreadPoolBuildError),
#[error(transparent)] #[error(transparent)]
@ -429,7 +427,6 @@ impl From<HeedError> for Error {
// TODO use the encoding // TODO use the encoding
HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })), HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })),
HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })), HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })),
HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
HeedError::DatabaseClosing => InternalError(DatabaseClosing), HeedError::DatabaseClosing => InternalError(DatabaseClosing),
HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions), HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions),
} }

View File

@ -184,7 +184,7 @@ impl Index {
options.max_dbs(25); options.max_dbs(25);
let env = options.open(path)?; let env = unsafe { options.open(path) }?;
let mut wtxn = env.write_txn()?; let mut wtxn = env.write_txn()?;
let main = env.database_options().name(MAIN).create(&mut wtxn)?; let main = env.database_options().name(MAIN).create(&mut wtxn)?;
let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?; let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?;
@ -294,6 +294,11 @@ impl Index {
self.env.read_txn() self.env.read_txn()
} }
/// Create a static read transaction to be able to read the index without keeping a reference to it.
pub fn static_read_txn(&self) -> heed::Result<RoTxn<'static>> {
self.env.clone().static_read_txn()
}
/// Returns the canonicalized path where the heed `Env` of this `Index` lives. /// Returns the canonicalized path where the heed `Env` of this `Index` lives.
pub fn path(&self) -> &Path { pub fn path(&self) -> &Path {
self.env.path() self.env.path()

View File

@ -379,7 +379,7 @@ pub(crate) mod test_helpers {
let mut options = heed::EnvOpenOptions::new(); let mut options = heed::EnvOpenOptions::new();
let options = options.map_size(4096 * 4 * 1000 * 100); let options = options.map_size(4096 * 4 * 1000 * 100);
let tempdir = tempfile::TempDir::new().unwrap(); let tempdir = tempfile::TempDir::new().unwrap();
let env = options.open(tempdir.path()).unwrap(); let env = unsafe { options.open(tempdir.path()) }.unwrap();
let mut wtxn = env.write_txn().unwrap(); let mut wtxn = env.write_txn().unwrap();
let content = env.create_database(&mut wtxn, None).unwrap(); let content = env.create_database(&mut wtxn, None).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();

View File

@ -556,7 +556,7 @@ where
let writer_index = (embedder_index as u16) << 8; let writer_index = (embedder_index as u16) << 8;
for k in 0..=u8::MAX { for k in 0..=u8::MAX {
let writer = let writer =
arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension)?; arroy::Writer::new(vector_arroy, writer_index | (k as u16), dimension);
if writer.is_empty(wtxn)? { if writer.is_empty(wtxn)? {
break; break;
} }

View File

@ -661,7 +661,7 @@ pub(crate) fn write_typed_chunk_into_index(
)?; )?;
let writer_index = (embedder_index as u16) << 8; let writer_index = (embedder_index as u16) << 8;
// FIXME: allow customizing distance // FIXME: allow customizing distance
let writers: std::result::Result<Vec<_>, _> = (0..=u8::MAX) let writers: Vec<_> = (0..=u8::MAX)
.map(|k| { .map(|k| {
arroy::Writer::new( arroy::Writer::new(
index.vector_arroy, index.vector_arroy,
@ -670,7 +670,6 @@ pub(crate) fn write_typed_chunk_into_index(
) )
}) })
.collect(); .collect();
let writers = writers?;
// remove vectors for docids we want them removed // remove vectors for docids we want them removed
let merger = remove_vectors_builder.build(); let merger = remove_vectors_builder.build();