Remove the vectors from the documents database

This commit is contained in:
Tamo 2024-05-22 15:27:09 +02:00
parent 7a84697570
commit 84e498299b
14 changed files with 407 additions and 51 deletions

2
Cargo.lock generated
View File

@ -2455,6 +2455,7 @@ name = "index-scheduler"
version = "1.9.0"
dependencies = [
"anyhow",
"arroy",
"big_s",
"bincode",
"crossbeam",
@ -2465,6 +2466,7 @@ dependencies = [
"file-store",
"flate2",
"insta",
"maplit",
"meili-snap",
"meilisearch-auth",
"meilisearch-types",

View File

@ -40,7 +40,9 @@ ureq = "2.9.7"
uuid = { version = "1.6.1", features = ["serde", "v4"] }
[dev-dependencies]
arroy = "0.3.1"
big_s = "1.0.2"
crossbeam = "0.8.4"
insta = { version = "1.34.0", features = ["json", "redactions"] }
maplit = "1.0.2"
meili-snap = { path = "../meili-snap" }

View File

@ -1459,11 +1459,11 @@ impl IndexScheduler {
// TODO: consider using a type alias or a struct embedder/template
pub fn embedders(
&self,
embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>,
embedding_configs: Vec<(String, milli::vector::EmbeddingConfig, RoaringBitmap)>,
) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs
.into_iter()
.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| {
.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt }, _)| {
let prompt =
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
// optimistically return existing embedder
@ -1748,6 +1748,9 @@ mod tests {
use meilisearch_types::milli::update::IndexDocumentsMethod::{
ReplaceDocuments, UpdateDocuments,
};
use meilisearch_types::milli::update::Setting;
use meilisearch_types::milli::vector::settings::EmbeddingSettings;
use meilisearch_types::settings::{Checked, Unchecked};
use meilisearch_types::tasks::IndexSwap;
use meilisearch_types::VERSION_FILE_NAME;
use tempfile::{NamedTempFile, TempDir};
@ -3052,7 +3055,9 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let configs = index.embedding_configs(&rtxn).unwrap();
let (_, embedding_config) = configs.first().unwrap();
let (name, embedding_config, user_provided) = configs.first().unwrap();
insta::assert_snapshot!(name, @"default");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
insta::assert_json_snapshot!(embedding_config.embedder_options);
}
@ -5017,13 +5022,15 @@ mod tests {
let configs = index.embedding_configs(&rtxn).unwrap();
// for consistency with the below
#[allow(clippy::get_first)]
let (name, fakerest_config) = configs.get(0).unwrap();
insta::assert_json_snapshot!(name, @r###""A_fakerest""###);
let (name, fakerest_config, user_provided) = configs.get(0).unwrap();
insta::assert_snapshot!(name, @"A_fakerest");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
insta::assert_json_snapshot!(fakerest_config.embedder_options);
let fakerest_name = name.clone();
let (name, simple_hf_config) = configs.get(1).unwrap();
insta::assert_json_snapshot!(name, @r###""B_small_hf""###);
let (name, simple_hf_config, user_provided) = configs.get(1).unwrap();
insta::assert_snapshot!(name, @"B_small_hf");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
insta::assert_json_snapshot!(simple_hf_config.embedder_options);
let simple_hf_name = name.clone();
@ -5091,6 +5098,18 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
// Ensure the document has been inserted into the relevant bitmap
let configs = index.embedding_configs(&rtxn).unwrap();
// for consistency with the below
#[allow(clippy::get_first)]
let (name, _config, user_defined) = configs.get(0).unwrap();
insta::assert_snapshot!(name, @"A_fakerest");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
let (name, _config, user_defined) = configs.get(1).unwrap();
insta::assert_snapshot!(name, @"B_small_hf");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>");
let embeddings = index.embeddings(&rtxn, 0).unwrap();
assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true");
@ -5153,6 +5172,18 @@ mod tests {
let index = index_scheduler.index("doggos").unwrap();
let rtxn = index.read_txn().unwrap();
// Ensure the document has been inserted into the relevant bitmap
let configs = index.embedding_configs(&rtxn).unwrap();
// for consistency with the below
#[allow(clippy::get_first)]
let (name, _config, user_defined) = configs.get(0).unwrap();
insta::assert_snapshot!(name, @"A_fakerest");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
let (name, _config, user_defined) = configs.get(1).unwrap();
insta::assert_snapshot!(name, @"B_small_hf");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>");
let embeddings = index.embeddings(&rtxn, 0).unwrap();
// automatically changed to patou
@ -5176,4 +5207,246 @@ mod tests {
}
}
}
/// Pushes documents carrying `_vectors` *before* any embedder is configured, then
/// configures the `my_doggo_embedder` embedder and finally updates two documents.
///
/// The test checks that:
/// 1. once the embedder exists, its vectors are removed from the documents database while
///    vectors stored under unknown embedder names stay in the documents;
/// 2. only the documents whose vectors count as user provided (ids 1 and 2) end up in the
///    bitmap stored next to the embedder configuration;
/// 3. a vector explicitly flagged `userProvided: false` (id 3) is kept as-is until the
///    document is updated, after which it must be replaced.
#[test]
fn import_vectors_first_and_embedder_later() {
    let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);

    let content = serde_json::json!(
        [
            {
                "id": 0,
                "doggo": "kefir",
            },
            {
                "id": 1,
                "doggo": "intel",
                "_vectors": {
                    "my_doggo_embedder": vec![1; 384],
                    "unknown embedder": vec![1, 2, 3],
                }
            },
            {
                "id": 2,
                "doggo": "max",
                "_vectors": {
                    "my_doggo_embedder": {
                        "userProvided": true,
                        "embeddings": vec![2; 384],
                    },
                    "unknown embedder": vec![4, 5],
                },
            },
            {
                "id": 3,
                "doggo": "marcel",
                "_vectors": {
                    "my_doggo_embedder": {
                        "userProvided": false,
                        "embeddings": vec![3; 384],
                    },
                },
            },
            {
                "id": 4,
                "doggo": "sora",
                "_vectors": {
                    "my_doggo_embedder": {
                        "userProvided": false,
                    },
                },
            },
        ]
    );

    // First batch: push the documents while no embedder is configured yet.
    let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap();
    let documents_count =
        read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
            .unwrap();
    snapshot!(documents_count, @"5");
    file.persist().unwrap();
    index_scheduler
        .register(
            KindWithContent::DocumentAdditionOrUpdate {
                index_uid: S("doggos"),
                primary_key: None,
                method: ReplaceDocuments,
                content_file: uuid,
                documents_count,
                allow_index_creation: true,
            },
            None,
            false,
        )
        .unwrap();
    index_scheduler.assert_internally_consistent();
    handle.advance_one_successful_batch();
    index_scheduler.assert_internally_consistent();

    let index = index_scheduler.index("doggos").unwrap();
    let rtxn = index.read_txn().unwrap();
    let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let field_ids = field_ids_map.ids().collect::<Vec<_>>();
    let documents = index
        .all_documents(&rtxn)
        .unwrap()
        .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
        .collect::<Vec<_>>();
    snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push");

    // Second batch: declare the embedder the documents were already referring to.
    let mut setting = meilisearch_types::settings::Settings::<Unchecked>::default();
    setting.embedders = Setting::Set(maplit::btreemap! {
        S("my_doggo_embedder") => Setting::Set(EmbeddingSettings {
            source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace),
            model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")),
            revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")),
            document_template: Setting::Set(S("{{doc.doggo}}")),
            .. EmbeddingSettings::default()
        })
    });
    index_scheduler
        .register(
            KindWithContent::SettingsUpdate {
                index_uid: S("doggos"),
                new_settings: Box::new(setting),
                is_deletion: false,
                allow_index_creation: false,
            },
            None,
            false,
        )
        .unwrap();
    index_scheduler.assert_internally_consistent();
    handle.advance_one_successful_batch();
    index_scheduler.assert_internally_consistent();

    // The previous read transaction predates the settings batch: reopen everything.
    let index = index_scheduler.index("doggos").unwrap();
    let rtxn = index.read_txn().unwrap();
    let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
    let field_ids = field_ids_map.ids().collect::<Vec<_>>();
    let documents = index
        .all_documents(&rtxn)
        .unwrap()
        .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
        .collect::<Vec<_>>();
    // All the vectors linked to the newly specified embedder have been removed.
    // Only the vectors of the unknown embedders stay in the documents DB.
    snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###);

    let conf = index.embedding_configs(&rtxn).unwrap();
    // Even though we specified the vector for the ID 3, it shouldn't be marked
    // as user provided since we explicitly marked it as NOT user provided.
    snapshot!(format!("{conf:#?}"), @r###"
    [
        (
            "my_doggo_embedder",
            EmbeddingConfig {
                embedder_options: HuggingFace(
                    EmbedderOptions {
                        model: "sentence-transformers/all-MiniLM-L6-v2",
                        revision: Some(
                            "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
                        ),
                        distribution: None,
                    },
                ),
                prompt: PromptData {
                    template: "{{doc.doggo}}",
                },
            },
            RoaringBitmap<[1, 2]>,
        ),
    ]
    "###);

    // The document with the id 0 had no vector at all: one must have been generated.
    let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
    let embeddings = index.embeddings(&rtxn, docid).unwrap();
    let embedding = &embeddings["my_doggo_embedder"];
    assert!(!embedding.is_empty(), "{embedding:?}");

    // The document with the id 3 should keep its original embedding.
    let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
    let mut embeddings = Vec::new();
    // Walk the arroy indexes one by one, stopping at the first one that does not exist or
    // does not contain the document.
    // NOTE(review): presumably indexes 0..=255 are the ones reserved for the first embedder
    // (the original expression was `0 | (i as u16)`) — confirm against milli.
    'vectors: for i in 0..=u8::MAX {
        let reader = arroy::Reader::open(&rtxn, u16::from(i), index.vector_arroy)
            .map(Some)
            .or_else(|e| match e {
                // an index that was never written simply marks the end of the list
                arroy::Error::MissingMetadata => Ok(None),
                e => Err(e),
            })
            .transpose();

        let Some(reader) = reader else {
            break 'vectors;
        };

        let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap();
        if let Some(embedding) = embedding {
            embeddings.push(embedding)
        } else {
            break 'vectors;
        }
    }

    snapshot!(embeddings.len(), @"1");
    assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]);

    // Third batch: if we update marcel it should regenerate its embedding automatically.
    let content = serde_json::json!(
        [
            {
                "id": 3,
                "doggo": "marvel",
            },
            {
                "id": 4,
                "doggo": "sorry",
            },
        ]
    );

    let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1_u128).unwrap();
    let documents_count =
        read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
            .unwrap();
    snapshot!(documents_count, @"2");
    file.persist().unwrap();
    index_scheduler
        .register(
            KindWithContent::DocumentAdditionOrUpdate {
                index_uid: S("doggos"),
                primary_key: None,
                method: UpdateDocuments,
                content_file: uuid,
                documents_count,
                allow_index_creation: true,
            },
            None,
            false,
        )
        .unwrap();
    index_scheduler.assert_internally_consistent();
    handle.advance_one_successful_batch();
    index_scheduler.assert_internally_consistent();

    // A read transaction only sees the state of the index at the time it was opened, so the
    // one opened before this batch would still expose the old embedding. Open a fresh one.
    let rtxn = index.read_txn().unwrap();

    // The document with the id 3 should have its original embedding replaced.
    let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
    let embeddings = index.embeddings(&rtxn, docid).unwrap();
    let embedding = &embeddings["my_doggo_embedder"];
    assert!(!embedding.is_empty());
    assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]);

    // TODO: the document with the id 4 should also get a generated embedding: look up its
    // docid, read `index.embeddings` and assert that `my_doggo_embedder` is not empty.
}
}

View File

@ -0,0 +1,4 @@
---
source: index-scheduler/src/lib.rs
---
[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]

View File

@ -672,7 +672,7 @@ pub fn settings(
let embedders: BTreeMap<_, _> = index
.embedding_configs(rtxn)?
.into_iter()
.map(|(name, config)| (name, Setting::Set(config.into())))
.map(|(name, config, _)| (name, Setting::Set(config.into())))
.collect();
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };

View File

@ -44,7 +44,7 @@ once_cell = "1.19.0"
ordered-float = "4.2.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.8.0"
roaring = "0.10.2"
roaring = { version = "0.10.2", features = ["serde"] }
rstar = { version = "0.11.0", features = ["serde"] }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = { version = "1.0.111", features = ["preserve_order"] }

View File

@ -1572,16 +1572,18 @@ impl Index {
Ok(script_language)
}
/// Put the embedding configs:
/// 1. The name of the embedder
/// 2. The configuration option for this embedder
/// 3. The list of documents with a user provided embedding
pub(crate) fn put_embedding_configs(
&self,
wtxn: &mut RwTxn<'_>,
configs: Vec<(String, EmbeddingConfig)>,
configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>,
) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>().put(
wtxn,
main_key::EMBEDDING_CONFIGS,
&configs,
)
self.main
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig, RoaringBitmap)>>>()
.put(wtxn, main_key::EMBEDDING_CONFIGS, &configs)
}
pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
@ -1591,10 +1593,10 @@ impl Index {
pub fn embedding_configs(
&self,
rtxn: &RoTxn<'_>,
) -> Result<Vec<(String, crate::vector::EmbeddingConfig)>> {
) -> Result<Vec<(String, EmbeddingConfig, RoaringBitmap)>> {
Ok(self
.main
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>()
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig, RoaringBitmap)>>>()
.get(rtxn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default())
}

View File

@ -10,16 +10,16 @@ use bytemuck::cast_slice;
use grenad::Writer;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::Value;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
use crate::vector::Embedder;
use crate::{DocumentId, Result, ThreadPoolNoAbort};
use crate::{try_split_array_at, DocumentId, Result, ThreadPoolNoAbort};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@ -35,6 +35,8 @@ pub struct ExtractedVectorPoints {
// embedder
pub embedder_name: String,
pub embedder: Arc<Embedder>,
pub user_defined: RoaringBitmap,
pub remove_from_user_defined: RoaringBitmap,
}
enum VectorStateDelta {
@ -80,6 +82,11 @@ struct EmbedderVectorExtractor {
prompts_writer: Writer<BufWriter<File>>,
// (docid) -> ()
remove_vectors_writer: Writer<BufWriter<File>>,
// The docids of the documents that contain a user defined embedding
user_defined: RoaringBitmap,
// The docids of the documents that contain an auto-generated embedding
remove_from_user_defined: RoaringBitmap,
}
/// Extracts the embedding vector contained in each document under the `_vectors` field.
@ -134,6 +141,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
user_defined: RoaringBitmap::new(),
remove_from_user_defined: RoaringBitmap::new(),
});
}
@ -141,13 +150,15 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
// this must always be serialized as (docid, external_docid);
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
let (docid_bytes, external_id_bytes) =
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
debug_assert!(from_utf8(external_id_bytes).is_ok());
let docid = DocumentId::from_be_bytes(docid_bytes);
let obkv = obkv::KvReader::new(value);
key_buffer.clear();
key_buffer.extend_from_slice(docid_bytes);
key_buffer.extend_from_slice(docid_bytes.as_slice());
// since we only need the primary key when we throw an error we create this getter to
// lazily get it when needed
@ -163,10 +174,22 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
user_defined,
remove_from_user_defined,
} in extractors.iter_mut()
{
let delta = match parsed_vectors.remove(embedder_name) {
(Some(old), Some(new)) => {
match (old.is_user_provided(), new.is_user_provided()) {
(true, true) | (false, false) => (),
(true, false) => {
remove_from_user_defined.insert(docid);
}
(false, true) => {
user_defined.insert(docid);
}
}
// no autogeneration
let del_vectors = old.into_array_of_vectors();
let add_vectors = new.into_array_of_vectors();
@ -187,6 +210,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
remove_from_user_defined.insert(docid);
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
@ -198,6 +222,11 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
}
}
(None, Some(new)) => {
if new.is_user_provided() {
user_defined.insert(docid);
} else {
remove_from_user_defined.insert(docid);
}
// was possibly autogenerated, remove all vectors for that document
let add_vectors = new.into_array_of_vectors();
if add_vectors.len() > usize::from(u8::MAX) {
@ -239,6 +268,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
VectorStateDelta::NoChange
}
} else {
remove_from_user_defined.remove(docid);
VectorStateDelta::NowRemoved
}
}
@ -265,18 +295,18 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
user_defined,
remove_from_user_defined,
} in extractors
{
results.push(ExtractedVectorPoints {
// docid, _index -> KvWriterDelAdd -> Vector
manual_vectors: writer_into_reader(manual_vectors_writer)?,
// docid -> ()
remove_vectors: writer_into_reader(remove_vectors_writer)?,
// docid -> prompt
prompts: writer_into_reader(prompts_writer)?,
embedder,
embedder_name,
user_defined,
remove_from_user_defined,
})
}

View File

@ -238,6 +238,8 @@ fn send_original_documents_data(
prompts,
embedder_name,
embedder,
user_defined,
remove_from_user_defined: auto_generated,
} in extracted_vectors
{
let embeddings = match extract_embeddings(
@ -262,6 +264,8 @@ fn send_original_documents_data(
expected_dimension: embedder.dimensions(),
manual_vectors,
embedder_name,
user_defined,
remove_from_user_defined: auto_generated,
}));
}
}

View File

@ -501,6 +501,8 @@ where
embeddings,
manual_vectors,
embedder_name,
user_defined,
remove_from_user_defined,
} => {
dimension.insert(embedder_name.clone(), expected_dimension);
TypedChunk::VectorPoints {
@ -509,6 +511,8 @@ where
expected_dimension,
manual_vectors,
embedder_name,
user_defined,
remove_from_user_defined,
}
}
otherwise => otherwise,
@ -2616,10 +2620,11 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
let (embedder_name, embedder) = embedding_configs.pop().unwrap();
let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap();
insta::assert_snapshot!(embedder_name, @"manual");
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>");
let embedder =
std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap());
assert_eq!("manual", embedder_name);
let res = index
.search(&rtxn)
.semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec()))

View File

@ -90,6 +90,8 @@ pub(crate) enum TypedChunk {
expected_dimension: usize,
manual_vectors: grenad::Reader<BufReader<File>>,
embedder_name: String,
user_defined: RoaringBitmap,
remove_from_user_defined: RoaringBitmap,
},
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
}
@ -155,7 +157,7 @@ pub(crate) fn write_typed_chunk_into_index(
let mut iter = merger.into_stream_merger_iter()?;
let embedders: BTreeSet<_> =
index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect();
index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect();
let mut vectors_buffer = Vec::new();
while let Some((key, reader)) = iter.next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@ -181,7 +183,7 @@ pub(crate) fn write_typed_chunk_into_index(
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
break 'vectors Some(addition);
};
vectors.retain_user_provided_vectors(&embedders);
vectors.retain_not_embedded_vectors(&embedders);
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
if vectors.is_empty() {
// skip writing empty `_vectors` map
@ -619,6 +621,8 @@ pub(crate) fn write_typed_chunk_into_index(
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
let mut user_defined = RoaringBitmap::new();
let mut remove_from_user_defined = RoaringBitmap::new();
let mut params = None;
for typed_chunk in typed_chunks {
let TypedChunk::VectorPoints {
@ -627,6 +631,8 @@ pub(crate) fn write_typed_chunk_into_index(
embeddings,
expected_dimension,
embedder_name,
user_defined: ud,
remove_from_user_defined: rud,
} = typed_chunk
else {
unreachable!();
@ -639,11 +645,21 @@ pub(crate) fn write_typed_chunk_into_index(
if let Some(embeddings) = embeddings {
embeddings_builder.push(embeddings.into_cursor()?);
}
user_defined |= ud;
remove_from_user_defined |= rud;
}
// typed chunks has always at least 1 chunk.
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
let mut embedding_configs = index.embedding_configs(&wtxn)?;
let (_name, _conf, ud) =
embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap();
*ud -= remove_from_user_defined;
*ud |= user_defined;
index.put_embedding_configs(wtxn, embedding_configs)?;
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
)?;

View File

@ -6,6 +6,7 @@ use std::sync::Arc;
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
use deserr::{DeserializeError, Deserr};
use itertools::{EitherOrBoth, Itertools};
use roaring::RoaringBitmap;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime;
@ -926,8 +927,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Setting::Set(configs) => {
let mut changed = false;
let old_configs = self.index.embedding_configs(self.wtxn)?;
let old_configs: BTreeMap<String, Setting<EmbeddingSettings>> =
old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect();
let old_configs: BTreeMap<String, (Setting<EmbeddingSettings>, RoaringBitmap)> =
old_configs
.into_iter()
.map(|(name, setting, user_defined)| {
(name, (Setting::Set(setting.into()), user_defined))
})
.collect();
let mut new_configs = BTreeMap::new();
for joined in old_configs
@ -936,15 +942,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
{
match joined {
// updated config
EitherOrBoth::Both((name, mut old), (_, new)) => {
EitherOrBoth::Both((name, (mut old, user_defined)), (_, new)) => {
changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new);
if changed {
tracing::debug!(embedder = name, "need reindex");
tracing::debug!(
embedder = name,
documents = user_defined.len(),
"need reindex"
);
} else {
tracing::debug!(embedder = name, "skip reindex");
}
let new = validate_embedding_settings(old, &name)?;
new_configs.insert(name, new);
new_configs.insert(name, (new, user_defined));
}
// unchanged config
EitherOrBoth::Left((name, setting)) => {
@ -961,21 +971,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
);
let setting = validate_embedding_settings(setting, &name)?;
changed = true;
new_configs.insert(name, setting);
new_configs.insert(name, (setting, RoaringBitmap::new()));
}
}
}
let new_configs: Vec<(String, EmbeddingConfig)> = new_configs
let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs
.into_iter()
.filter_map(|(name, setting)| match setting {
Setting::Set(value) => Some((name, value.into())),
.filter_map(|(name, (setting, user_defined))| match setting {
Setting::Set(settings) => Some((name, settings.into(), user_defined)),
Setting::Reset => None,
Setting::NotSet => Some((name, EmbeddingSettings::default().into())),
Setting::NotSet => {
Some((name, EmbeddingSettings::default().into(), user_defined))
}
})
.collect();
self.index.embedder_category_id.clear(self.wtxn)?;
for (index, (embedder_name, _)) in new_configs.iter().enumerate() {
for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() {
self.index.embedder_category_id.put_with_flags(
self.wtxn,
heed::PutFlags::APPEND,
@ -1359,10 +1371,12 @@ impl InnerIndexSettings {
}
}
fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> {
fn embedders(
embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>,
) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs
.into_iter()
.map(|(name, EmbeddingConfig { embedder_options, prompt })| {
.map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| {
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
let embedder = Arc::new(

View File

@ -17,6 +17,13 @@ pub enum Vectors {
}
impl Vectors {
pub fn is_user_provided(&self) -> bool {
match self {
Vectors::ImplicitlyUserProvided(_) => true,
Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided,
}
}
pub fn into_array_of_vectors(self) -> Vec<Embedding> {
match self {
Vectors::ImplicitlyUserProvided(embeddings)
@ -89,15 +96,8 @@ impl ParsedVectors {
Ok(ParsedVectors(value))
}
pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet<String>) {
self.0.retain(|k, v| match v {
Vectors::ImplicitlyUserProvided(_) => true,
Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => {
*user_provided
// if the embedder is not in the config, then never touch it
|| !embedders.contains(k)
}
});
pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet<String>) {
self.0.retain(|k, _v| !embedders.contains(k))
}
}