|
|
|
@ -1,14 +1,19 @@
|
|
|
|
|
use std::cell::RefCell;
|
|
|
|
|
use std::marker::PhantomData;
|
|
|
|
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
|
|
|
use std::num::NonZeroU16;
|
|
|
|
|
use std::{mem, slice};
|
|
|
|
|
|
|
|
|
|
use bbqueue::framed::{FrameGrantR, FrameProducer};
|
|
|
|
|
use bytemuck::{NoUninit, CheckedBitPattern};
|
|
|
|
|
use crossbeam::sync::{Parker, Unparker};
|
|
|
|
|
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
|
|
|
|
|
use crossbeam_channel::{IntoIter, Receiver, SendError};
|
|
|
|
|
use heed::types::Bytes;
|
|
|
|
|
use heed::BytesDecode;
|
|
|
|
|
use memmap2::Mmap;
|
|
|
|
|
use roaring::RoaringBitmap;
|
|
|
|
|
|
|
|
|
|
use super::extract::FacetKind;
|
|
|
|
|
use super::ref_cell_ext::RefCellExt;
|
|
|
|
|
use super::thread_local::{FullySend, ThreadLocal};
|
|
|
|
|
use super::StdResult;
|
|
|
|
|
use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec};
|
|
|
|
@ -16,7 +21,7 @@ use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY};
|
|
|
|
|
use crate::index::{db_name, IndexEmbeddingConfig};
|
|
|
|
|
use crate::update::new::KvReaderFieldId;
|
|
|
|
|
use crate::vector::Embedding;
|
|
|
|
|
use crate::{DocumentId, Index};
|
|
|
|
|
use crate::{CboRoaringBitmapCodec, DocumentId, Index};
|
|
|
|
|
|
|
|
|
|
/// Creates a tuple of producer/receivers to be used by
|
|
|
|
|
/// the extractors and the writer loop.
|
|
|
|
@ -26,125 +31,97 @@ use crate::{DocumentId, Index};
|
|
|
|
|
/// Panics if the number of provided bbqueue is not exactly equal
|
|
|
|
|
/// to the number of available threads in the rayon threadpool.
|
|
|
|
|
pub fn extractor_writer_bbqueue(
|
|
|
|
|
bbqueue: &[bbqueue::BBBuffer],
|
|
|
|
|
bbbuffers: &[bbqueue::BBBuffer],
|
|
|
|
|
) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) {
|
|
|
|
|
assert_eq!(
|
|
|
|
|
bbqueue.len(),
|
|
|
|
|
bbbuffers.len(),
|
|
|
|
|
rayon::current_num_threads(),
|
|
|
|
|
"You must provide as many BBBuffer as the available number of threads to extract"
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
let capacity = bbbuffers.first().unwrap().capacity();
|
|
|
|
|
let parker = Parker::new();
|
|
|
|
|
let extractors = ThreadLocal::with_capacity(bbqueue.len());
|
|
|
|
|
let extractors = ThreadLocal::with_capacity(bbbuffers.len());
|
|
|
|
|
let producers = rayon::broadcast(|bi| {
|
|
|
|
|
let bbqueue = &bbqueue[bi.index()];
|
|
|
|
|
let bbqueue = &bbbuffers[bi.index()];
|
|
|
|
|
let (producer, consumer) = bbqueue.try_split_framed().unwrap();
|
|
|
|
|
extractors.get_or(|| FullySend(producer));
|
|
|
|
|
extractors.get_or(|| FullySend(RefCell::new(producer)));
|
|
|
|
|
consumer
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
(
|
|
|
|
|
ExtractorBbqueueSender { inner: extractors, unparker: parker.unparker().clone() },
|
|
|
|
|
ExtractorBbqueueSender {
|
|
|
|
|
inner: extractors,
|
|
|
|
|
capacity: capacity.checked_sub(9).unwrap(),
|
|
|
|
|
unparker: parker.unparker().clone(),
|
|
|
|
|
},
|
|
|
|
|
WriterBbqueueReceiver { inner: producers, parker },
|
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// NOTE(review): this appears to be merge residue — a second, extended definition
// of `ExtractorBbqueueSender` (with a `capacity` field) exists later in this
// file; one of the two must be removed before this compiles.
pub struct ExtractorBbqueueSender<'a> {
// One frame producer per extractor thread.
inner: ThreadLocal<FullySend<bbqueue::framed::FrameProducer<'a>>>,
/// Used to wake up the receiver thread,
/// Used everytime we write something in the producer.
unparker: Unparker,
}
|
|
|
|
|
|
|
|
|
|
/// The writer-loop side of the bbqueue channel: one frame consumer per
/// extractor thread, polled by the single writer thread.
pub struct WriterBbqueueReceiver<'a> {
inner: Vec<bbqueue::framed::FrameConsumer<'a>>,
/// Used to park when no more work is required
parker: Parker,
}
|
|
|
|
|
|
|
|
|
|
/// The capacity of the channel is currently in number of messages.
|
|
|
|
|
pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) {
|
|
|
|
|
let (sender, receiver) = crossbeam_channel::bounded(cap);
|
|
|
|
|
(
|
|
|
|
|
ExtractorSender {
|
|
|
|
|
sender,
|
|
|
|
|
send_count: Default::default(),
|
|
|
|
|
writer_contentious_count: Default::default(),
|
|
|
|
|
extractor_contentious_count: Default::default(),
|
|
|
|
|
},
|
|
|
|
|
WriterReceiver(receiver),
|
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// A key/value pair destined for a database, with two storage strategies.
pub enum KeyValueEntry {
/// Key and value packed into one contiguous buffer; `key_length` is the split point.
Small { key_length: usize, data: Box<[u8]> },
/// The value is a memory-mapped file, too large to copy inline.
Large { key_entry: KeyEntry, data: Mmap },
}
|
|
|
|
|
|
|
|
|
|
impl KeyValueEntry {
|
|
|
|
|
pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self {
|
|
|
|
|
let mut data = Vec::with_capacity(key.len() + value.len());
|
|
|
|
|
data.extend_from_slice(key);
|
|
|
|
|
data.extend_from_slice(value);
|
|
|
|
|
KeyValueEntry::Small { key_length: key.len(), data: data.into_boxed_slice() }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Wraps an owned copy of `key` together with a memory-mapped value.
fn from_large_key_value(key: &[u8], value: Mmap) -> Self {
KeyValueEntry::Large { key_entry: KeyEntry::from_key(key), data: value }
}
|
|
|
|
|
|
|
|
|
|
pub fn key(&self) -> &[u8] {
|
|
|
|
|
match self {
|
|
|
|
|
KeyValueEntry::Small { key_length, data } => &data[..*key_length],
|
|
|
|
|
KeyValueEntry::Large { key_entry, data: _ } => key_entry.entry(),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn value(&self) -> &[u8] {
|
|
|
|
|
match self {
|
|
|
|
|
KeyValueEntry::Small { key_length, data } => &data[*key_length..],
|
|
|
|
|
KeyValueEntry::Large { key_entry: _, data } => &data[..],
|
|
|
|
|
impl<'a> WriterBbqueueReceiver<'a> {
/// Polls each consumer once for the next available frame; returns None when
/// no consumer had anything ready.
///
/// NOTE(review): merge residue — the `if let Some() = consumer.read()` below is
/// syntactically incomplete (no binding pattern, no body). Presumably it should
/// bind the read frame, mark it auto-release, decode its `EntryHeader`, and
/// return a `FrameWithHeader` — confirm against the intended implementation.
pub fn read(&mut self) -> Option<FrameWithHeader<'a>> {
loop {
for consumer in &mut self.inner {
// mark the frame as auto release
if let Some() = consumer.read()
}
// No frame was available on any consumer.
break None;
}
}
}
|
|
|
|
|
|
|
|
|
|
pub struct KeyEntry {
|
|
|
|
|
data: Box<[u8]>,
|
|
|
|
|
struct FrameWithHeader<'a> {
|
|
|
|
|
header: EntryHeader,
|
|
|
|
|
frame: FrameGrantR<'a>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// NOTE(review): the closing braces for this method and impl are missing here
// (merge residue); `KeyEntry::entry` appears further down, interleaved into
// the `EntryHeader` impl.
impl KeyEntry {
/// Builds a `KeyEntry` by copying `key` into an owned boxed slice.
pub fn from_key(key: &[u8]) -> Self {
KeyEntry { data: key.to_vec().into_boxed_slice() }
|
|
|
|
|
/// The header written at the start of every bbqueue frame; the payload (key,
/// key/value pair, or embedding bytes) follows it in the same frame.
#[derive(Debug, Clone, Copy, CheckedBitPattern)]
#[repr(u8)]
enum EntryHeader {
/// Whether a put of the key/value pair or a delete of the given key.
DbOperation {
/// The database on which to perform the operation.
database: Database,
/// The key length in the buffer.
///
/// If None it means that the buffer is dedicated
/// to the key and it is therefore a deletion operation.
key_length: Option<NonZeroU16>,
},
/// Deletes the vectors associated with `docid` (mirrors `ArroyOperation::DeleteVectors`).
ArroyDeleteVector {
docid: DocumentId,
},
/// The embedding is the remaining space and represents a non-aligned [f32].
ArroySetVector {
docid: DocumentId,
embedder_id: u8,
},
}
|
|
|
|
|
|
|
|
|
|
impl EntryHeader {
|
|
|
|
|
fn delete_key_size(key_length: u16) -> usize {
|
|
|
|
|
mem::size_of::<Self>() + key_length as usize
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn entry(&self) -> &[u8] {
|
|
|
|
|
self.data.as_ref()
|
|
|
|
|
fn put_key_value_size(key_length: u16, value_length: usize) -> usize {
|
|
|
|
|
mem::size_of::<Self>() + key_length as usize + value_length
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn bytes_of(&self) -> &[u8] {
|
|
|
|
|
/// TODO do the variant matching ourselves
|
|
|
|
|
todo!()
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// A single operation to apply to one database entry.
pub enum EntryOperation {
/// Remove the entry at this key.
Delete(KeyEntry),
/// Insert or replace the entry with this key/value pair.
Write(KeyValueEntry),
}
|
|
|
|
|
|
|
|
|
|
/// A message sent from the extractors to the writer loop.
pub enum WriterOperation {
/// A put/delete against one of the LMDB databases.
DbOperation(DbOperation),
/// A vector-store (arroy) operation.
ArroyOperation(ArroyOperation),
}
|
|
|
|
|
|
|
|
|
|
/// Vector-store operations forwarded to the writer loop.
pub enum ArroyOperation {
/// Remove every vector stored for this document.
DeleteVectors { docid: DocumentId },
/// Replace all the vectors of one embedder for this document.
SetVectors { docid: DocumentId, embedder_id: u8, embeddings: Vec<Embedding> },
/// Replace a single vector of one embedder for this document.
SetVector { docid: DocumentId, embedder_id: u8, embedding: Embedding },
/// Signals that extraction is done; carries the final embedder configs.
Finish { configs: Vec<IndexEmbeddingConfig> },
}
|
|
|
|
|
|
|
|
|
|
/// An entry operation paired with the database it targets.
pub struct DbOperation {
database: Database,
entry: EntryOperation,
}
|
|
|
|
|
|
|
|
|
|
#[derive(Debug)]
|
|
|
|
|
#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
|
|
|
|
|
#[repr(u32)]
|
|
|
|
|
pub enum Database {
|
|
|
|
|
Main,
|
|
|
|
|
Documents,
|
|
|
|
@ -220,82 +197,46 @@ impl From<FacetKind> for Database {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl DbOperation {
|
|
|
|
|
/// Resolves the heed database handle this operation targets in `index`.
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
self.database.database(index)
}
|
|
|
|
|
|
|
|
|
|
/// Returns the static name of the targeted database (for logs/errors).
pub fn database_name(&self) -> &'static str {
self.database.database_name()
}
|
|
|
|
|
|
|
|
|
|
/// Consumes the operation and returns the inner entry.
pub fn entry(self) -> EntryOperation {
self.entry
}
|
|
|
|
|
/// The bbqueue-based sender used by the extractor threads; each rayon thread
/// owns one framed producer, stored in the thread-local `inner`.
pub struct ExtractorBbqueueSender<'a> {
inner: ThreadLocal<FullySend<RefCell<FrameProducer<'a>>>>,
/// The capacity of this frame producer, will never be able to store more than that.
///
/// Note that the FrameProducer requires up to 9 bytes to encode the length,
/// the capacity has been shrunk accordingly.
///
/// <https://docs.rs/bbqueue/latest/bbqueue/framed/index.html#frame-header>
capacity: usize,
/// Used to wake up the receiver thread,
/// Used everytime we write something in the producer.
unparker: Unparker,
}
|
|
|
|
|
|
|
|
|
|
/// Receiving half of the crossbeam writer channel, consumed by the writer loop.
pub struct WriterReceiver(Receiver<WriterOperation>);
|
|
|
|
|
|
|
|
|
|
// Lets the writer loop drain the channel with a plain `for` loop; iteration
// ends once all senders are dropped and the channel is empty.
impl IntoIterator for WriterReceiver {
type Item = WriterOperation;
type IntoIter = IntoIter<Self::Item>;

fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}
|
|
|
|
|
|
|
|
|
|
/// Sending half of the crossbeam writer channel, with contention counters
/// reported on drop.
pub struct ExtractorSender {
sender: Sender<WriterOperation>,
/// The number of message we sent in total in the channel.
send_count: AtomicUsize,
/// The number of times we sent something in a channel that was full.
writer_contentious_count: AtomicUsize,
/// The number of times we sent something in a channel that was empty.
extractor_contentious_count: AtomicUsize,
}
|
|
|
|
|
|
|
|
|
|
impl Drop for ExtractorSender {
|
|
|
|
|
fn drop(&mut self) {
|
|
|
|
|
let send_count = *self.send_count.get_mut();
|
|
|
|
|
let writer_contentious_count = *self.writer_contentious_count.get_mut();
|
|
|
|
|
let extractor_contentious_count = *self.extractor_contentious_count.get_mut();
|
|
|
|
|
tracing::debug!(
|
|
|
|
|
"Extractor channel stats: {send_count} sends, \
|
|
|
|
|
{writer_contentious_count} writer contentions ({}%), \
|
|
|
|
|
{extractor_contentious_count} extractor contentions ({}%)",
|
|
|
|
|
(writer_contentious_count as f32 / send_count as f32) * 100.0,
|
|
|
|
|
(extractor_contentious_count as f32 / send_count as f32) * 100.0
|
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl<'b> ExtractorBbqueueSender<'b> {
    /// Returns a typed sender for one of the word-docids databases.
    /// (The superseded `impl ExtractorSender` header that was merged in here
    /// has been removed.)
    pub fn docids<'a, D: DatabaseType>(&'a self) -> WordDocidsSender<'a, 'b, D> {
        WordDocidsSender { sender: self, _marker: PhantomData }
    }
|
|
|
|
|
|
|
|
|
|
pub fn facet_docids(&self) -> FacetDocidsSender<'_> {
|
|
|
|
|
pub fn facet_docids<'a>(&'a self) -> FacetDocidsSender<'a, 'b> {
|
|
|
|
|
FacetDocidsSender { sender: self }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn field_id_docid_facet_sender(&self) -> FieldIdDocidFacetSender<'_> {
|
|
|
|
|
FieldIdDocidFacetSender(self)
|
|
|
|
|
pub fn field_id_docid_facet_sender<'a>(&'a self) -> FieldIdDocidFacetSender<'a, 'b> {
|
|
|
|
|
FieldIdDocidFacetSender(&self)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn documents(&self) -> DocumentsSender<'_> {
|
|
|
|
|
DocumentsSender(self)
|
|
|
|
|
pub fn documents<'a>(&'a self) -> DocumentsSender<'a, 'b> {
|
|
|
|
|
DocumentsSender(&self)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn embeddings(&self) -> EmbeddingSender<'_> {
|
|
|
|
|
EmbeddingSender(&self.sender)
|
|
|
|
|
pub fn embeddings<'a>(&'a self) -> EmbeddingSender<'a, 'b> {
|
|
|
|
|
EmbeddingSender(&self)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn geo(&self) -> GeoSender<'_> {
|
|
|
|
|
GeoSender(&self.sender)
|
|
|
|
|
pub fn geo<'a>(&'a self) -> GeoSender<'a, 'b> {
|
|
|
|
|
GeoSender(&self)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn send_delete_vector(&self, docid: DocumentId) -> StdResult<(), SendError<()>> {
|
|
|
|
|
fn send_delete_vector(&self, docid: DocumentId) -> crate::Result<()> {
|
|
|
|
|
match self
|
|
|
|
|
.sender
|
|
|
|
|
.send(WriterOperation::ArroyOperation(ArroyOperation::DeleteVectors { docid }))
|
|
|
|
@ -305,18 +246,69 @@ impl ExtractorSender {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn send_db_operation(&self, op: DbOperation) -> StdResult<(), SendError<()>> {
|
|
|
|
|
if self.sender.is_full() {
|
|
|
|
|
self.writer_contentious_count.fetch_add(1, Ordering::SeqCst);
|
|
|
|
|
}
|
|
|
|
|
if self.sender.is_empty() {
|
|
|
|
|
self.extractor_contentious_count.fetch_add(1, Ordering::SeqCst);
|
|
|
|
|
fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> {
|
|
|
|
|
let capacity = self.capacity;
|
|
|
|
|
let refcell = self.inner.get().unwrap();
|
|
|
|
|
let mut producer = refcell.0.borrow_mut_or_yield();
|
|
|
|
|
|
|
|
|
|
let key_length = key.len().try_into().unwrap();
|
|
|
|
|
let value_length = value.len();
|
|
|
|
|
let total_length = EntryHeader::put_key_value_size(key_length, value_length);
|
|
|
|
|
if total_length > capacity {
|
|
|
|
|
unreachable!("entry larger that the bbqueue capacity");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
self.send_count.fetch_add(1, Ordering::SeqCst);
|
|
|
|
|
match self.sender.send(WriterOperation::DbOperation(op)) {
|
|
|
|
|
Ok(()) => Ok(()),
|
|
|
|
|
Err(SendError(_)) => Err(SendError(())),
|
|
|
|
|
let payload_header =
|
|
|
|
|
EntryHeader::DbOperation { database, key_length: NonZeroU16::new(key_length) };
|
|
|
|
|
|
|
|
|
|
loop {
|
|
|
|
|
let mut grant = match producer.grant(total_length) {
|
|
|
|
|
Ok(grant) => grant,
|
|
|
|
|
Err(bbqueue::Error::InsufficientSize) => continue,
|
|
|
|
|
Err(e) => unreachable!("{e:?}"),
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let (header, remaining) = grant.split_at_mut(mem::size_of::<EntryHeader>());
|
|
|
|
|
header.copy_from_slice(payload_header.bytes_of());
|
|
|
|
|
let (key_out, value_out) = remaining.split_at_mut(key.len());
|
|
|
|
|
key_out.copy_from_slice(key);
|
|
|
|
|
value_out.copy_from_slice(value);
|
|
|
|
|
|
|
|
|
|
// We could commit only the used memory.
|
|
|
|
|
grant.commit(total_length);
|
|
|
|
|
|
|
|
|
|
break Ok(());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Writes a delete-key frame (`[EntryHeader][key]`) for `database` into this
/// thread's bbqueue; `key_length: None` in the header marks a deletion.
fn delete_entry(&self, database: Database, key: &[u8]) -> crate::Result<()> {
let capacity = self.capacity;
let refcell = self.inner.get().unwrap();
let mut producer = refcell.0.borrow_mut_or_yield();

// Panics if the key exceeds u16::MAX bytes — NOTE(review): confirm callers
// never produce such keys.
let key_length = key.len().try_into().unwrap();
let total_length = EntryHeader::delete_key_size(key_length);
if total_length > capacity {
unreachable!("entry larger that the bbqueue capacity");
}

let payload_header = EntryHeader::DbOperation { database, key_length: None };

loop {
// Spins until the writer frees enough contiguous space for the frame.
let mut grant = match producer.grant(total_length) {
Ok(grant) => grant,
Err(bbqueue::Error::InsufficientSize) => continue,
Err(e) => unreachable!("{e:?}"),
};

let (header, remaining) = grant.split_at_mut(mem::size_of::<EntryHeader>());
header.copy_from_slice(payload_header.bytes_of());
// `remaining` is exactly `key.len()` bytes by construction of `total_length`.
remaining.copy_from_slice(key);

// We could commit only the used memory.
grant.commit(total_length);

break Ok(());
}
}
|
|
|
|
|
}
|
|
|
|
@ -356,159 +348,237 @@ impl DatabaseType for WordPositionDocids {
|
|
|
|
|
const DATABASE: Database = Database::WordPositionDocids;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Common write/delete interface over docids senders.
///
/// NOTE(review): this trait still uses the channel-era `SendError` signatures;
/// the bbqueue senders below expose inherent methods returning `crate::Result`
/// instead — confirm whether any implementation of this trait remains.
pub trait DocidsSender {
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>;
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>;
}
|
|
|
|
|
|
|
|
|
|
pub struct WordDocidsSender<'a, D> {
|
|
|
|
|
sender: &'a ExtractorSender,
|
|
|
|
|
pub struct WordDocidsSender<'a, 'b, D> {
|
|
|
|
|
sender: &'a ExtractorBbqueueSender<'b>,
|
|
|
|
|
_marker: PhantomData<D>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl<D: DatabaseType> DocidsSender for WordDocidsSender<'_, D> {
|
|
|
|
|
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
|
|
|
|
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
|
|
|
|
|
match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) {
|
|
|
|
|
Ok(()) => Ok(()),
|
|
|
|
|
Err(SendError(_)) => Err(SendError(())),
|
|
|
|
|
impl<D: DatabaseType> WordDocidsSender<'_, '_, D> {
|
|
|
|
|
pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> {
|
|
|
|
|
let capacity = self.sender.capacity;
|
|
|
|
|
let refcell = self.sender.inner.get().unwrap();
|
|
|
|
|
let mut producer = refcell.0.borrow_mut_or_yield();
|
|
|
|
|
|
|
|
|
|
let key_length = key.len().try_into().unwrap();
|
|
|
|
|
let value_length = CboRoaringBitmapCodec::serialized_size(bitmap);
|
|
|
|
|
|
|
|
|
|
let total_length = EntryHeader::put_key_value_size(key_length, value_length);
|
|
|
|
|
if total_length > capacity {
|
|
|
|
|
unreachable!("entry larger that the bbqueue capacity");
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
|
|
|
|
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
|
|
|
|
match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) {
|
|
|
|
|
Ok(()) => Ok(()),
|
|
|
|
|
Err(SendError(_)) => Err(SendError(())),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub struct FacetDocidsSender<'a> {
|
|
|
|
|
sender: &'a ExtractorSender,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl DocidsSender for FacetDocidsSender<'_> {
|
|
|
|
|
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
|
|
|
|
let (facet_kind, key) = FacetKind::extract_from_key(key);
|
|
|
|
|
let database = Database::from(facet_kind);
|
|
|
|
|
let entry = match facet_kind {
|
|
|
|
|
// skip level group size
|
|
|
|
|
FacetKind::String | FacetKind::Number => {
|
|
|
|
|
// add facet group size
|
|
|
|
|
let value = [&[1], value].concat();
|
|
|
|
|
EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &value))
|
|
|
|
|
}
|
|
|
|
|
_ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)),
|
|
|
|
|
let payload_header = EntryHeader::DbOperation {
|
|
|
|
|
database: D::DATABASE,
|
|
|
|
|
key_length: NonZeroU16::new(key_length),
|
|
|
|
|
};
|
|
|
|
|
match self.sender.send_db_operation(DbOperation { database, entry }) {
|
|
|
|
|
Ok(()) => Ok(()),
|
|
|
|
|
Err(SendError(_)) => Err(SendError(())),
|
|
|
|
|
|
|
|
|
|
loop {
|
|
|
|
|
let mut grant = match producer.grant(total_length) {
|
|
|
|
|
Ok(grant) => grant,
|
|
|
|
|
Err(bbqueue::Error::InsufficientSize) => continue,
|
|
|
|
|
Err(e) => unreachable!("{e:?}"),
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let (header, remaining) = grant.split_at_mut(mem::size_of::<EntryHeader>());
|
|
|
|
|
header.copy_from_slice(payload_header.bytes_of());
|
|
|
|
|
let (key_out, value_out) = remaining.split_at_mut(key.len());
|
|
|
|
|
key_out.copy_from_slice(key);
|
|
|
|
|
CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?;
|
|
|
|
|
|
|
|
|
|
// We could commit only the used memory.
|
|
|
|
|
grant.commit(total_length);
|
|
|
|
|
|
|
|
|
|
break Ok(());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
|
|
|
|
pub fn delete(&self, key: &[u8]) -> crate::Result<()> {
|
|
|
|
|
let capacity = self.sender.capacity;
|
|
|
|
|
let refcell = self.sender.inner.get().unwrap();
|
|
|
|
|
let mut producer = refcell.0.borrow_mut_or_yield();
|
|
|
|
|
|
|
|
|
|
let key_length = key.len().try_into().unwrap();
|
|
|
|
|
let total_length = EntryHeader::delete_key_size(key_length);
|
|
|
|
|
if total_length > capacity {
|
|
|
|
|
unreachable!("entry larger that the bbqueue capacity");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let payload_header = EntryHeader::DbOperation { database: D::DATABASE, key_length: None };
|
|
|
|
|
|
|
|
|
|
loop {
|
|
|
|
|
let mut grant = match producer.grant(total_length) {
|
|
|
|
|
Ok(grant) => grant,
|
|
|
|
|
Err(bbqueue::Error::InsufficientSize) => continue,
|
|
|
|
|
Err(e) => unreachable!("{e:?}"),
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
let (header, remaining) = grant.split_at_mut(mem::size_of::<EntryHeader>());
|
|
|
|
|
header.copy_from_slice(payload_header.bytes_of());
|
|
|
|
|
remaining.copy_from_slice(key);
|
|
|
|
|
|
|
|
|
|
// We could commit only the used memory.
|
|
|
|
|
grant.commit(total_length);
|
|
|
|
|
|
|
|
|
|
break Ok(());
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Sender for the facet-docids databases; the target database is derived at
/// runtime from the `FacetKind` prefix of each key.
pub struct FacetDocidsSender<'a, 'b> {
sender: &'a ExtractorBbqueueSender<'b>,
}
|
|
|
|
|
|
|
|
|
|
impl FacetDocidsSender<'_, '_> {
    /// Strips the `FacetKind` prefix from `key`, picks the matching database,
    /// and writes a put frame with the CboRoaringBitmap-serialized `bitmap`.
    ///
    /// For string/number facets the value is prefixed with a one-byte facet
    /// group size (1). (Old channel-based residue interleaved here was removed.)
    pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> {
        let capacity = self.sender.capacity;
        let refcell = self.sender.inner.get().unwrap();
        let mut producer = refcell.0.borrow_mut_or_yield();

        let (facet_kind, key) = FacetKind::extract_from_key(key);
        let key_length = key.len().try_into().unwrap();

        let value_length = CboRoaringBitmapCodec::serialized_size(bitmap);
        let value_length = match facet_kind {
            // We must take the facet group size into account
            // when we serialize strings and numbers.
            FacetKind::Number | FacetKind::String => value_length + 1,
            FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length,
        };

        let total_length = EntryHeader::put_key_value_size(key_length, value_length);
        if total_length > capacity {
            unreachable!("entry larger that the bbqueue capacity");
        }

        let payload_header = EntryHeader::DbOperation {
            database: Database::from(facet_kind),
            key_length: NonZeroU16::new(key_length),
        };

        loop {
            // Spin until the writer consumes enough frames to free the space.
            let mut grant = match producer.grant(total_length) {
                Ok(grant) => grant,
                Err(bbqueue::Error::InsufficientSize) => continue,
                Err(e) => unreachable!("{e:?}"),
            };

            let (header, remaining) = grant.split_at_mut(mem::size_of::<EntryHeader>());
            header.copy_from_slice(payload_header.bytes_of());
            let (key_out, value_out) = remaining.split_at_mut(key.len());
            key_out.copy_from_slice(key);

            let value_out = match facet_kind {
                // We must take the facet group size into account
                // when we serialize strings and numbers.
                FacetKind::String | FacetKind::Number => {
                    let (first, remaining) = value_out.split_first_mut().unwrap();
                    *first = 1;
                    remaining
                }
                FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out,
            };
            CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?;

            // We could commit only the used memory.
            grant.commit(total_length);

            break Ok(());
        }
    }
|
|
|
|
|
|
|
|
|
|
/// Strips the `FacetKind` prefix from `key`, picks the matching database, and
/// writes a delete-key frame (`key_length: None`) into this thread's bbqueue.
pub fn delete(&self, key: &[u8]) -> crate::Result<()> {
let capacity = self.sender.capacity;
let refcell = self.sender.inner.get().unwrap();
let mut producer = refcell.0.borrow_mut_or_yield();

let (facet_kind, key) = FacetKind::extract_from_key(key);
// Panics if the stripped key exceeds u16::MAX bytes — NOTE(review): confirm
// callers never produce such keys.
let key_length = key.len().try_into().unwrap();

let total_length = EntryHeader::delete_key_size(key_length);
if total_length > capacity {
unreachable!("entry larger that the bbqueue capacity");
}

let payload_header =
EntryHeader::DbOperation { database: Database::from(facet_kind), key_length: None };

loop {
// Spins until the writer frees enough contiguous space for the frame.
let mut grant = match producer.grant(total_length) {
Ok(grant) => grant,
Err(bbqueue::Error::InsufficientSize) => continue,
Err(e) => unreachable!("{e:?}"),
};

let (header, remaining) = grant.split_at_mut(mem::size_of::<EntryHeader>());
header.copy_from_slice(payload_header.bytes_of());
remaining.copy_from_slice(key);

// We could commit only the used memory.
grant.commit(total_length);

break Ok(());
}
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub struct FieldIdDocidFacetSender<'a>(&'a ExtractorSender);
|
|
|
|
|
pub struct FieldIdDocidFacetSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>);
|
|
|
|
|
|
|
|
|
|
impl FieldIdDocidFacetSender<'_> {
|
|
|
|
|
pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
|
|
|
|
impl FieldIdDocidFacetSender<'_, '_> {
|
|
|
|
|
pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> crate::Result<()> {
|
|
|
|
|
debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok());
|
|
|
|
|
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
|
|
|
|
|
self.0
|
|
|
|
|
.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry })
|
|
|
|
|
self.0.write_key_value(Database::FieldIdDocidFacetStrings, key, value)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn write_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
|
|
|
|
pub fn write_facet_f64(&self, key: &[u8]) -> crate::Result<()> {
|
|
|
|
|
debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok());
|
|
|
|
|
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &[]));
|
|
|
|
|
self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry })
|
|
|
|
|
self.0.write_key_value(Database::FieldIdDocidFacetF64s, key, &[])
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn delete_facet_string(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
|
|
|
|
pub fn delete_facet_string(&self, key: &[u8]) -> crate::Result<()> {
|
|
|
|
|
debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok());
|
|
|
|
|
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
|
|
|
|
self.0
|
|
|
|
|
.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry })
|
|
|
|
|
self.0.delete_entry(Database::FieldIdDocidFacetStrings, key)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn delete_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
|
|
|
|
pub fn delete_facet_f64(&self, key: &[u8]) -> crate::Result<()> {
|
|
|
|
|
debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok());
|
|
|
|
|
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
|
|
|
|
self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry })
|
|
|
|
|
self.0.delete_entry(Database::FieldIdDocidFacetF64s, key)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub struct DocumentsSender<'a>(&'a ExtractorSender);
|
|
|
|
|
pub struct DocumentsSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>);
|
|
|
|
|
|
|
|
|
|
impl DocumentsSender<'_> {
|
|
|
|
|
impl DocumentsSender<'_, '_> {
|
|
|
|
|
/// TODO do that efficiently
|
|
|
|
|
pub fn uncompressed(
|
|
|
|
|
&self,
|
|
|
|
|
docid: DocumentId,
|
|
|
|
|
external_id: String,
|
|
|
|
|
document: &KvReaderFieldId,
|
|
|
|
|
) -> StdResult<(), SendError<()>> {
|
|
|
|
|
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(
|
|
|
|
|
&docid.to_be_bytes(),
|
|
|
|
|
document.as_bytes(),
|
|
|
|
|
));
|
|
|
|
|
match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) {
|
|
|
|
|
Ok(()) => Ok(()),
|
|
|
|
|
Err(SendError(_)) => Err(SendError(())),
|
|
|
|
|
}?;
|
|
|
|
|
|
|
|
|
|
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(
|
|
|
|
|
) -> crate::Result<()> {
|
|
|
|
|
self.0.write_key_value(Database::Documents, &docid.to_be_bytes(), document.as_bytes())?;
|
|
|
|
|
self.0.write_key_value(
|
|
|
|
|
Database::ExternalDocumentsIds,
|
|
|
|
|
external_id.as_bytes(),
|
|
|
|
|
&docid.to_be_bytes(),
|
|
|
|
|
));
|
|
|
|
|
match self
|
|
|
|
|
.0
|
|
|
|
|
.send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry })
|
|
|
|
|
{
|
|
|
|
|
Ok(()) => Ok(()),
|
|
|
|
|
Err(SendError(_)) => Err(SendError(())),
|
|
|
|
|
}
|
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> {
|
|
|
|
|
let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes()));
|
|
|
|
|
match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) {
|
|
|
|
|
Ok(()) => Ok(()),
|
|
|
|
|
Err(SendError(_)) => Err(SendError(())),
|
|
|
|
|
}?;
|
|
|
|
|
|
|
|
|
|
pub fn delete(&self, docid: DocumentId, external_id: String) -> crate::Result<()> {
|
|
|
|
|
self.0.delete_entry(Database::Documents, &docid.to_be_bytes())?;
|
|
|
|
|
self.0.send_delete_vector(docid)?;
|
|
|
|
|
|
|
|
|
|
let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes()));
|
|
|
|
|
match self
|
|
|
|
|
.0
|
|
|
|
|
.send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry })
|
|
|
|
|
{
|
|
|
|
|
Ok(()) => Ok(()),
|
|
|
|
|
Err(SendError(_)) => Err(SendError(())),
|
|
|
|
|
}
|
|
|
|
|
self.0.delete_entry(Database::ExternalDocumentsIds, external_id.as_bytes())
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub struct EmbeddingSender<'a>(&'a Sender<WriterOperation>);
|
|
|
|
|
pub struct EmbeddingSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>);
|
|
|
|
|
|
|
|
|
|
impl EmbeddingSender<'_> {
|
|
|
|
|
impl EmbeddingSender<'_, '_> {
|
|
|
|
|
pub fn set_vectors(
|
|
|
|
|
&self,
|
|
|
|
|
docid: DocumentId,
|
|
|
|
|
embedder_id: u8,
|
|
|
|
|
embeddings: Vec<Embedding>,
|
|
|
|
|
) -> StdResult<(), SendError<()>> {
|
|
|
|
|
) -> crate::Result<()> {
|
|
|
|
|
self.0
|
|
|
|
|
.send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors {
|
|
|
|
|
docid,
|
|
|
|
@ -541,33 +611,36 @@ impl EmbeddingSender<'_> {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub struct GeoSender<'a>(&'a Sender<WriterOperation>);
|
|
|
|
|
pub struct GeoSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>);
|
|
|
|
|
|
|
|
|
|
impl GeoSender<'_> {
|
|
|
|
|
impl GeoSender<'_, '_> {
|
|
|
|
|
pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> {
|
|
|
|
|
self.0
|
|
|
|
|
.send(WriterOperation::DbOperation(DbOperation {
|
|
|
|
|
database: Database::Main,
|
|
|
|
|
entry: EntryOperation::Write(KeyValueEntry::from_large_key_value(
|
|
|
|
|
GEO_RTREE_KEY.as_bytes(),
|
|
|
|
|
value,
|
|
|
|
|
)),
|
|
|
|
|
}))
|
|
|
|
|
.map_err(|_| SendError(()))
|
|
|
|
|
todo!("set rtree from file")
|
|
|
|
|
// self.0
|
|
|
|
|
// .send(WriterOperation::DbOperation(DbOperation {
|
|
|
|
|
// database: Database::Main,
|
|
|
|
|
// entry: EntryOperation::Write(KeyValueEntry::from_large_key_value(
|
|
|
|
|
// GEO_RTREE_KEY.as_bytes(),
|
|
|
|
|
// value,
|
|
|
|
|
// )),
|
|
|
|
|
// }))
|
|
|
|
|
// .map_err(|_| SendError(()))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> StdResult<(), SendError<()>> {
|
|
|
|
|
let mut buffer = Vec::new();
|
|
|
|
|
bitmap.serialize_into(&mut buffer).unwrap();
|
|
|
|
|
todo!("serialize directly into bbqueue (as a real roaringbitmap not a cbo)")
|
|
|
|
|
|
|
|
|
|
self.0
|
|
|
|
|
.send(WriterOperation::DbOperation(DbOperation {
|
|
|
|
|
database: Database::Main,
|
|
|
|
|
entry: EntryOperation::Write(KeyValueEntry::from_small_key_value(
|
|
|
|
|
GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(),
|
|
|
|
|
&buffer,
|
|
|
|
|
)),
|
|
|
|
|
}))
|
|
|
|
|
.map_err(|_| SendError(()))
|
|
|
|
|
// let mut buffer = Vec::new();
|
|
|
|
|
// bitmap.serialize_into(&mut buffer).unwrap();
|
|
|
|
|
|
|
|
|
|
// self.0
|
|
|
|
|
// .send(WriterOperation::DbOperation(DbOperation {
|
|
|
|
|
// database: Database::Main,
|
|
|
|
|
// entry: EntryOperation::Write(KeyValueEntry::from_small_key_value(
|
|
|
|
|
// GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(),
|
|
|
|
|
// &buffer,
|
|
|
|
|
// )),
|
|
|
|
|
// }))
|
|
|
|
|
// .map_err(|_| SendError(()))
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|