test: Add some more tests
This commit is contained in:
parent
e68e6056c3
commit
34b43d4002
|
@ -77,7 +77,7 @@ fn search(metadata: &Metadata, database: &DB, query: &str) {
|
|||
|
||||
// "Sony" "PlayStation 4 500GB"
|
||||
let config = Config {
|
||||
metadata: metadata,
|
||||
index: unimplemented!(),
|
||||
automatons: automatons,
|
||||
criteria: criterion::default(),
|
||||
distinct: (distinct_by_title_first_four_chars, 1),
|
||||
|
|
|
@ -89,7 +89,7 @@ where M: AsRef<Metadata>,
|
|||
}
|
||||
|
||||
let config = Config {
|
||||
metadata: metadata.as_ref(),
|
||||
index: unimplemented!(),
|
||||
automatons: automatons,
|
||||
criteria: criterion::default(),
|
||||
distinct: ((), 1),
|
||||
|
|
|
@ -0,0 +1,509 @@
|
|||
use crate::vec_read_only::VecReadOnly;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::{mem, cmp};
|
||||
use std::rc::Rc;
|
||||
|
||||
use fst::{Automaton, Streamer};
|
||||
use fst::automaton::AlwaysMatch;
|
||||
use sdset::{Set, SetBuf, SetOperation};
|
||||
use sdset::duo::OpBuilder as SdOpBuilder;
|
||||
use group_by::GroupBy;
|
||||
|
||||
use crate::blob::{Blob, Sign};
|
||||
use crate::blob::ops::{OpBuilder, Union, IndexedDocIndexes};
|
||||
use crate::DocIndex;
|
||||
|
||||
fn group_is_negative(blobs: &&[Blob]) -> bool {
|
||||
blobs[0].sign() == Sign::Negative
|
||||
}
|
||||
|
||||
/// Tells whether the two blobs carry the same sign.
fn blob_same_sign(a: &Blob, b: &Blob) -> bool {
    let (left, right) = (a.sign(), b.sign());
    left == right
}
|
||||
|
||||
fn sign_from_group_index(group: usize) -> Sign {
|
||||
if group % 2 == 0 {
|
||||
Sign::Positive
|
||||
} else {
|
||||
Sign::Negative
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Merge<'b> {
|
||||
heap: GroupHeap<'b>,
|
||||
outs: Vec<IndexedDocIndexes>,
|
||||
cur_slot: Option<Slot>,
|
||||
}
|
||||
|
||||
impl<'b> Merge<'b> {
|
||||
pub fn always_match(blobs: &'b [Blob]) -> Self {
|
||||
Self::with_automatons(vec![AlwaysMatch], blobs)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b> Merge<'b> {
|
||||
pub fn with_automatons<A>(automatons: Vec<A>, blobs: &'b [Blob]) -> Self
|
||||
where A: 'b + Automaton + Clone
|
||||
{
|
||||
let mut groups = Vec::new();
|
||||
// We can skip blobs that are negative: they didn't remove anything at the start
|
||||
for blobs in GroupBy::new(blobs, blob_same_sign).skip_while(group_is_negative) {
|
||||
let mut builder = OpBuilder::with_automatons(automatons.clone());
|
||||
for blob in blobs {
|
||||
builder.push(blob);
|
||||
}
|
||||
groups.push(builder.union());
|
||||
}
|
||||
|
||||
let mut heap = GroupHeap::new(groups);
|
||||
heap.refill();
|
||||
|
||||
Merge {
|
||||
heap: heap,
|
||||
outs: Vec::new(),
|
||||
cur_slot: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'b, 'a> Streamer<'a> for Merge<'b> {
|
||||
type Item = (&'a [u8], &'a [IndexedDocIndexes]);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
self.outs.clear();
|
||||
loop {
|
||||
if let Some(slot) = self.cur_slot.take() {
|
||||
self.heap.refill();
|
||||
}
|
||||
let slot = match self.heap.pop() {
|
||||
None => return None,
|
||||
Some(slot) => {
|
||||
self.cur_slot = Some(slot);
|
||||
self.cur_slot.as_ref().unwrap()
|
||||
}
|
||||
};
|
||||
|
||||
let mut doc_indexes = Vec::new();
|
||||
let mut doc_indexes_slots = Vec::with_capacity(self.heap.num_groups());
|
||||
|
||||
let len = match sign_from_group_index(slot.grp_index) {
|
||||
Sign::Positive => {
|
||||
doc_indexes.extend_from_slice(&slot.output);
|
||||
slot.output.len()
|
||||
},
|
||||
Sign::Negative => 0,
|
||||
};
|
||||
|
||||
let mut slotidi = SlotIndexedDocIndexes {
|
||||
index: slot.aut_index,
|
||||
start: 0,
|
||||
len: len,
|
||||
};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
while let Some(slot2) = self.heap.pop_if_equal(slot.input()) {
|
||||
if slotidi.index == slot2.aut_index {
|
||||
buffer.clear();
|
||||
buffer.extend(doc_indexes.drain(slotidi.start..));
|
||||
|
||||
let a = Set::new_unchecked(&buffer);
|
||||
let b = Set::new_unchecked(&slot2.output);
|
||||
match sign_from_group_index(slot2.grp_index) {
|
||||
Sign::Positive => { SdOpBuilder::new(a, b).union().extend_vec(&mut doc_indexes) },
|
||||
Sign::Negative => SdOpBuilder::new(a, b).difference().extend_vec(&mut doc_indexes),
|
||||
}
|
||||
slotidi.len = doc_indexes.len() - slotidi.start;
|
||||
|
||||
} else {
|
||||
if slotidi.len != 0 {
|
||||
doc_indexes_slots.push(slotidi);
|
||||
}
|
||||
slotidi = SlotIndexedDocIndexes {
|
||||
index: slot2.aut_index,
|
||||
start: doc_indexes.len(),
|
||||
len: slot2.output.len(),
|
||||
};
|
||||
buffer.extend_from_slice(&slot2.output);
|
||||
}
|
||||
}
|
||||
|
||||
if slotidi.len != 0 {
|
||||
doc_indexes_slots.push(slotidi);
|
||||
}
|
||||
|
||||
let read_only = VecReadOnly::new(doc_indexes);
|
||||
self.outs.reserve(doc_indexes_slots.len());
|
||||
for slot in doc_indexes_slots {
|
||||
let indexes = IndexedDocIndexes {
|
||||
index: slot.index,
|
||||
doc_indexes: read_only.range(slot.start, slot.len),
|
||||
};
|
||||
self.outs.push(indexes);
|
||||
}
|
||||
|
||||
if !self.outs.is_empty() {
|
||||
let slot = self.cur_slot.as_ref().unwrap(); // FIXME
|
||||
return Some((slot.input(), &self.outs))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Range of the merged document indexes that belongs to one automaton.
struct SlotIndexedDocIndexes {
    /// Index of the automaton the range belongs to.
    index: usize,
    /// Offset of the first document index of the range.
    start: usize,
    /// Number of document indexes in the range.
    len: usize,
}
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
struct Slot {
|
||||
grp_index: usize,
|
||||
aut_index: usize,
|
||||
input: Rc<Vec<u8>>,
|
||||
output: VecReadOnly<DocIndex>,
|
||||
}
|
||||
|
||||
impl Slot {
|
||||
fn input(&self) -> &[u8] {
|
||||
&self.input
|
||||
}
|
||||
}
|
||||
|
||||
impl PartialOrd for Slot {
|
||||
fn partial_cmp(&self, other: &Slot) -> Option<cmp::Ordering> {
|
||||
(&self.input, self.aut_index, self.grp_index, &self.output)
|
||||
.partial_cmp(&(&other.input, other.aut_index, other.grp_index, &other.output))
|
||||
.map(|ord| ord.reverse())
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for Slot {
|
||||
fn cmp(&self, other: &Slot) -> cmp::Ordering {
|
||||
self.partial_cmp(other).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
struct GroupHeap<'b> {
|
||||
groups: Vec<Union<'b>>,
|
||||
heap: BinaryHeap<Slot>,
|
||||
}
|
||||
|
||||
impl<'b> GroupHeap<'b> {
|
||||
fn new(groups: Vec<Union<'b>>) -> GroupHeap<'b> {
|
||||
GroupHeap {
|
||||
groups: groups,
|
||||
heap: BinaryHeap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn num_groups(&self) -> usize {
|
||||
self.groups.len()
|
||||
}
|
||||
|
||||
fn pop(&mut self) -> Option<Slot> {
|
||||
self.heap.pop()
|
||||
}
|
||||
|
||||
fn peek_is_duplicate(&self, key: &[u8]) -> bool {
|
||||
self.heap.peek().map(|s| *s.input == key).unwrap_or(false)
|
||||
}
|
||||
|
||||
fn pop_if_equal(&mut self, key: &[u8]) -> Option<Slot> {
|
||||
if self.peek_is_duplicate(key) { self.pop() } else { None }
|
||||
}
|
||||
|
||||
fn refill(&mut self) {
|
||||
for (i, group) in self.groups.iter_mut().enumerate() {
|
||||
if let Some((input, doc_indexes)) = group.next() {
|
||||
let input = Rc::new(input.to_vec());
|
||||
for doc_index in doc_indexes {
|
||||
let slot = Slot {
|
||||
input: input.clone(),
|
||||
grp_index: i,
|
||||
aut_index: doc_index.index,
|
||||
output: doc_index.doc_indexes.clone(),
|
||||
};
|
||||
self.heap.push(slot);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::blob::{PositiveBlobBuilder, NegativeBlobBuilder};
    use crate::DocIndex;

    /// Drains the stream and keeps, for every key, the documents of the first
    /// automaton result.
    fn get_all<'m, I, S>(stream: I) -> Vec<(String, VecReadOnly<DocIndex>)>
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
    {
        let mut collected = Vec::new();

        let mut stream = stream.into_stream();
        while let Some((key, indexes)) = stream.next() {
            let key = String::from_utf8(key.to_owned()).unwrap();
            collected.push((key, indexes[0].doc_indexes.clone()))
        }

        collected
    }

    /// The four document indexes shared by all the tests of this module.
    fn docs() -> (DocIndex, DocIndex, DocIndex, DocIndex) {
        (
            DocIndex{ document_id: 0,  attribute: 0, attribute_index: 0 },
            DocIndex{ document_id: 12, attribute: 0, attribute_index: 2 },
            DocIndex{ document_id: 0,  attribute: 0, attribute_index: 1 },
            DocIndex{ document_id: 0,  attribute: 0, attribute_index: 2 },
        )
    }

    /// Builds an in-memory positive blob from `(key, document)` pairs,
    /// inserted in the given order.
    fn positive(entries: &[(&str, DocIndex)]) -> Blob {
        let mut builder = PositiveBlobBuilder::new(Vec::new(), Vec::new());
        for &(key, doc) in entries {
            builder.insert(key, doc);
        }
        Blob::Positive(builder.build().unwrap())
    }

    /// Builds an in-memory negative blob from `(key, document)` pairs,
    /// inserted in the given order.
    fn negative(entries: &[(&str, DocIndex)]) -> Blob {
        let mut builder = NegativeBlobBuilder::new(Vec::new(), Vec::new());
        for &(key, doc) in entries {
            builder.insert(key, doc);
        }
        Blob::Negative(builder.build().unwrap())
    }

    #[test]
    fn single_positive_blob() {
        let (doc1, doc2, doc3, doc4) = docs();

        let a = positive(&[("hell", doc1), ("hell", doc2), ("hello", doc3), ("wor", doc4)]);

        let blobs = &[a];
        let merge = Merge::always_match(blobs);

        let value = get_all(merge);
        assert_eq!(value.len(), 3);

        assert_eq!(value[0].0, "hell");
        assert_eq!(&*value[0].1, &[doc1, doc2][..]);

        assert_eq!(value[1].0, "hello");
        assert_eq!(&*value[1].1, &[doc3][..]);

        assert_eq!(value[2].0, "wor");
        assert_eq!(&*value[2].1, &[doc4][..]);
    }

    #[test]
    fn single_negative_blob() {
        let (doc1, doc2, doc3, doc4) = docs();

        let a = negative(&[("hell", doc1), ("hell", doc2), ("hello", doc3), ("wor", doc4)]);

        let blobs = &[a];
        let merge = Merge::always_match(blobs);

        let value = get_all(merge);
        assert_eq!(value.len(), 0);
    }

    #[test]
    fn two_positive_blobs() {
        let (doc1, doc2, doc3, doc4) = docs();

        let a = positive(&[("hell", doc1), ("wor", doc4)]);
        let b = positive(&[("hell", doc2), ("hello", doc3)]);

        let blobs = &[a, b];
        let merge = Merge::always_match(blobs);

        let value = get_all(merge);
        assert_eq!(value.len(), 3);

        assert_eq!(value[0].0, "hell");
        assert_eq!(&*value[0].1, &[doc1, doc2][..]);

        assert_eq!(value[1].0, "hello");
        assert_eq!(&*value[1].1, &[doc3][..]);

        assert_eq!(value[2].0, "wor");
        assert_eq!(&*value[2].1, &[doc4][..]);
    }

    #[test]
    fn one_positive_one_negative_blobs() {
        let (doc1, doc2, doc3, doc4) = docs();

        let a = positive(&[("hell", doc1), ("hell", doc2), ("hello", doc3), ("wor", doc4)]);
        let b = negative(&[("hell", doc2), ("hello", doc3)]);

        let blobs = &[a, b];
        let merge = Merge::always_match(blobs);

        let value = get_all(merge);
        assert_eq!(value.len(), 2);

        assert_eq!(value[0].0, "hell");
        assert_eq!(&*value[0].1, &[doc1][..]);

        assert_eq!(value[1].0, "wor");
        assert_eq!(&*value[1].1, &[doc4][..]);
    }

    #[test]
    fn alternate_positive_negative_blobs() {
        let (doc1, doc2, doc3, doc4) = docs();

        let a = positive(&[("hell", doc1), ("hell", doc2), ("hello", doc3)]);
        let b = negative(&[("hell", doc1), ("wor", doc4)]);
        let c = positive(&[("hell", doc1), ("wor", doc4)]);
        let d = negative(&[("hell", doc1)]);

        let blobs = &[a, b, c, d];
        let merge = Merge::always_match(blobs);

        let value = get_all(merge);
        assert_eq!(value.len(), 3);

        assert_eq!(value[0].0, "hell");
        assert_eq!(&*value[0].1, &[doc2][..]);

        assert_eq!(value[1].0, "hello");
        assert_eq!(&*value[1].1, &[doc3][..]);

        assert_eq!(value[2].0, "wor");
        assert_eq!(&*value[2].1, &[doc4][..]);
    }

    #[test]
    fn alternate_multiple_positive_negative_blobs() {
        let (doc1, doc2, doc3, doc4) = docs();

        let a = positive(&[("hell", doc1), ("hell", doc2), ("hello", doc3)]);
        let b = positive(&[("hell", doc1), ("wor", doc4)]);
        let c = negative(&[("hell", doc1), ("wor", doc4)]);
        let d = negative(&[("hell", doc1)]);

        let blobs = &[a, b, c, d];
        let merge = Merge::always_match(blobs);

        let value = get_all(merge);
        assert_eq!(value.len(), 2);

        assert_eq!(value[0].0, "hell");
        assert_eq!(&*value[0].1, &[doc2][..]);

        assert_eq!(value[1].0, "hello");
        assert_eq!(&*value[1].1, &[doc3][..]);
    }
}
|
|
@ -0,0 +1,56 @@
|
|||
mod merge;
|
||||
mod ops;
|
||||
mod ops_indexed_value;
|
||||
mod positive_blob;
|
||||
mod negative_blob;
|
||||
|
||||
pub use self::merge::Merge;
|
||||
pub use self::positive_blob::{PositiveBlob, PositiveBlobBuilder};
|
||||
pub use self::negative_blob::{NegativeBlob, NegativeBlobBuilder};
|
||||
|
||||
use fst::Map;
|
||||
|
||||
use crate::doc_indexes::DocIndexes;
|
||||
|
||||
pub enum Blob {
|
||||
Positive(PositiveBlob),
|
||||
Negative(NegativeBlob),
|
||||
}
|
||||
|
||||
/// Sign of a blob.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Sign {
    Positive,
    Negative,
}

impl Sign {
    /// Returns the opposite sign.
    pub fn alternate(self) -> Sign {
        if self == Sign::Positive {
            Sign::Negative
        } else {
            Sign::Positive
        }
    }
}
|
||||
|
||||
impl Blob {
|
||||
pub fn sign(&self) -> Sign {
|
||||
match self {
|
||||
Blob::Positive(_) => Sign::Positive,
|
||||
Blob::Negative(_) => Sign::Negative,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_map(&self) -> &Map {
|
||||
match self {
|
||||
Blob::Positive(blob) => blob.as_map(),
|
||||
Blob::Negative(blob) => blob.as_map(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_indexes(&self) -> &DocIndexes {
|
||||
match self {
|
||||
Blob::Positive(blob) => blob.as_indexes(),
|
||||
Blob::Negative(blob) => blob.as_indexes(),
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::io::Write;
|
||||
|
||||
use fst::{Map, MapBuilder};
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
|
||||
pub struct NegativeBlob {
|
||||
map: Map,
|
||||
indexes: DocIndexes,
|
||||
}
|
||||
|
||||
impl NegativeBlob {
|
||||
pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
|
||||
where P: AsRef<Path>,
|
||||
Q: AsRef<Path>,
|
||||
{
|
||||
let map = Map::from_path(map)?;
|
||||
let indexes = DocIndexes::from_path(indexes)?;
|
||||
Ok(NegativeBlob { map, indexes })
|
||||
}
|
||||
|
||||
pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Ok(NegativeBlob { map, indexes })
|
||||
}
|
||||
|
||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
|
||||
self.map.get(key).and_then(|index| self.indexes.get(index))
|
||||
}
|
||||
|
||||
pub fn as_map(&self) -> &Map {
|
||||
&self.map
|
||||
}
|
||||
|
||||
pub fn as_indexes(&self) -> &DocIndexes {
|
||||
&self.indexes
|
||||
}
|
||||
|
||||
pub fn explode(self) -> (Map, DocIndexes) {
|
||||
(self.map, self.indexes)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct NegativeBlobBuilder<W, X> {
|
||||
map: W,
|
||||
indexes: DocIndexesBuilder<X>,
|
||||
}
|
||||
|
||||
impl<W: Write, X: Write> NegativeBlobBuilder<W, X> {
|
||||
pub fn new(map: W, indexes: X) -> Self {
|
||||
Self { map, indexes: DocIndexesBuilder::new(indexes) }
|
||||
}
|
||||
|
||||
pub fn insert<S: Into<String>>(&mut self, key: S, index: DocIndex) {
|
||||
self.indexes.insert(key.into(), index)
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Result<(), Box<Error>> {
|
||||
self.into_inner().map(|_| ())
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
|
||||
// FIXME insert a magic number that indicates if the endianess
|
||||
// of the input is the same as the machine that is reading it.
|
||||
|
||||
let map = {
|
||||
let mut keys_builder = MapBuilder::new(self.map)?;
|
||||
let keys = self.indexes.keys().map(|(s, v)| (s, *v));
|
||||
keys_builder.extend_iter(keys)?;
|
||||
keys_builder.into_inner()?
|
||||
};
|
||||
|
||||
let indexes = self.indexes.into_inner()?;
|
||||
|
||||
Ok((map, indexes))
|
||||
}
|
||||
}
|
||||
|
||||
impl NegativeBlobBuilder<Vec<u8>, Vec<u8>> {
|
||||
pub fn build(self) -> Result<NegativeBlob, Box<Error>> {
|
||||
self.into_inner().and_then(|(m, i)| NegativeBlob::from_bytes(m, i))
|
||||
}
|
||||
}
|
|
@ -0,0 +1,323 @@
|
|||
use std::collections::BTreeMap;
|
||||
|
||||
use fst::{map, Streamer, Automaton};
|
||||
use fst::automaton::AlwaysMatch;
|
||||
use sdset::multi::OpBuilder as SdOpBuilder;
|
||||
use sdset::{SetOperation, Set};
|
||||
|
||||
use crate::blob::ops_indexed_value::{
|
||||
OpIndexedValueBuilder, UnionIndexedValue,
|
||||
};
|
||||
use crate::blob::Blob;
|
||||
use crate::doc_indexes::DocIndexes;
|
||||
use crate::vec_read_only::VecReadOnly;
|
||||
use crate::DocIndex;
|
||||
|
||||
pub struct OpBuilder<'m, A: Automaton> {
|
||||
// the operation on the maps is always an union.
|
||||
maps: OpIndexedValueBuilder<'m>,
|
||||
automatons: Vec<A>,
|
||||
indexes: Vec<&'m DocIndexes>,
|
||||
}
|
||||
|
||||
impl<'m> OpBuilder<'m, AlwaysMatch> {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
maps: OpIndexedValueBuilder::new(),
|
||||
automatons: vec![AlwaysMatch],
|
||||
indexes: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Do a set operation on multiple maps with the same automatons.
|
||||
impl<'m, A: 'm + Automaton> OpBuilder<'m, A> {
|
||||
pub fn with_automatons(automatons: Vec<A>) -> Self {
|
||||
Self {
|
||||
maps: OpIndexedValueBuilder::new(),
|
||||
automatons: automatons,
|
||||
indexes: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add(mut self, blob: &'m Blob) -> Self where A: Clone {
|
||||
self.push(blob);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push(&mut self, blob: &'m Blob) where A: Clone {
|
||||
let mut op = map::OpBuilder::new();
|
||||
for automaton in self.automatons.iter().cloned() {
|
||||
let stream = blob.as_map().search(automaton);
|
||||
op.push(stream);
|
||||
}
|
||||
|
||||
let stream = op.union();
|
||||
let indexes = blob.as_indexes();
|
||||
|
||||
self.maps.push(stream);
|
||||
self.indexes.push(indexes);
|
||||
}
|
||||
|
||||
pub fn union(self) -> Union<'m> {
|
||||
Union::new(self.maps, self.indexes, self.automatons.len())
|
||||
}
|
||||
|
||||
pub fn intersection(self) -> Intersection<'m> {
|
||||
Intersection::new(self.maps, self.indexes, self.automatons.len())
|
||||
}
|
||||
|
||||
pub fn difference(self) -> Difference<'m> {
|
||||
Difference::new(self.maps, self.indexes, self.automatons.len())
|
||||
}
|
||||
|
||||
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
|
||||
SymmetricDifference::new(self.maps, self.indexes, self.automatons.len())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub struct IndexedDocIndexes {
|
||||
pub index: usize,
|
||||
pub doc_indexes: VecReadOnly<DocIndex>,
|
||||
}
|
||||
|
||||
/// Range of the shared output vector that belongs to one automaton.
struct SlotIndexedDocIndexes {
    /// Index of the automaton the range belongs to.
    index: usize,
    /// Offset of the first document index of the range.
    start: usize,
    /// Number of document indexes in the range.
    len: usize,
}
|
||||
|
||||
macro_rules! logical_operation {
|
||||
(struct $name:ident, $operation:ident) => {
|
||||
|
||||
pub struct $name<'m> {
|
||||
maps: UnionIndexedValue<'m>,
|
||||
indexes: Vec<&'m DocIndexes>,
|
||||
number_automatons: usize,
|
||||
outs: Vec<IndexedDocIndexes>,
|
||||
}
|
||||
|
||||
impl<'m> $name<'m> {
|
||||
fn new(maps: OpIndexedValueBuilder<'m>, indexes: Vec<&'m DocIndexes>, number_automatons: usize) -> Self {
|
||||
$name {
|
||||
maps: maps.union(),
|
||||
indexes: indexes,
|
||||
number_automatons: number_automatons,
|
||||
outs: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
|
||||
type Item = (&'a [u8], &'a [IndexedDocIndexes]);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
match self.maps.next() {
|
||||
Some((input, ivalues)) => {
|
||||
self.outs.clear();
|
||||
|
||||
let mut builders = vec![BTreeMap::new(); self.number_automatons];
|
||||
for iv in ivalues {
|
||||
let builder = &mut builders[iv.aut_index];
|
||||
builder.insert(iv.rdr_index, iv.value);
|
||||
}
|
||||
|
||||
let mut doc_indexes = Vec::new();
|
||||
let mut doc_indexes_slots = Vec::with_capacity(builders.len());
|
||||
for (aut_index, values) in builders.into_iter().enumerate() {
|
||||
let mut builder = SdOpBuilder::with_capacity(values.len());
|
||||
for (rdr_index, value) in values {
|
||||
let indexes = self.indexes[rdr_index].get(value).expect("could not find indexes");
|
||||
let indexes = Set::new_unchecked(indexes);
|
||||
builder.push(indexes);
|
||||
}
|
||||
|
||||
let start = doc_indexes.len();
|
||||
builder.$operation().extend_vec(&mut doc_indexes);
|
||||
let len = doc_indexes.len() - start;
|
||||
if len != 0 {
|
||||
let slot = SlotIndexedDocIndexes {
|
||||
index: aut_index,
|
||||
start: start,
|
||||
len: len,
|
||||
};
|
||||
doc_indexes_slots.push(slot);
|
||||
}
|
||||
}
|
||||
|
||||
let read_only = VecReadOnly::new(doc_indexes);
|
||||
self.outs.reserve(doc_indexes_slots.len());
|
||||
for slot in doc_indexes_slots {
|
||||
let indexes = IndexedDocIndexes {
|
||||
index: slot.index,
|
||||
doc_indexes: read_only.range(slot.start, slot.len),
|
||||
};
|
||||
self.outs.push(indexes);
|
||||
}
|
||||
|
||||
if self.outs.is_empty() { return None }
|
||||
Some((input, &self.outs))
|
||||
},
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
}}
|
||||
|
||||
logical_operation!(struct Union, union);
|
||||
logical_operation!(struct Intersection, intersection);
|
||||
logical_operation!(struct Difference, difference);
|
||||
logical_operation!(struct SymmetricDifference, symmetric_difference);
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::blob::PositiveBlobBuilder;

    /// Scans the stream for `key` and returns the documents of its first
    /// automaton result.
    fn get_exact_key<'m, I, S>(stream: I, key: &[u8]) -> Option<VecReadOnly<DocIndex>>
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], &'a [IndexedDocIndexes])>,
    {
        let mut stream = stream.into_stream();
        while let Some((current, indexes)) = stream.next() {
            if current == key {
                return Some(indexes[0].doc_indexes.clone())
            }
        }
        None
    }

    /// Builds an in-memory positive blob holding `docs` under the key "chameau".
    fn chameau_blob(docs: &[DocIndex]) -> Blob {
        let mapw = Vec::new();
        let indexesw = Vec::new();
        let mut builder = PositiveBlobBuilder::new(mapw, indexesw);

        for &doc in docs {
            builder.insert("chameau", doc);
        }

        Blob::Positive(builder.build().unwrap())
    }

    #[test]
    fn union_two_blobs() {
        let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };

        let meta1 = chameau_blob(&[doc1]);
        let meta2 = chameau_blob(&[doc2]);

        let metas = OpBuilder::new().add(&meta1).add(&meta2).union();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1, doc2][..]);
    }

    #[test]
    fn intersection_two_blobs() {
        let doc1 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
        let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };

        let meta1 = chameau_blob(&[doc1]);
        let meta2 = chameau_blob(&[doc2]);

        let metas = OpBuilder::new().add(&meta1).add(&meta2).intersection();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1][..]);
    }

    #[test]
    fn difference_two_blobs() {
        let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
        let doc3 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };

        let meta1 = chameau_blob(&[doc1, doc2]);
        let meta2 = chameau_blob(&[doc3]);

        let metas = OpBuilder::new().add(&meta1).add(&meta2).difference();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1][..]);
    }

    #[test]
    fn symmetric_difference_two_blobs() {
        let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
        let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
        let doc3 = DocIndex { document_id: 32, attribute: 0, attribute_index: 1 };
        let doc4 = DocIndex { document_id: 34, attribute: 12, attribute_index: 1 };

        let meta1 = chameau_blob(&[doc1, doc2, doc3]);
        let meta2 = chameau_blob(&[doc2, doc3, doc4]);

        let metas = OpBuilder::new().add(&meta1).add(&meta2).symmetric_difference();
        let value = get_exact_key(metas, b"chameau");

        assert_eq!(&*value.unwrap(), &[doc1, doc4][..]);
    }
}
|
|
@ -0,0 +1,203 @@
|
|||
use std::collections::BinaryHeap;
|
||||
use std::rc::Rc;
|
||||
use std::cmp;
|
||||
use fst::raw::{self, Output};
|
||||
use fst::{self, IntoStreamer, Streamer};
|
||||
|
||||
/// Boxed, type-erased stream of `(key, indexed values)` entries.
type BoxedStream<'f> = Box<dyn for<'a> Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])> + 'f>;
|
||||
|
||||
pub struct OpIndexedValueBuilder<'f> {
|
||||
streams: Vec<BoxedStream<'f>>,
|
||||
}
|
||||
|
||||
impl<'f> OpIndexedValueBuilder<'f> {
|
||||
pub fn new() -> Self {
|
||||
Self { streams: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn push<I, S>(&mut self, stream: I)
|
||||
where
|
||||
I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], &'a [raw::IndexedValue])>,
|
||||
S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], &'a [raw::IndexedValue])>,
|
||||
{
|
||||
self.streams.push(Box::new(stream.into_stream()));
|
||||
}
|
||||
|
||||
pub fn union(self) -> UnionIndexedValue<'f> {
|
||||
UnionIndexedValue {
|
||||
heap: StreamIndexedValueHeap::new(self.streams),
|
||||
outs: Vec::new(),
|
||||
cur_slot: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct UnionIndexedValue<'f> {
|
||||
heap: StreamIndexedValueHeap<'f>,
|
||||
outs: Vec<IndexedValue>,
|
||||
cur_slot: Option<SlotIndexedValue>,
|
||||
}
|
||||
|
||||
impl<'f> UnionIndexedValue<'f> {
|
||||
pub fn len(&self) -> usize {
|
||||
self.heap.num_slots()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'm> fst::Streamer<'a> for UnionIndexedValue<'m> {
|
||||
type Item = (&'a [u8], &'a [IndexedValue]);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
if let Some(slot) = self.cur_slot.take() {
|
||||
self.heap.refill(slot);
|
||||
}
|
||||
let slot = match self.heap.pop() {
|
||||
None => return None,
|
||||
Some(slot) => {
|
||||
self.cur_slot = Some(slot);
|
||||
self.cur_slot.as_mut().unwrap()
|
||||
}
|
||||
};
|
||||
self.outs.clear();
|
||||
self.outs.push(slot.indexed_value());
|
||||
while let Some(slot2) = self.heap.pop_if_equal(slot.input()) {
|
||||
self.outs.push(slot2.indexed_value());
|
||||
self.heap.refill(slot2);
|
||||
}
|
||||
Some((slot.input(), &self.outs))
|
||||
}
|
||||
}
|
||||
|
||||
struct StreamIndexedValueHeap<'f> {
|
||||
rdrs: Vec<BoxedStream<'f>>,
|
||||
heap: BinaryHeap<SlotIndexedValue>,
|
||||
}
|
||||
|
||||
impl<'f> StreamIndexedValueHeap<'f> {
|
||||
fn new(streams: Vec<BoxedStream<'f>>) -> StreamIndexedValueHeap<'f> {
|
||||
let mut u = StreamIndexedValueHeap {
|
||||
rdrs: streams,
|
||||
heap: BinaryHeap::new(),
|
||||
};
|
||||
for i in 0..u.rdrs.len() {
|
||||
u.refill(SlotIndexedValue::new(i));
|
||||
}
|
||||
u
|
||||
}
|
||||
|
||||
fn pop(&mut self) -> Option<SlotIndexedValue> {
|
||||
self.heap.pop()
|
||||
}
|
||||
|
||||
fn peek_is_duplicate(&self, key: &[u8]) -> bool {
|
||||
self.heap.peek().map(|s| s.input() == key).unwrap_or(false)
|
||||
}
|
||||
|
||||
fn pop_if_equal(&mut self, key: &[u8]) -> Option<SlotIndexedValue> {
|
||||
if self.peek_is_duplicate(key) {
|
||||
self.pop()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn pop_if_le(&mut self, key: &[u8]) -> Option<SlotIndexedValue> {
|
||||
if self.heap.peek().map(|s| s.input() <= key).unwrap_or(false) {
|
||||
self.pop()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn num_slots(&self) -> usize {
|
||||
self.rdrs.len()
|
||||
}
|
||||
|
||||
fn refill(&mut self, mut slot: SlotIndexedValue) {
|
||||
if let Some((input, ivalues)) = self.rdrs[slot.rdr_index].next() {
|
||||
slot.set_input(input);
|
||||
for values in ivalues {
|
||||
slot.set_aut_index(values.index);
|
||||
slot.set_output(values.value);
|
||||
self.heap.push(slot.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct SlotIndexedValue {
|
||||
rdr_index: usize,
|
||||
aut_index: usize,
|
||||
input: Rc<Vec<u8>>,
|
||||
output: Output,
|
||||
}
|
||||
|
||||
/// A value produced by the union, tagged with where it comes from.
#[derive(Debug)]
pub struct IndexedValue {
    /// Index of the stream the value was read from.
    pub rdr_index: usize,
    /// Index that was attached to the value in the stream it was read from.
    pub aut_index: usize,
    /// The raw value.
    pub value: u64,
}
|
||||
|
||||
impl PartialEq for SlotIndexedValue {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
(&self.input, self.rdr_index, self.aut_index, self.output)
|
||||
.eq(&(&other.input, other.rdr_index, other.aut_index, other.output))
|
||||
}
|
||||
}
|
||||
|
||||
// Equality on slots is a full equivalence relation: every compared field is `Eq`.
impl Eq for SlotIndexedValue {}
|
||||
|
||||
impl PartialOrd for SlotIndexedValue {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
|
||||
(&self.input, self.rdr_index, self.aut_index, self.output)
|
||||
.partial_cmp(&(&other.input, other.rdr_index, other.aut_index, other.output))
|
||||
.map(|ord| ord.reverse())
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for SlotIndexedValue {
|
||||
fn cmp(&self, other: &Self) -> cmp::Ordering {
|
||||
self.partial_cmp(other).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
impl SlotIndexedValue {
|
||||
fn new(rdr_index: usize) -> SlotIndexedValue {
|
||||
SlotIndexedValue {
|
||||
rdr_index: rdr_index,
|
||||
aut_index: 0,
|
||||
input: Rc::new(Vec::with_capacity(64)),
|
||||
output: Output::zero(),
|
||||
}
|
||||
}
|
||||
|
||||
fn indexed_value(&self) -> IndexedValue {
|
||||
IndexedValue {
|
||||
rdr_index: self.rdr_index,
|
||||
aut_index: self.aut_index,
|
||||
value: self.output.value(),
|
||||
}
|
||||
}
|
||||
|
||||
fn input(&self) -> &[u8] {
|
||||
&self.input
|
||||
}
|
||||
|
||||
fn set_aut_index(&mut self, aut_index: usize) {
|
||||
self.aut_index = aut_index;
|
||||
}
|
||||
|
||||
fn set_input(&mut self, input: &[u8]) {
|
||||
if *self.input != input {
|
||||
let inner = Rc::make_mut(&mut self.input);
|
||||
inner.clear();
|
||||
inner.extend(input);
|
||||
}
|
||||
}
|
||||
|
||||
fn set_output(&mut self, output: u64) {
|
||||
self.output = Output::new(output);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::io::Write;
|
||||
|
||||
use fst::{Map, MapBuilder};
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
|
||||
pub struct PositiveBlob {
|
||||
map: Map,
|
||||
indexes: DocIndexes,
|
||||
}
|
||||
|
||||
impl PositiveBlob {
|
||||
pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
|
||||
where P: AsRef<Path>,
|
||||
Q: AsRef<Path>,
|
||||
{
|
||||
let map = Map::from_path(map)?;
|
||||
let indexes = DocIndexes::from_path(indexes)?;
|
||||
Ok(PositiveBlob { map, indexes })
|
||||
}
|
||||
|
||||
pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Ok(PositiveBlob { map, indexes })
|
||||
}
|
||||
|
||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
|
||||
self.map.get(key).and_then(|index| self.indexes.get(index))
|
||||
}
|
||||
|
||||
pub fn as_map(&self) -> &Map {
|
||||
&self.map
|
||||
}
|
||||
|
||||
pub fn as_indexes(&self) -> &DocIndexes {
|
||||
&self.indexes
|
||||
}
|
||||
|
||||
pub fn explode(self) -> (Map, DocIndexes) {
|
||||
(self.map, self.indexes)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PositiveBlobBuilder<W, X> {
|
||||
map: W,
|
||||
indexes: DocIndexesBuilder<X>,
|
||||
}
|
||||
|
||||
impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
|
||||
pub fn new(map: W, indexes: X) -> Self {
|
||||
Self { map, indexes: DocIndexesBuilder::new(indexes) }
|
||||
}
|
||||
|
||||
pub fn insert<S: Into<String>>(&mut self, key: S, index: DocIndex) {
|
||||
self.indexes.insert(key.into(), index)
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Result<(), Box<Error>> {
|
||||
self.into_inner().map(|_| ())
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
|
||||
// FIXME insert a magic number that indicates if the endianess
|
||||
// of the input is the same as the machine that is reading it.
|
||||
|
||||
let map = {
|
||||
let mut keys_builder = MapBuilder::new(self.map)?;
|
||||
let keys = self.indexes.keys().map(|(s, v)| (s, *v));
|
||||
keys_builder.extend_iter(keys)?;
|
||||
keys_builder.into_inner()?
|
||||
};
|
||||
|
||||
let indexes = self.indexes.into_inner()?;
|
||||
|
||||
Ok((map, indexes))
|
||||
}
|
||||
}
|
||||
|
||||
impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> {
|
||||
pub fn build(self) -> Result<PositiveBlob, Box<Error>> {
|
||||
self.into_inner().and_then(|(m, i)| PositiveBlob::from_bytes(m, i))
|
||||
}
|
||||
}
|
|
@ -0,0 +1,200 @@
|
|||
use std::collections::btree_map::{BTreeMap, Iter, Entry};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::io::{self, Write};
|
||||
use std::path::Path;
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
use std::mem;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use crate::DocIndex;
|
||||
|
||||
/// Bounds of one entry inside the flat list of doc indexes.
///
/// `#[repr(C)]` keeps the field order fixed: the struct is written to and
/// read back from raw bytes.
#[repr(C)]
struct Range {
    /// Position of the first element of the entry.
    start: u64,
    /// Position one past the last element of the entry.
    end: u64,
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum DocIndexesData {
|
||||
Shared {
|
||||
vec: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
},
|
||||
Mmap(MmapReadOnly),
|
||||
}
|
||||
|
||||
impl Deref for DocIndexesData {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
match self {
|
||||
DocIndexesData::Shared { vec, offset, len } => {
|
||||
&vec[*offset..offset + len]
|
||||
},
|
||||
DocIndexesData::Mmap(m) => m.as_slice(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct DocIndexes {
|
||||
ranges: DocIndexesData,
|
||||
indexes: DocIndexesData,
|
||||
}
|
||||
|
||||
impl DocIndexes {
|
||||
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
|
||||
let range_len = mmap.as_slice().read_u64::<LittleEndian>()?;
|
||||
let range_len = range_len as usize * mem::size_of::<Range>();
|
||||
|
||||
let offset = mem::size_of::<u64>() as usize;
|
||||
let ranges = DocIndexesData::Mmap(mmap.range(offset, range_len));
|
||||
|
||||
let len = mmap.len() - range_len - offset;
|
||||
let offset = offset + range_len;
|
||||
let indexes = DocIndexesData::Mmap(mmap.range(offset, len));
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
|
||||
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
||||
let vec = Arc::new(vec);
|
||||
|
||||
let range_len = vec.as_slice().read_u64::<LittleEndian>()?;
|
||||
let range_len = range_len as usize * mem::size_of::<Range>();
|
||||
|
||||
let offset = mem::size_of::<u64>() as usize;
|
||||
let ranges = DocIndexesData::Shared {
|
||||
vec: vec.clone(),
|
||||
offset,
|
||||
len: range_len
|
||||
};
|
||||
|
||||
let len = vec.len() - range_len - offset;
|
||||
let offset = offset + range_len;
|
||||
let indexes = DocIndexesData::Shared { vec, offset, len };
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
|
||||
pub fn get(&self, index: u64) -> Option<&[DocIndex]> {
|
||||
self.ranges().get(index as usize).map(|Range { start, end }| {
|
||||
let start = *start as usize;
|
||||
let end = *end as usize;
|
||||
&self.indexes()[start..end]
|
||||
})
|
||||
}
|
||||
|
||||
fn ranges(&self) -> &[Range] {
|
||||
let slice = &self.ranges;
|
||||
let ptr = slice.as_ptr() as *const Range;
|
||||
let len = slice.len() / mem::size_of::<Range>();
|
||||
unsafe { from_raw_parts(ptr, len) }
|
||||
}
|
||||
|
||||
fn indexes(&self) -> &[DocIndex] {
|
||||
let slice = &self.indexes;
|
||||
let ptr = slice.as_ptr() as *const DocIndex;
|
||||
let len = slice.len() / mem::size_of::<DocIndex>();
|
||||
unsafe { from_raw_parts(ptr, len) }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocIndexesBuilder<W> {
|
||||
keys: BTreeMap<String, u64>,
|
||||
indexes: Vec<Vec<DocIndex>>,
|
||||
number_docs: usize,
|
||||
wtr: W,
|
||||
}
|
||||
|
||||
impl<W: Write> DocIndexesBuilder<W> {
|
||||
pub fn new(wtr: W) -> Self {
|
||||
Self {
|
||||
keys: BTreeMap::new(),
|
||||
indexes: Vec::new(),
|
||||
number_docs: 0,
|
||||
wtr: wtr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn number_doc_indexes(&self) -> usize {
|
||||
self.number_docs
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, key: String, value: DocIndex) {
|
||||
match self.keys.entry(key) {
|
||||
Entry::Vacant(e) => {
|
||||
let index = self.indexes.len() as u64;
|
||||
self.indexes.push(vec![value]);
|
||||
e.insert(index);
|
||||
},
|
||||
Entry::Occupied(e) => {
|
||||
let index = *e.get();
|
||||
let vec = &mut self.indexes[index as usize];
|
||||
vec.push(value);
|
||||
},
|
||||
}
|
||||
self.number_docs += 1;
|
||||
}
|
||||
|
||||
pub fn keys(&self) -> Iter<String, u64> {
|
||||
self.keys.iter()
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<()> {
|
||||
self.into_inner().map(|_| ())
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
|
||||
for vec in &mut self.indexes {
|
||||
vec.sort_unstable();
|
||||
}
|
||||
|
||||
let (ranges, values) = into_sliced_ranges(self.indexes, self.number_docs);
|
||||
let len = ranges.len() as u64;
|
||||
|
||||
// TODO check if this is correct
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
unsafe {
|
||||
// write Ranges first
|
||||
let slice = into_u8_slice(ranges.as_slice());
|
||||
self.wtr.write_all(slice)?;
|
||||
|
||||
// write Values after
|
||||
let slice = into_u8_slice(values.as_slice());
|
||||
self.wtr.write_all(slice)?;
|
||||
}
|
||||
|
||||
self.wtr.flush()?;
|
||||
Ok(self.wtr)
|
||||
}
|
||||
}
|
||||
|
||||
fn into_sliced_ranges<T>(vecs: Vec<Vec<T>>, number_docs: usize) -> (Vec<Range>, Vec<T>) {
|
||||
let cap = vecs.len();
|
||||
let mut ranges = Vec::with_capacity(cap);
|
||||
let mut values = Vec::with_capacity(number_docs);
|
||||
|
||||
for v in &vecs {
|
||||
let len = v.len() as u64;
|
||||
let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0);
|
||||
|
||||
let range = Range { start, end: start + len };
|
||||
ranges.push(range);
|
||||
}
|
||||
|
||||
values.extend(vecs.into_iter().flatten());
|
||||
|
||||
(ranges, values)
|
||||
}
|
||||
|
||||
/// Reinterprets a slice of `T` as the bytes it occupies in memory.
///
/// # Safety
///
/// The returned bytes are the in-memory representation of the elements.
/// NOTE(review): presumably the caller must make sure `T` has no padding or
/// otherwise uninitialized bytes — confirm for the types used with it.
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
    let byte_len = mem::size_of_val(slice);
    from_raw_parts(slice.as_ptr() as *const u8, byte_len)
}
|
|
@ -0,0 +1,24 @@
|
|||
use std::path::{Path, PathBuf};
|
||||
use std::error::Error;
|
||||
|
||||
use crate::rank::Document;
|
||||
use crate::blob::Blob;
|
||||
|
||||
pub struct Index {
|
||||
path: PathBuf,
|
||||
blobs: Vec<Blob>,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
pub fn open(path: &Path) -> Result<Self, Box<Error>> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn create(path: &Path) -> Result<Self, Box<Error>> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn blobs(&self) -> &[Blob] {
|
||||
&self.blobs
|
||||
}
|
||||
}
|
|
@ -2,6 +2,11 @@
|
|||
|
||||
#[macro_use] extern crate lazy_static;
|
||||
|
||||
pub mod index;
|
||||
pub mod pentium;
|
||||
pub mod blob;
|
||||
pub mod doc_indexes;
|
||||
|
||||
pub mod rank;
|
||||
pub mod metadata;
|
||||
pub mod vec_read_only;
|
||||
|
|
|
@ -64,13 +64,13 @@ mod tests {
|
|||
#[test]
|
||||
fn empty() {
|
||||
let positive_metas = construct_metadata(vec![
|
||||
("chameau".into(), DocIndex{ document: 12, attribute: 1, attribute_index: 22 }),
|
||||
("chameau".into(), DocIndex{ document: 31, attribute: 0, attribute_index: 1 }),
|
||||
("chameau".into(), DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 }),
|
||||
("chameau".into(), DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 }),
|
||||
]);
|
||||
|
||||
let negative_metas = construct_metadata(vec![
|
||||
("chameau".into(), DocIndex{ document: 12, attribute: 1, attribute_index: 22 }),
|
||||
("chameau".into(), DocIndex{ document: 31, attribute: 0, attribute_index: 1 }),
|
||||
("chameau".into(), DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 }),
|
||||
("chameau".into(), DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 }),
|
||||
]);
|
||||
|
||||
let positives = &[positive_metas];
|
||||
|
@ -82,8 +82,8 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn one_positive() {
|
||||
let di1 = DocIndex{ document: 12, attribute: 1, attribute_index: 22 };
|
||||
let di2 = DocIndex{ document: 31, attribute: 0, attribute_index: 1 };
|
||||
let di1 = DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 };
|
||||
let di2 = DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 };
|
||||
|
||||
let positive_metas = construct_metadata(vec![
|
||||
("chameau".into(), di1),
|
||||
|
@ -105,8 +105,8 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn more_negative_than_positive() {
|
||||
let di1 = DocIndex{ document: 12, attribute: 1, attribute_index: 22 };
|
||||
let di2 = DocIndex{ document: 31, attribute: 0, attribute_index: 1 };
|
||||
let di1 = DocIndex{ document_id: 12, attribute: 1, attribute_index: 22 };
|
||||
let di2 = DocIndex{ document_id: 31, attribute: 0, attribute_index: 1 };
|
||||
|
||||
let positive_metas = construct_metadata(vec![
|
||||
("chameau".into(), di1),
|
||||
|
|
|
@ -107,7 +107,7 @@ mod tests {
|
|||
|
||||
let mut builder = MetadataBuilder::new(mapw, indexesw);
|
||||
|
||||
let doc = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
|
||||
let doc = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
|
||||
builder.insert("chameau".into(), doc);
|
||||
|
||||
let (map, indexes) = builder.into_inner().unwrap();
|
||||
|
@ -123,8 +123,8 @@ mod tests {
|
|||
|
||||
let mut builder = MetadataBuilder::new(mapw, indexesw);
|
||||
|
||||
let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
|
||||
let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
|
||||
let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
|
||||
let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
|
||||
builder.insert("chameau".into(), doc1);
|
||||
builder.insert("chameau".into(), doc2);
|
||||
|
||||
|
|
|
@ -189,8 +189,8 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn union_two_metadata() {
|
||||
let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
|
||||
let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
|
||||
let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
|
||||
let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
|
||||
|
||||
let meta1 = {
|
||||
let mapw = Vec::new();
|
||||
|
@ -222,8 +222,8 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn intersection_two_metadata() {
|
||||
let doc1 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
|
||||
let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
|
||||
let doc1 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
|
||||
let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
|
||||
|
||||
let meta1 = {
|
||||
let mapw = Vec::new();
|
||||
|
@ -255,9 +255,9 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn difference_two_metadata() {
|
||||
let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
|
||||
let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
|
||||
let doc3 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
|
||||
let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
|
||||
let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
|
||||
let doc3 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
|
||||
|
||||
let meta1 = {
|
||||
let mapw = Vec::new();
|
||||
|
@ -290,10 +290,10 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn symmetric_difference_two_metadata() {
|
||||
let doc1 = DocIndex { document: 12, attribute: 1, attribute_index: 22 };
|
||||
let doc2 = DocIndex { document: 31, attribute: 0, attribute_index: 1 };
|
||||
let doc3 = DocIndex { document: 32, attribute: 0, attribute_index: 1 };
|
||||
let doc4 = DocIndex { document: 34, attribute: 12, attribute_index: 1 };
|
||||
let doc1 = DocIndex { document_id: 12, attribute: 1, attribute_index: 22 };
|
||||
let doc2 = DocIndex { document_id: 31, attribute: 0, attribute_index: 1 };
|
||||
let doc3 = DocIndex { document_id: 32, attribute: 0, attribute_index: 1 };
|
||||
let doc4 = DocIndex { document_id: 34, attribute: 12, attribute_index: 1 };
|
||||
|
||||
let meta1 = {
|
||||
let mapw = Vec::new();
|
||||
|
|
|
@ -0,0 +1,28 @@
|
|||
use std::error::Error;
|
||||
|
||||
use crate::automaton;
|
||||
use crate::rank::Document;
|
||||
use crate::index::Index;
|
||||
|
||||
pub struct Pentium {
|
||||
index: Index,
|
||||
}
|
||||
|
||||
impl Pentium {
|
||||
pub fn from_index(index: Index) -> Result<Self, Box<Error>> {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
pub fn search(&self, query: &str) -> Vec<Document> {
|
||||
|
||||
let mut automatons = Vec::new();
|
||||
for word in query.split_whitespace().map(str::to_lowercase) {
|
||||
let dfa = automaton::build_prefix_dfa(&word);
|
||||
automatons.push(dfa);
|
||||
}
|
||||
|
||||
let stream = unimplemented!();
|
||||
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
|
@ -9,8 +9,8 @@ use fst::Streamer;
|
|||
use group_by::GroupByMut;
|
||||
|
||||
use crate::automaton::{DfaExt, AutomatonExt};
|
||||
use crate::metadata::Metadata;
|
||||
use crate::metadata::ops::OpBuilder;
|
||||
use crate::index::Index;
|
||||
use crate::blob::{Blob, Merge};
|
||||
use crate::rank::criterion::Criterion;
|
||||
use crate::rank::Document;
|
||||
use crate::{Match, DocumentId};
|
||||
|
@ -22,28 +22,26 @@ fn clamp_range<T: Copy + Ord>(range: Range<T>, big: Range<T>) -> Range<T> {
|
|||
}
|
||||
}
|
||||
|
||||
pub struct Config<'m, C, F> {
|
||||
pub metadata: &'m Metadata,
|
||||
pub struct Config<C, F> {
|
||||
pub index: Index,
|
||||
pub automatons: Vec<DfaExt>,
|
||||
pub criteria: Vec<C>,
|
||||
pub distinct: (F, usize),
|
||||
}
|
||||
|
||||
pub struct RankedStream<'m, C, F> {
|
||||
stream: crate::metadata::ops::Union<'m>,
|
||||
stream: crate::blob::Merge<'m>,
|
||||
automatons: Vec<Rc<DfaExt>>,
|
||||
criteria: Vec<C>,
|
||||
distinct: (F, usize),
|
||||
}
|
||||
|
||||
impl<'m, C, F> RankedStream<'m, C, F> {
|
||||
pub fn new(config: Config<'m, C, F>) -> Self {
|
||||
pub fn new(config: Config<C, F>) -> Self {
|
||||
let automatons: Vec<_> = config.automatons.into_iter().map(Rc::new).collect();
|
||||
let mut builder = OpBuilder::with_automatons(automatons.clone());
|
||||
builder.push(config.metadata);
|
||||
|
||||
RankedStream {
|
||||
stream: builder.union(),
|
||||
stream: Merge::with_automatons(automatons.clone(), unimplemented!()),
|
||||
automatons: automatons,
|
||||
criteria: config.criteria,
|
||||
distinct: config.distinct,
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
use std::fmt;
|
||||
|
||||
#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
#[derive(Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
|
||||
pub struct VecReadOnly<T> {
|
||||
inner: Arc<Vec<T>>,
|
||||
offset: usize,
|
||||
|
@ -42,3 +43,9 @@ impl<T> Deref for VecReadOnly<T> {
|
|||
self.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: fmt::Debug> fmt::Debug for VecReadOnly<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
self.inner.fmt(f)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue