feat: Index and store/serialize attributes while creating the update
This commit is contained in:
parent
442834c28f
commit
731ed11153
|
@ -9,24 +9,9 @@ use crate::database::deserializer::{Deserializer, DeserializerError};
|
|||
use crate::database::{DATA_INDEX, DATA_SCHEMA};
|
||||
use crate::blob::positive::PositiveBlob;
|
||||
use crate::index::schema::Schema;
|
||||
use crate::database::{DocumentKey, DocumentKeyAttr};
|
||||
use crate::database::{retrieve_data_schema, DocumentKey, DocumentKeyAttr};
|
||||
use crate::DocumentId;
|
||||
|
||||
// FIXME Do not panic!
|
||||
fn retrieve_data_schema(snapshot: &Snapshot<&DB>) -> Result<Schema, Box<Error>> {
|
||||
match snapshot.get(DATA_SCHEMA)? {
|
||||
Some(vector) => Ok(Schema::read_from(&*vector)?),
|
||||
None => panic!("BUG: no schema found in the database"),
|
||||
}
|
||||
}
|
||||
|
||||
fn retrieve_data_index(snapshot: &Snapshot<&DB>) -> Result<PositiveBlob, Box<Error>> {
|
||||
match snapshot.get(DATA_INDEX)? {
|
||||
Some(vector) => Ok(bincode::deserialize(&*vector)?),
|
||||
None => Ok(PositiveBlob::default()),
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DatabaseView<'a> {
|
||||
snapshot: Snapshot<&'a DB>,
|
||||
schema: Schema,
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::ops::Deref;
|
||||
use std::fmt;
|
||||
|
||||
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
|
||||
use rocksdb::{DB, DBVector, MergeOperands, SeekKey};
|
||||
use rocksdb::rocksdb::Writable;
|
||||
use rocksdb::rocksdb::{Writable, Snapshot};
|
||||
|
||||
pub use crate::database::database_view::DatabaseView;
|
||||
pub use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
||||
pub use crate::database::database_view::DatabaseView;
|
||||
use crate::index::update::Update;
|
||||
use crate::index::schema::Schema;
|
||||
use crate::blob::positive::PositiveBlob;
|
||||
use crate::blob::{self, Blob};
|
||||
|
||||
mod document_key;
|
||||
|
@ -19,6 +21,24 @@ mod deserializer;
|
|||
const DATA_INDEX: &[u8] = b"data-index";
|
||||
const DATA_SCHEMA: &[u8] = b"data-schema";
|
||||
|
||||
pub fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
match snapshot.get(DATA_SCHEMA)? {
|
||||
Some(vector) => Ok(Schema::read_from(&*vector)?),
|
||||
None => Err(String::from("BUG: no schema found in the database").into()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<PositiveBlob, Box<Error>>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
match snapshot.get(DATA_INDEX)? {
|
||||
Some(vector) => Ok(bincode::deserialize(&*vector)?),
|
||||
None => Ok(PositiveBlob::default()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Newtype wrapper around a RocksDB `DB` handle.
pub struct Database(DB);
|
||||
|
||||
impl Database {
|
||||
|
@ -162,14 +182,14 @@ mod tests {
|
|||
struct SimpleDoc {
|
||||
title: String,
|
||||
description: String,
|
||||
timestamp: u64,
|
||||
}
|
||||
|
||||
let title;
|
||||
let description;
|
||||
let schema = {
|
||||
let mut builder = SchemaBuilder::new();
|
||||
title = builder.new_attribute("title", STORED | INDEXED);
|
||||
description = builder.new_attribute("description", STORED | INDEXED);
|
||||
builder.new_attribute("title", STORED | INDEXED);
|
||||
builder.new_attribute("description", STORED | INDEXED);
|
||||
builder.new_attribute("timestamp", STORED);
|
||||
builder.build()
|
||||
};
|
||||
|
||||
|
@ -181,21 +201,17 @@ mod tests {
|
|||
let doc0 = SimpleDoc {
|
||||
title: String::from("I am a title"),
|
||||
description: String::from("I am a description"),
|
||||
timestamp: 1234567,
|
||||
};
|
||||
let doc1 = SimpleDoc {
|
||||
title: String::from("I am the second title"),
|
||||
description: String::from("I am the second description"),
|
||||
timestamp: 7654321,
|
||||
};
|
||||
|
||||
let mut update = {
|
||||
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder);
|
||||
|
||||
// builder.update_field(0, title, doc0.title.clone());
|
||||
// builder.update_field(0, description, doc0.description.clone());
|
||||
|
||||
// builder.update_field(1, title, doc1.title.clone());
|
||||
// builder.update_field(1, description, doc1.description.clone());
|
||||
|
||||
builder.update(0, &doc0).unwrap();
|
||||
builder.update(1, &doc1).unwrap();
|
||||
|
||||
|
@ -206,19 +222,9 @@ mod tests {
|
|||
database.ingest_update_file(update)?;
|
||||
let view = database.view()?;
|
||||
|
||||
println!("{:?}", view);
|
||||
|
||||
#[derive(Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
struct DeSimpleDoc {
|
||||
title: char,
|
||||
}
|
||||
|
||||
let de_doc0: SimpleDoc = view.retrieve_document(0)?;
|
||||
let de_doc1: SimpleDoc = view.retrieve_document(1)?;
|
||||
|
||||
println!("{:?}", de_doc0);
|
||||
println!("{:?}", de_doc1);
|
||||
|
||||
assert_eq!(doc0, de_doc0);
|
||||
assert_eq!(doc1, de_doc1);
|
||||
|
||||
|
|
|
@ -25,6 +25,7 @@ pub struct PositiveUpdateBuilder<B> {
|
|||
path: PathBuf,
|
||||
schema: Schema,
|
||||
tokenizer_builder: B,
|
||||
builder: UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
||||
}
|
||||
|
||||
|
@ -34,14 +35,19 @@ impl<B> PositiveUpdateBuilder<B> {
|
|||
path: path.into(),
|
||||
schema: schema,
|
||||
tokenizer_builder: tokenizer_builder,
|
||||
builder: UnorderedPositiveBlobBuilder::memory(),
|
||||
new_states: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>> {
|
||||
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
let serializer = Serializer {
|
||||
schema: &self.schema,
|
||||
document_id: id,
|
||||
tokenizer_builder: &self.tokenizer_builder,
|
||||
builder: &mut self.builder,
|
||||
new_states: &mut self.new_states
|
||||
};
|
||||
|
||||
|
@ -90,9 +96,11 @@ impl fmt::Display for SerializerError {
|
|||
|
||||
impl Error for SerializerError {}
|
||||
|
||||
struct Serializer<'a> {
|
||||
struct Serializer<'a, B> {
|
||||
schema: &'a Schema,
|
||||
tokenizer_builder: &'a B,
|
||||
document_id: DocumentId,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: &'a mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
||||
}
|
||||
|
||||
|
@ -106,7 +114,9 @@ macro_rules! forward_to_unserializable_type {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> ser::Serializer for Serializer<'a> {
|
||||
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
@ -114,7 +124,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
|||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = StructSerializer<'a>;
|
||||
type SerializeStruct = StructSerializer<'a, B>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
|
@ -238,7 +248,9 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
|||
{
|
||||
Ok(StructSerializer {
|
||||
schema: self.schema,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
builder: self.builder,
|
||||
new_states: self.new_states,
|
||||
})
|
||||
}
|
||||
|
@ -255,33 +267,17 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
fn serialize_field(
|
||||
schema: &Schema,
|
||||
document_id: DocumentId,
|
||||
new_states: &mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
||||
name: &str,
|
||||
value: Vec<u8>,
|
||||
) -> Result<(), SerializerError>
|
||||
{
|
||||
match schema.attribute(name) {
|
||||
Some(attr) => {
|
||||
let props = schema.props(attr);
|
||||
if props.is_stored() {
|
||||
new_states.insert((document_id, attr), NewState::Updated { value });
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
None => Err(SerializerError::SchemaDontMatch { attribute: name.to_owned() }),
|
||||
}
|
||||
}
|
||||
|
||||
struct StructSerializer<'a> {
|
||||
struct StructSerializer<'a, B> {
|
||||
schema: &'a Schema,
|
||||
tokenizer_builder: &'a B,
|
||||
document_id: DocumentId,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: &'a mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
||||
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
|
@ -292,11 +288,26 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
|||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let value = match bincode::serialize(value) {
|
||||
Ok(value) => value,
|
||||
Err(e) => return Err(SerializerError::UnserializableType { name: "???" }),
|
||||
};
|
||||
serialize_field(self.schema, self.document_id, self.new_states, key, value)
|
||||
match self.schema.attribute(key) {
|
||||
Some(attr) => {
|
||||
let props = self.schema.props(attr);
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
self.new_states.insert((self.document_id, attr), NewState::Updated { value });
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
builder: self.builder,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
};
|
||||
value.serialize(serializer)?;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
None => Err(SerializerError::SchemaDontMatch { attribute: key.to_owned() }),
|
||||
}
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
|
@ -304,52 +315,181 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
impl<B> PositiveUpdateBuilder<B>
|
||||
struct IndexerSerializer<'a, B> {
|
||||
tokenizer_builder: &'a B,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
for (index, word) in self.tokenizer_builder.build(v) {
|
||||
let doc_index = DocIndex {
|
||||
document_id: self.document_id,
|
||||
attribute: self.attribute.as_u32() as u8,
|
||||
attribute_index: index as u32,
|
||||
};
|
||||
|
||||
// insert the exact representation
|
||||
let word_lower = word.to_lowercase();
|
||||
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
self.builder.insert(word_unidecoded, doc_index);
|
||||
}
|
||||
|
||||
self.builder.insert(word_lower, doc_index);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "seq" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
impl<B> PositiveUpdateBuilder<B> {
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.path.to_string_lossy())?;
|
||||
|
||||
let mut builder = UnorderedPositiveBlobBuilder::memory();
|
||||
for ((document_id, attr), state) in &self.new_states {
|
||||
let props = self.schema.props(*attr);
|
||||
let value = match state {
|
||||
NewState::Updated { value } if props.is_indexed() => value,
|
||||
_ => continue,
|
||||
};
|
||||
|
||||
let value: String = match bincode::deserialize(&value) {
|
||||
Ok(value) => value,
|
||||
Err(e) => {
|
||||
eprintln!("{}", e);
|
||||
continue
|
||||
},
|
||||
};
|
||||
|
||||
for (index, word) in self.tokenizer_builder.build(&value) {
|
||||
let doc_index = DocIndex {
|
||||
document_id: *document_id,
|
||||
attribute: attr.as_u32() as u8,
|
||||
attribute_index: index as u32,
|
||||
};
|
||||
|
||||
// insert the exact representation
|
||||
let word_lower = word.to_lowercase();
|
||||
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
builder.insert(word_unidecoded, doc_index);
|
||||
}
|
||||
|
||||
builder.insert(word_lower, doc_index);
|
||||
}
|
||||
}
|
||||
|
||||
let (blob_fst_map, blob_doc_idx) = builder.into_inner()?;
|
||||
let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?;
|
||||
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
|
||||
let blob = Blob::Positive(positive_blob);
|
||||
|
||||
|
|
|
@ -2,7 +2,6 @@ pub mod automaton;
|
|||
pub mod blob;
|
||||
pub mod database;
|
||||
pub mod data;
|
||||
pub mod retrieve;
|
||||
pub mod index;
|
||||
pub mod rank;
|
||||
pub mod tokenizer;
|
||||
|
|
|
@ -11,9 +11,9 @@ use fst::Streamer;
|
|||
use crate::automaton::{self, DfaExt, AutomatonExt};
|
||||
use crate::rank::criterion::{self, Criterion};
|
||||
use crate::rank::distinct_map::DistinctMap;
|
||||
use crate::database::retrieve_data_index;
|
||||
use crate::blob::PositiveBlob;
|
||||
use crate::{Match, DocumentId};
|
||||
use crate::retrieve::Retrieve;
|
||||
use crate::rank::Document;
|
||||
|
||||
fn clamp_range<T: Copy + Ord>(range: Range<T>, big: Range<T>) -> Range<T> {
|
||||
|
@ -48,7 +48,7 @@ impl<T, C> QueryBuilder<T, C>
|
|||
where T: Deref<Target=DB>,
|
||||
{
|
||||
pub fn with_criteria(snapshot: Snapshot<T>, criteria: Vec<C>) -> Result<Self, Box<Error>> {
|
||||
let blob = snapshot.data_index()?;
|
||||
let blob = retrieve_data_index(&snapshot)?;
|
||||
Ok(QueryBuilder { snapshot, blob, criteria })
|
||||
}
|
||||
|
||||
|
|
|
@ -1,60 +0,0 @@
|
|||
use std::error::Error;
|
||||
use std::ops::Deref;
|
||||
|
||||
use ::rocksdb::rocksdb::{DB, Snapshot, DBVector};
|
||||
|
||||
use crate::index::schema::{Schema, SchemaAttr};
|
||||
use crate::blob::PositiveBlob;
|
||||
use crate::DocumentId;
|
||||
|
||||
/// Pairs a borrowed retriever `R` with the `Schema` read from it.
///
/// Instances are created by `Retrieve::doc_database`.
pub struct DocDatabase<'a, R: ?Sized> {
|
||||
// Borrowed retriever this value was created from.
retrieve: &'a R,
|
||||
// Schema obtained from the retriever at construction time.
schema: Schema,
|
||||
}
|
||||
|
||||
// Accessors of `DocDatabase`; both are stubs for now.
impl<'a, R> DocDatabase<'a, R> {
|
||||
/// Not implemented yet: calling this always panics via `unimplemented!()`.
pub fn get_document<D>(&self, id: DocumentId) -> Result<Option<D>, Box<Error>> {
|
||||
// if ids.is_empty() { return Ok(Vec::new()) }
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
/// Not implemented yet: calling this always panics via `unimplemented!()`.
pub fn get_document_attribute(&self, id: DocumentId, attr: SchemaAttr) -> Result<DBVector, Box<Error>> {
|
||||
unimplemented!()
|
||||
}
|
||||
}
|
||||
|
||||
/// Read access to the schema, the data index and the documents of a source.
pub trait Retrieve {
|
||||
/// Returns the stored schema, or `None` when the source holds none.
fn schema(&self) -> Result<Option<Schema>, Box<Error>>;
|
||||
/// Returns the stored data index as a `PositiveBlob`.
fn data_index(&self) -> Result<PositiveBlob, Box<Error>>;
|
||||
/// Returns a `DocDatabase` borrowing `self`.
fn doc_database(&self) -> Result<DocDatabase<Self>, Box<Error>>;
|
||||
}
|
||||
|
||||
impl<T> Retrieve for Snapshot<T>
|
||||
where T: Deref<Target=DB>,
|
||||
{
|
||||
fn schema(&self) -> Result<Option<Schema>, Box<Error>> {
|
||||
match self.deref().get(b"data-schema")? {
|
||||
Some(value) => Ok(Some(Schema::read_from(&*value)?)),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn data_index(&self) -> Result<PositiveBlob, Box<Error>> {
|
||||
match self.deref().get(b"data-index")? {
|
||||
Some(value) => Ok(bincode::deserialize(&value)?),
|
||||
None => Ok(PositiveBlob::default()),
|
||||
}
|
||||
}
|
||||
|
||||
fn doc_database(&self) -> Result<DocDatabase<Self>, Box<Error>> {
|
||||
let schema = match self.schema()? {
|
||||
Some(schema) => schema,
|
||||
None => return Err(String::from("BUG: could not find schema").into()),
|
||||
};
|
||||
|
||||
Ok(DocDatabase {
|
||||
retrieve: self,
|
||||
schema: schema,
|
||||
})
|
||||
}
|
||||
}
|
|
@ -1,52 +1,6 @@
|
|||
use std::mem;
|
||||
use self::Separator::*;
|
||||
|
||||
/// Tokenizer skeleton over a source of strings `I`.
///
/// It can be built from a single `String` or from a `Vec<String>`. The
/// iteration itself is not implemented yet.
struct MegaTokenizer<I> {
    strings: I,
}

/// A single string is stored as `Some(string)`.
impl From<String> for MegaTokenizer<Option<String>> {
    fn from(string: String) -> Self {
        let strings = Some(string);
        MegaTokenizer { strings }
    }
}

/// A vector of strings is stored unchanged.
impl From<Vec<String>> for MegaTokenizer<Vec<String>> {
    fn from(strings: Vec<String>) -> Self {
        Self { strings }
    }
}

/// Yields `(usize, String)` pairs.
impl<I> Iterator for MegaTokenizer<I> {
    type Item = (usize, String);

    /// Not implemented yet: always panics via `unimplemented!()`.
    fn next(&mut self) -> Option<Self::Item> {
        unimplemented!()
    }
}
|
||||
|
||||
#[test]
fn xxx() {
    // One string: words are numbered consecutively from zero.
    let mut single = MegaTokenizer::from("hello world!".to_owned());

    assert_eq!(single.next(), Some((0, "hello".into())));
    assert_eq!(single.next(), Some((1, "world".into())));
    assert_eq!(single.next(), None);

    // Several strings: the expected indexes jump from 1 to 8 between the
    // first and the second string.
    let names = vec!["Vin Diesel".to_owned(), "Quentin Tarantino".to_owned()];
    let mut many = MegaTokenizer::from(names);

    assert_eq!(many.next(), Some((0, "Vin".into())));
    assert_eq!(many.next(), Some((1, "Diesel".into())));

    assert_eq!(many.next(), Some((8, "Quentin".into())));
    assert_eq!(many.next(), Some((9, "Tarantino".into())));

    assert_eq!(many.next(), None);
}
|
||||
|
||||
/// Factory of tokenizers over borrowed text.
pub trait TokenizerBuilder {
|
||||
/// Returns a boxed iterator of `(usize, &str)` pairs whose string slices
/// borrow from `text`. The `usize` is presumably the position of the word;
/// confirm against the implementors.
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue