Make tantivy work in the browser with a statically hosted database and on-demand fetching #1067

Draft: wants to merge 9 commits into base branch main (changes shown from all commits).
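Taken together, the changes below follow one pattern: byte lengths and offsets move from usize to tantivy_fst::Ulen so offsets stay 64-bit even on 32-bit wasm targets, file contents flow through lazily fetched slices (the FakeArr abstraction) instead of memory-mapped buffers, and crate::info_log calls trace each on-demand read. Here is a minimal standalone sketch of the lazy-fetch idea, assuming Ulen is an alias for u64 and using a hypothetical LazyBytes trait in place of the real tantivy_fst::FakeArr, whose exact API is not shown in this diff:

```rust
// Hypothetical sketch, not the PR's actual API: a byte array whose backing
// storage is fetched on demand, addressed with 64-bit offsets (Ulen).
type Ulen = u64; // assumption: mirrors tantivy_fst::Ulen

trait LazyBytes {
    fn len(&self) -> Ulen;
    // Fill `buf` with bytes starting at `offset`; a browser backend would
    // issue an HTTP Range request against the statically hosted index.
    fn read_into(&self, offset: Ulen, buf: &mut [u8]);
}

// In-memory stand-in used here so the sketch runs without a network.
struct InMemory(Vec<u8>);

impl LazyBytes for InMemory {
    fn len(&self) -> Ulen {
        self.0.len() as Ulen
    }
    fn read_into(&self, offset: Ulen, buf: &mut [u8]) {
        let start = offset as usize;
        buf.copy_from_slice(&self.0[start..start + buf.len()]);
    }
}

fn main() {
    let store = InMemory((0u8..16).collect());
    let mut buf = [0u8; 4];
    store.read_into(8, &mut buf);
    assert_eq!(buf, [8, 9, 10, 11]);
}
```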
3 changes: 2 additions & 1 deletion Cargo.toml
@@ -18,7 +18,7 @@ byteorder = "1"
 crc32fast = "1"
 once_cell = "1"
 regex ={version = "1", default-features = false, features = ["std"]}
-tantivy-fst = "0.3"
+tantivy-fst = {version="0.3", path="../tantivy-fst"}
 memmap = {version = "0.7", optional=true}
 lz4 = {version="1", optional=true}
 brotli = {version="3.3.0", optional=true}
@@ -48,6 +48,7 @@ chrono = "0.4"
 smallvec = "1"
 rayon = "1"
 lru = "0.6"
+backtrace = "0.3.59"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3"
2 changes: 1 addition & 1 deletion src/collector/facet_collector.rs
@@ -269,7 +269,7 @@ impl Collector for FacetCollector
 
 let mut collapse_mapping = Vec::new();
 let mut counts = Vec::new();
-let mut collapse_facet_ords = Vec::new();
+let mut collapse_facet_ords: Vec<u64> = Vec::new();
 
 let mut collapse_facet_it = self.facets.iter().peekable();
 collapse_facet_ords.push(0);
1 change: 1 addition & 0 deletions src/collector/top_score_collector.rs
@@ -649,6 +649,7 @@ impl Collector for TopDocs {
 threshold
 })?;
 } else {
+crate::info_log(format!("Scoring results and collecting TOP {}", self.0.limit));
 weight.for_each_pruning(Score::MIN, reader, &mut |doc, score| {
 let heap_item = ComparableDoc {
 feature: score,
3 changes: 3 additions & 0 deletions src/common/bitpacker.rs
@@ -1,4 +1,5 @@
 use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
+use tantivy_fst::Ulen;
 use std::io;
 
 use crate::directory::OwnedBytes;
@@ -103,6 +104,8 @@ impl BitUnpacker {
 
 #[cfg(test)]
 mod test {
+use tantivy_fst::Ulen;
+
 use super::{BitPacker, BitUnpacker};
 use crate::directory::OwnedBytes;
 
2 changes: 2 additions & 0 deletions src/common/bitset.rs
@@ -1,6 +1,8 @@
 use std::fmt;
 use std::u64;
 
+use tantivy_fst::Ulen;
+
 #[derive(Clone, Copy, Eq, PartialEq)]
 pub(crate) struct TinySet(u64);
 
20 changes: 11 additions & 9 deletions src/common/composite_file.rs
@@ -1,3 +1,5 @@
+use tantivy_fst::Ulen;
+
 use crate::common::BinarySerializable;
 use crate::common::CountingWriter;
 use crate::common::VInt;
@@ -14,11 +16,11 @@ use super::HasLen;
 #[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
 pub struct FileAddr {
 field: Field,
-idx: usize,
+idx: Ulen,
 }
 
 impl FileAddr {
-fn new(field: Field, idx: usize) -> FileAddr {
+fn new(field: Field, idx: Ulen) -> FileAddr {
 FileAddr { field, idx }
 }
 }
@@ -32,7 +34,7 @@ impl BinarySerializable for FileAddr {
 
 fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
 let field = Field::deserialize(reader)?;
-let idx = VInt::deserialize(reader)?.0 as usize;
+let idx = VInt::deserialize(reader)?.0 as Ulen;
 Ok(FileAddr { field, idx })
 }
 }
@@ -59,7 +61,7 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
 }
 
 /// Start writing a new field.
-pub fn for_field_with_idx(&mut self, field: Field, idx: usize) -> &mut CountingWriter<W> {
+pub fn for_field_with_idx(&mut self, field: Field, idx: Ulen) -> &mut CountingWriter<W> {
 let offset = self.write.written_bytes();
 let file_addr = FileAddr::new(field, idx);
 assert!(!self.offsets.contains_key(&file_addr));
@@ -105,7 +107,7 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
 #[derive(Clone)]
 pub struct CompositeFile {
 data: FileSlice,
-offsets_index: HashMap<FileAddr, (usize, usize)>,
+offsets_index: HashMap<FileAddr, (Ulen, Ulen)>,
 }
 
 impl CompositeFile {
@@ -114,7 +116,7 @@ impl CompositeFile {
 pub fn open(data: &FileSlice) -> io::Result<CompositeFile> {
 let end = data.len();
 let footer_len_data = data.slice_from(end - 4).read_bytes()?;
-let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as usize;
+let footer_len = u32::deserialize(&mut footer_len_data.as_slice())? as Ulen;
 let footer_start = end - 4 - footer_len;
 let footer_data = data
 .slice(footer_start, footer_start + footer_len)
@@ -128,7 +130,7 @@ impl CompositeFile {
 
 let mut offset = 0;
 for _ in 0..num_fields {
-offset += VInt::deserialize(&mut footer_buffer)?.0 as usize;
+offset += VInt::deserialize(&mut footer_buffer)?.0 as Ulen;
 let file_addr = FileAddr::deserialize(&mut footer_buffer)?;
 offsets.push(offset);
 file_addrs.push(file_addr);
@@ -164,7 +166,7 @@ impl CompositeFile {
 
 /// Returns the `FileSlice` associated
 /// to a given `Field` and stored in a `CompositeFile`.
-pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<FileSlice> {
+pub fn open_read_with_idx(&self, field: Field, idx: Ulen) -> Option<FileSlice> {
 self.offsets_index
 .get(&FileAddr { field, idx })
 .map(|&(from, to)| self.data.slice(from, to))
@@ -176,7 +178,7 @@ impl CompositeFile {
 fields
 .entry(field_addr.field)
 .or_insert_with(|| FieldUsage::empty(field_addr.field))
-.add_field_idx(field_addr.idx, end - start);
+.add_field_idx(field_addr.idx as usize, end - start);
 }
 PerFieldSpaceUsage::new(fields)
 }
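The open() hunk above decodes the composite file's footer: a running sum over VInt-encoded deltas recovers each inner file's absolute start offset. A tiny sketch of that delta decoding, with illustrative values (the real footer also interleaves a FileAddr per entry):

```rust
// Delta-to-absolute offset recovery, as in CompositeFile::open (sketch).
fn main() {
    let deltas: [u64; 4] = [0, 120, 64, 300]; // values a VInt decoder would yield
    let mut offset = 0u64;
    let mut offsets = Vec::new();
    for d in deltas {
        offset += d; // same running sum as `offset += VInt::deserialize(...)?.0`
        offsets.push(offset);
    }
    assert_eq!(offsets, vec![0, 120, 184, 484]);
}
```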
4 changes: 2 additions & 2 deletions src/common/mod.rs
@@ -14,7 +14,7 @@ pub use self::vint::{
 read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
 };
 pub use byteorder::LittleEndian as Endianness;
-
+use tantivy_fst::Ulen;
 /// Segment's max doc must be `< MAX_DOC_LIMIT`.
 ///
 /// We do not allow segments with more than
@@ -69,7 +69,7 @@ pub(crate) fn compute_num_bits(n: u64) -> u8 {
 /// Has length trait
 pub trait HasLen {
 /// Return length
-fn len(&self) -> usize;
+fn len(&self) -> Ulen;
 
 /// Returns true iff empty.
 fn is_empty(&self) -> bool {
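HasLen::len returning Ulen instead of usize is the crux of the type change: on wasm32 a usize is only 32 bits, so a statically hosted index larger than 4 GiB could not be addressed if lengths stayed usize. A small sketch of the distinction, assuming Ulen = u64:

```rust
// Sketch: why lengths move off usize for browser targets (assumes Ulen = u64).
fn main() {
    println!("usize on this target: {} bits", usize::BITS);
    let five_gib: u64 = 5 << 30; // fits in u64 (Ulen) on every target
    assert!(five_gib > u32::MAX as u64); // would not fit a 32-bit usize
}
```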
19 changes: 10 additions & 9 deletions src/common/serialize.rs
@@ -1,6 +1,7 @@
 use crate::common::Endianness;
 use crate::common::VInt;
 use byteorder::{ReadBytesExt, WriteBytesExt};
+use tantivy_fst::Ulen;
 use std::fmt;
 use std::io;
 use std::io::Read;
@@ -17,7 +18,7 @@ pub trait BinarySerializable: fmt::Debug + Sized {
 /// `FixedSize` marks a `BinarySerializable` as
 /// always serializing to the same size.
 pub trait FixedSize: BinarySerializable {
-const SIZE_IN_BYTES: usize;
+const SIZE_IN_BYTES: Ulen;
 }
 
 impl BinarySerializable for () {
@@ -30,7 +31,7 @@ impl BinarySerializable for () {
 }
 
 impl FixedSize for () {
-const SIZE_IN_BYTES: usize = 0;
+const SIZE_IN_BYTES: Ulen = 0;
 }
 
 impl<T: BinarySerializable> BinarySerializable for Vec<T> {
@@ -73,7 +74,7 @@ impl BinarySerializable for u32 {
 }
 
 impl FixedSize for u32 {
-const SIZE_IN_BYTES: usize = 4;
+const SIZE_IN_BYTES: Ulen = 4;
 }
 
 impl BinarySerializable for u64 {
@@ -86,7 +87,7 @@ impl BinarySerializable for u64 {
 }
 
 impl FixedSize for u64 {
-const SIZE_IN_BYTES: usize = 8;
+const SIZE_IN_BYTES: Ulen = 8;
 }
 
 impl BinarySerializable for f32 {
@@ -99,7 +100,7 @@ impl BinarySerializable for f32 {
 }
 
 impl FixedSize for f32 {
-const SIZE_IN_BYTES: usize = 4;
+const SIZE_IN_BYTES: Ulen = 4;
 }
 
 impl BinarySerializable for i64 {
@@ -112,7 +113,7 @@ impl BinarySerializable for i64 {
 }
 
 impl FixedSize for i64 {
-const SIZE_IN_BYTES: usize = 8;
+const SIZE_IN_BYTES: Ulen = 8;
 }
 
 impl BinarySerializable for f64 {
@@ -125,7 +126,7 @@ impl BinarySerializable for f64 {
 }
 
 impl FixedSize for f64 {
-const SIZE_IN_BYTES: usize = 8;
+const SIZE_IN_BYTES: Ulen = 8;
 }
 
 impl BinarySerializable for u8 {
@@ -138,7 +139,7 @@ impl BinarySerializable for u8 {
 }
 
 impl FixedSize for u8 {
-const SIZE_IN_BYTES: usize = 1;
+const SIZE_IN_BYTES: Ulen = 1;
 }
 
 impl BinarySerializable for String {
@@ -167,7 +168,7 @@ pub mod test {
 pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
 let mut buffer = Vec::new();
 O::default().serialize(&mut buffer).unwrap();
-assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
+assert_eq!(buffer.len(), O::SIZE_IN_BYTES as usize);
 }
 
 fn serialize_test<T: BinarySerializable + Eq>(v: T) -> usize {
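For context, the two traits fit together as below: a hedged, crate-internal sketch of implementing them for a small fixed-size struct, with SIZE_IN_BYTES now a Ulen. The serialize signature is assumed from the crate's convention; only deserialize appears in this diff.

```rust
// Sketch (inside this crate): a 12-byte record implementing both traits.
use std::io::{self, Read, Write};

use byteorder::{ReadBytesExt, WriteBytesExt};
use tantivy_fst::Ulen;

use crate::common::{BinarySerializable, Endianness, FixedSize};

#[derive(Debug, Default, PartialEq, Eq)]
struct DocRange {
    start: u64,
    len: u32,
}

impl BinarySerializable for DocRange {
    fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
        writer.write_u64::<Endianness>(self.start)?;
        writer.write_u32::<Endianness>(self.len)
    }
    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
        let start = reader.read_u64::<Endianness>()?;
        let len = reader.read_u32::<Endianness>()?;
        Ok(DocRange { start, len })
    }
}

impl FixedSize for DocRange {
    // Sum of the constants changed in this hunk: 8 + 4 bytes.
    const SIZE_IN_BYTES: Ulen =
        <u64 as FixedSize>::SIZE_IN_BYTES + <u32 as FixedSize>::SIZE_IN_BYTES;
}
```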
1 change: 1 addition & 0 deletions src/common/vint.rs
@@ -1,5 +1,6 @@
 use super::BinarySerializable;
 use byteorder::{ByteOrder, LittleEndian};
+use tantivy_fst::Ulen;
 use std::io;
 use std::io::Read;
 use std::io::Write;
7 changes: 5 additions & 2 deletions src/core/executor.rs
@@ -1,5 +1,6 @@
 use crossbeam::channel;
 use rayon::{ThreadPool, ThreadPoolBuilder};
+use tantivy_fst::Ulen;
 
 /// Search executor whether search request are single thread or multithread.
 ///
@@ -87,12 +88,14 @@ impl Executor {
 #[cfg(test)]
 mod tests {
 
+use tantivy_fst::Ulen;
+
 use super::Executor;
 
 #[test]
 #[should_panic(expected = "panic should propagate")]
 fn test_panic_propagates_single_thread() {
-let _result: Vec<usize> = Executor::single_thread()
+let _result: Vec<Ulen> = Executor::single_thread()
 .map(
 |_| {
 panic!("panic should propagate");
@@ -105,7 +108,7 @@ mod tests {
 #[test]
 #[should_panic] //< unfortunately the panic message is not propagated
 fn test_panic_propagates_multi_thread() {
-let _result: Vec<usize> = Executor::multi_thread(1, "search-test")
+let _result: Vec<Ulen> = Executor::multi_thread(1, "search-test")
 .unwrap()
 .map(
 |_| {
2 changes: 2 additions & 0 deletions src/core/index.rs
@@ -1,3 +1,5 @@
+use tantivy_fst::Ulen;
+
 use super::segment::Segment;
 use crate::core::Executor;
 use crate::core::IndexMeta;
25 changes: 17 additions & 8 deletions src/core/inverted_index_reader.rs
@@ -1,5 +1,7 @@
 use std::io;
 
+use tantivy_fst::{FakeArr, Ulen};
+
 use crate::common::BinarySerializable;
 use crate::directory::FileSlice;
 use crate::positions::PositionReader;
@@ -90,10 +92,10 @@ impl InvertedIndexReader {
 term_info: &TermInfo,
 block_postings: &mut BlockSegmentPostings,
 ) -> io::Result<()> {
-let start_offset = term_info.postings_start_offset as usize;
-let stop_offset = term_info.postings_stop_offset as usize;
+let start_offset = term_info.postings_start_offset as Ulen;
+let stop_offset = term_info.postings_stop_offset as Ulen;
 let postings_slice = self.postings_file_slice.slice(start_offset, stop_offset);
-block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?);
+block_postings.reset(term_info.doc_freq, postings_slice);
 Ok(())
 }
 
@@ -106,8 +108,10 @@ impl InvertedIndexReader {
 term: &Term,
 option: IndexRecordOption,
 ) -> io::Result<Option<BlockSegmentPostings>> {
-self.get_term_info(term)?
-.map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
+crate::info_log(format!("reading term info for term {:?}", term));
+
+let info = self.get_term_info(term)?;
+info.map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
 .transpose()
 }
 
@@ -121,9 +125,11 @@ impl InvertedIndexReader {
 requested_option: IndexRecordOption,
 ) -> io::Result<BlockSegmentPostings> {
 let postings_data = self.postings_file_slice.slice(
-term_info.postings_start_offset as usize,
-term_info.postings_stop_offset as usize,
+term_info.postings_start_offset as Ulen,
+term_info.postings_stop_offset as Ulen,
 );
+
+postings_data.to_vec(); // better force load it all at once
 BlockSegmentPostings::open(
 term_info.doc_freq,
 postings_data,
@@ -181,7 +187,10 @@ impl InvertedIndexReader {
 option: IndexRecordOption,
 ) -> io::Result<Option<SegmentPostings>> {
 self.get_term_info(term)?
-.map(move |term_info| self.read_postings_from_terminfo(&term_info, option))
+.map(move |term_info| {
+crate::info_log(format!("Fetching document ids and frequencies matching term {:?}", term));
+self.read_postings_from_terminfo(&term_info, option)
+})
 .transpose()
 }
 
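The stray-looking postings_data.to_vec() above is the point of that hunk: with a FakeArr-backed slice, every small read during block decoding could become its own network fetch, so forcing one contiguous load first collapses them into a single request. A counting mock of that effect (hypothetical; it stands in for a FakeArr-backed FileSlice):

```rust
use std::cell::Cell;

// Mock remote byte range: each read_at models one HTTP Range request.
struct RemoteBytes {
    data: Vec<u8>,
    fetches: Cell<u32>,
}

impl RemoteBytes {
    fn read_at(&self, start: usize, len: usize) -> Vec<u8> {
        self.fetches.set(self.fetches.get() + 1);
        self.data[start..start + len].to_vec()
    }
}

fn main() {
    let remote = RemoteBytes {
        data: vec![0u8; 256],
        fetches: Cell::new(0),
    };
    // Decoding 32 blocks straight off the remote slice: 32 round trips.
    for block in 0..32 {
        let _ = remote.read_at(block * 8, 8);
    }
    assert_eq!(remote.fetches.get(), 32);
    // Forcing one bulk load first (what to_vec() does): a single request.
    let local = remote.read_at(0, 256);
    let blocks: Vec<&[u8]> = local.chunks(8).collect();
    assert_eq!(blocks.len(), 32);
    assert_eq!(remote.fetches.get(), 33);
}
```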
15 changes: 14 additions & 1 deletion src/core/searcher.rs
@@ -1,3 +1,5 @@
+use tantivy_fst::Ulen;
+
 use crate::collector::Collector;
 use crate::core::Executor;
 
@@ -56,7 +58,18 @@ impl Searcher {
 pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
 let DocAddress(segment_local_id, doc_id) = doc_address;
 let store_reader = &self.store_readers[segment_local_id as usize];
-store_reader.get(doc_id)
+let doc = store_reader.get(doc_id)?;
+crate::info_log(format!("read content of doc {:?}", doc.field_values()));
+Ok(doc)
+}
+
+pub fn doc_multiple(&self, doc_addresses: Vec<DocAddress>) -> crate::Result<Vec<Document>> {
+if doc_addresses.len() == 0 {
+return Ok(vec![]);
+}
+assert!(doc_addresses.windows(2).all(|s| s[0].0 == s[1].0), "only supported on same segment for now");
+let store_reader = &self.store_readers[doc_addresses[0].0 as usize];
+store_reader.get_multiple(&doc_addresses.into_iter().map(|d| d.1).collect::<Vec<_>>())
 }
 
 /// Access the schema associated to the index of this searcher.
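A hedged usage sketch of the new batched accessor (index and searcher setup elided; DocAddress(segment_ord, doc_id) as elsewhere in tantivy). All addresses must share a segment ordinal, per the assert above, and get_multiple presumably lets the store reader fetch the needed blocks in one pass rather than one request per document:

```rust
// Sketch only: assumes an already-built `searcher` over a single segment.
fn print_hits(searcher: &Searcher) -> crate::Result<()> {
    let hits = vec![DocAddress(0, 3), DocAddress(0, 17), DocAddress(0, 42)];
    let docs = searcher.doc_multiple(hits)?; // batched instead of three doc() calls
    for doc in &docs {
        println!("{:?}", doc.field_values());
    }
    Ok(())
}
```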