Skip to content

Commit

Permalink
Fix multi-word fuzzy queries #336
Browse files: browse the repository at this point in the history
  • Loading branch information
joepio committed Feb 21, 2023
1 parent 1c8aa51 commit 9acf443
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 86 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
trace-*.json
**/.temp
.DS_Store
.cargo
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ See [STATUS.md](server/STATUS.md) to learn more about which features will remain

## UNRELEASED

- **Requires `--rebuild-index`**
- Improve full-text search, use JSON fields #335
- Update tantivy.
- Parse multiple auth cookies #525
- Fix `--script` flag

Expand Down
2 changes: 1 addition & 1 deletion CONTRIBUTE.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ There are two ways you can use `tracing` to get insights into performance.
- Visit jaeger: `http://localhost:16686`

```sh
docker run -d --name jaeger \
docker run -d --platform linux/amd64 --name jaeger \
-e COLLECTOR_ZIPKIN_HTTP_PORT=9411 \
-p 5775:5775/udp \
-p 6831:6831/udp \
Expand Down
94 changes: 36 additions & 58 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 2 additions & 7 deletions lib/src/db/query_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,14 +132,9 @@ pub fn query_indexed(store: &Db, q: &Query) -> AtomicResult<QueryResult> {
subjects.push(subject.into())
}
Err(e) => match &e.error_type {
crate::AtomicErrorType::NotFoundError => {
println!("Not found: {}", subject)
}
crate::AtomicErrorType::UnauthorizedError => {
println!("Unauthorized: {}", subject)
}
crate::AtomicErrorType::NotFoundError => {}
crate::AtomicErrorType::UnauthorizedError => {}
_other => {
println!("Other error: {}", subject);
return Err(format!(
"Error when getting resource in collection: {}",
&e
Expand Down
2 changes: 1 addition & 1 deletion server/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ rustls-pemfile = "1"
sanitize-filename = "0.4"
serde_json = "1"
static-files = "0.2"
tantivy = "0.18"
tantivy = "0.19"
tracing = "0.1"
tracing-actix-web = "0.6"
tracing-chrome = "0.6"
Expand Down
40 changes: 21 additions & 19 deletions server/src/handlers/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use serde::Deserialize;
use tantivy::{
collector::TopDocs,
query::{BooleanQuery, BoostQuery, QueryParser},
tokenizer::Tokenizer,
};

type Queries = Vec<(tantivy::query::Occur, Box<dyn tantivy::query::Query>)>;
Expand All @@ -29,11 +30,9 @@ pub struct SearchQuery {
pub include: Option<bool>,
/// Maximum amount of results
pub limit: Option<usize>,
/// Filter by Property URL
pub property: Option<String>,
/// Only include resources that have this resource as its ancestor
pub parent: Option<String>,
/// Filter based on props
/// Filter based on props, using tantivy QueryParser syntax
pub filter: Option<String>,
}

Expand Down Expand Up @@ -144,18 +143,24 @@ pub struct StringAtom {
}

fn build_fuzzy_query(fields: &Fields, q: &str) -> impl tantivy::query::Query {
let title_term = tantivy::Term::from_field_text(fields.title, q);
let description_term = tantivy::Term::from_field_text(fields.description, q);
let title_query = tantivy::query::FuzzyTermQuery::new_prefix(title_term, 1, true);
let description_query = tantivy::query::FuzzyTermQuery::new_prefix(description_term, 1, true);

let queries: Queries = vec![
(
let mut token_stream = tantivy::tokenizer::SimpleTokenizer.token_stream(q);
type Queries = Vec<(tantivy::query::Occur, Box<dyn tantivy::query::Query>)>;
let mut queries: Queries = Vec::new();
// Split the query into words (tokens) and, for each one, add a fuzzy prefix query on both the title and the description
token_stream.process(&mut |token| {
let text = &token.text;
let title_term = tantivy::Term::from_field_text(fields.title, text);
let description_term = tantivy::Term::from_field_text(fields.description, text);
let title_query = tantivy::query::FuzzyTermQuery::new_prefix(title_term, 1, true);
let description_query =
tantivy::query::FuzzyTermQuery::new_prefix(description_term, 1, true);

queries.push((
tantivy::query::Occur::Should,
Box::new(BoostQuery::new(Box::new(title_query), 2.0)),
),
(tantivy::query::Occur::Should, Box::new(description_query)),
];
));
queries.push((tantivy::query::Occur::Should, Box::new(description_query)));
});

BooleanQuery::from(queries)
}
Expand All @@ -166,14 +171,11 @@ fn build_query(
q: &str,
index: &tantivy::Index,
) -> AtomicResult<Box<dyn tantivy::query::Query>> {
// construct the query
let query_parser = QueryParser::for_index(index, vec![fields.propvals]);

let query_text = q.to_string();

let query = query_parser
.parse_query(&query_text)
.map_err(|e| format!("Error parsing query {}", e))?;
.parse_query(q)
.map_err(|e| format!("Error parsing query: {}", e))?;

Ok(query)
}
Expand Down Expand Up @@ -214,7 +216,7 @@ fn unpack_value(
fn docs_to_resources(
docs: Vec<(f32, tantivy::DocAddress)>,
fields: &Fields,
searcher: &tantivy::LeasedItem<tantivy::Searcher>,
searcher: &tantivy::Searcher,
) -> Result<Vec<String>, AtomicServerError> {
let mut subjects: HashSet<String> = HashSet::new();

Expand Down

0 comments on commit 9acf443

Please sign in to comment.