Skip to content

Commit

Permalink
#25 Add filter to search and index all props
Browse files Browse the repository at this point in the history
  • Loading branch information
Polleps committed Feb 7, 2023
1 parent 320ffe7 commit edd35d0
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 108 deletions.
12 changes: 8 additions & 4 deletions lib/src/db/query_index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ pub fn query_indexed(store: &Db, q: &Query) -> AtomicResult<QueryResult> {
if in_selection {
let (k, _v) = kv.map_err(|_e| "Unable to parse query_cached")?;
let (_q_filter, _val, subject) = parse_collection_members_key(&k)?;

// If no external resources should be included, skip this one if it's an external resource
if !q.include_external && !subject.starts_with(&self_url) {
continue;
Expand All @@ -133,14 +132,19 @@ pub fn query_indexed(store: &Db, q: &Query) -> AtomicResult<QueryResult> {
subjects.push(subject.into())
}
Err(e) => match &e.error_type {
crate::AtomicErrorType::NotFoundError => {}
crate::AtomicErrorType::UnauthorizedError => {}
crate::AtomicErrorType::NotFoundError => {
println!("Not found: {}", subject)
}
crate::AtomicErrorType::UnauthorizedError => {
println!("Unauthorized: {}", subject)
}
_other => {
println!("Other error: {}", subject);
return Err(format!(
"Error when getting resource in collection: {}",
&e
)
.into())
.into());
}
},
}
Expand Down
92 changes: 39 additions & 53 deletions server/src/handlers/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ use crate::{
use actix_web::{web, HttpResponse};
use atomic_lib::{errors::AtomicResult, urls, Db, Resource, Storelike};
use serde::Deserialize;
use tantivy::{collector::TopDocs, query::QueryParser};
use tantivy::{
collector::TopDocs,
query::{BooleanQuery, BoostQuery, QueryParser},
};

/// A list of query clauses, each pairing an `Occur` with a boxed tantivy query.
/// Lists of this type are handed to `BooleanQuery` to combine the clauses.
type Queries = Vec<(tantivy::query::Occur, Box<dyn tantivy::query::Query>)>;

#[derive(Deserialize, Debug)]
pub struct SearchQuery {
Expand All @@ -28,6 +33,8 @@ pub struct SearchQuery {
pub property: Option<String>,
/// Only include resources that have this resource as its ancestor
pub parent: Option<String>,
/// Filter based on props
pub filter: Option<String>,
}

/// Parses a search query and responds with a list of resources
Expand Down Expand Up @@ -56,7 +63,7 @@ pub async fn search_query(
// https://github.com/atomicdata-dev/atomic-data-rust/issues/279.
let initial_results_limit = 100;

let mut query_list: Vec<(tantivy::query::Occur, Box<dyn tantivy::query::Query>)> = Vec::new();
let mut query_list: Queries = Vec::new();

if let Some(parent) = params.parent.clone() {
let query = build_parent_query(parent, &fields, store)?;
Expand All @@ -65,23 +72,21 @@ pub async fn search_query(
}

if let Some(q) = params.q.clone() {
let fuzzy = should_fuzzy(&params.property, &q);
let fuzzy_query = build_fuzzy_query(&fields, &q);

let query = if fuzzy {
build_fuzzy_query(&fields, &q)?
} else {
build_query(
&fields,
&q,
params.property.clone(),
&appstate.search_state.index,
)?
};

query_list.push((tantivy::query::Occur::Must, query));
query_list.push((tantivy::query::Occur::Must, Box::new(fuzzy_query)));
}

let query = tantivy::query::BooleanQuery::new(query_list);
if let Some(filter) = params.filter.clone() {
let exact_query = BoostQuery::new(
build_query(&fields, &filter, &appstate.search_state.index)?,
20.0,
);

query_list.push((tantivy::query::Occur::Must, Box::new(exact_query)));
}

let query = BooleanQuery::new(query_list);

// execute the query
let top_docs = searcher
Expand Down Expand Up @@ -125,6 +130,7 @@ pub async fn search_query(
}
}
results_resource.set_propval(urls::ENDPOINT_RESULTS.into(), resources.into(), store)?;

let mut builder = HttpResponse::Ok();
// TODO: support other serialization options
Ok(builder.body(results_resource.to_json_ad()?))
Expand All @@ -137,53 +143,33 @@ pub struct StringAtom {
pub value: String,
}

/// Decides whether a search string should be run as a fuzzy query.
///
/// Returns `false` when a `property` filter is present, or when `q` contains
/// any substring that signals an exact / advanced search (operators, quotes,
/// wildcards, field separators or spaces). Returns `true` otherwise.
fn should_fuzzy(property: &Option<String>, q: &str) -> bool {
    // Substrings that indicate the user wants an exact / advanced search.
    const EXACT_SEARCH_MARKERS: [&str; 9] = ["*", "AND", "OR", "[", "\"", ":", "+", "-", " "];

    // Fuzzy searching is not possible when filtering by property.
    property.is_none()
        && !EXACT_SEARCH_MARKERS
            .iter()
            .any(|marker| q.contains(*marker))
}

fn build_fuzzy_query(fields: &Fields, q: &str) -> AtomicResult<Box<dyn tantivy::query::Query>> {
let term = tantivy::Term::from_field_text(fields.value, q);
let query = tantivy::query::FuzzyTermQuery::new_prefix(term, 1, true);

Ok(Box::new(query))
fn build_fuzzy_query(fields: &Fields, q: &str) -> impl tantivy::query::Query {
let title_term = tantivy::Term::from_field_text(fields.title, q);
let description_term = tantivy::Term::from_field_text(fields.description, q);
let title_query = tantivy::query::FuzzyTermQuery::new_prefix(title_term, 1, true);
let description_query = tantivy::query::FuzzyTermQuery::new_prefix(description_term, 1, true);

let queries: Queries = vec![
(
tantivy::query::Occur::Should,
Box::new(BoostQuery::new(Box::new(title_query), 2.0)),
),
(tantivy::query::Occur::Should, Box::new(description_query)),
];

BooleanQuery::from(queries)
}

#[tracing::instrument(skip(index))]
fn build_query(
fields: &Fields,
q: &str,
property: Option<String>,
index: &tantivy::Index,
) -> AtomicResult<Box<dyn tantivy::query::Query>> {
// construct the query
let query_parser = QueryParser::for_index(
index,
vec![
fields.subject,
// I don't think we need to search in the property
// fields.property,
fields.value,
],
);
let query_parser = QueryParser::for_index(index, vec![fields.propvals]);

let query_text = if let Some(prop) = property {
format!("property:{:?} AND {}", prop, &q)
} else {
q.to_string()
};
let query_text = q.to_string();

let query = query_parser
.parse_query(&query_text)
Expand Down
112 changes: 61 additions & 51 deletions server/src/search.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@ use crate::errors::AtomicServerResult;
#[derive(Debug)]
pub struct Fields {
pub subject: Field,
pub property: Field,
pub value: Field,
pub title: Field,
pub description: Field,
pub propvals: Field,
pub hierarchy: Field,
}

Expand Down Expand Up @@ -59,8 +60,9 @@ pub fn build_schema() -> AtomicServerResult<tantivy::schema::Schema> {
let mut schema_builder = Schema::builder();
// The STORED flag makes the index store the full values. Can be useful.
schema_builder.add_text_field("subject", TEXT | STORED);
schema_builder.add_text_field("property", TEXT | STORED);
schema_builder.add_text_field("value", TEXT | STORED);
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("description", TEXT | STORED);
schema_builder.add_json_field("propvals", STORED | TEXT);
schema_builder.add_facet_field("hierarchy", STORED);
let schema = schema_builder.build();
Ok(schema)
Expand Down Expand Up @@ -92,23 +94,28 @@ pub fn get_schema_fields(appstate: &SearchState) -> AtomicServerResult<Fields> {
.schema
.get_field("subject")
.ok_or("No 'subject' in the schema")?;
let property = appstate
let title = appstate
.schema
.get_field("property")
.ok_or("No 'property' in the schema")?;
let value = appstate
.get_field("title")
.ok_or("No 'title' in the schema")?;
let description = appstate
.schema
.get_field("value")
.ok_or("No 'value' in the schema")?;
.get_field("description")
.ok_or("No 'description' in the schema")?;
let propvals = appstate
.schema
.get_field("propvals")
.ok_or("No 'propvals' in the schema")?;
let hierarchy = appstate
.schema
.get_field("hierarchy")
.ok_or("No 'hierarchy' in the schema")?;

Ok(Fields {
subject,
property,
value,
title,
description,
propvals,
hierarchy,
})
}
Expand Down Expand Up @@ -143,23 +150,32 @@ pub fn add_resource(
let fields = get_schema_fields(appstate)?;
let subject = resource.get_subject();
let writer = appstate.writer.read()?;

let mut doc = Document::default();
doc.add_json_object(
fields.propvals,
serde_json::from_str(&resource.to_json(store)?).map_err(|e| {
format!(
"Failed to convert resource to json for search indexing. Subject: {}. Error: {}",
subject, e
)
})?,
);

doc.add_text(fields.subject, subject);
doc.add_text(fields.title, get_resource_title(resource));

if let Ok(atomic_lib::Value::Markdown(description)) =
resource.get(atomic_lib::urls::DESCRIPTION)
{
doc.add_text(fields.description, description);
};

let hierarchy = resource_to_facet(resource, store)?;
doc.add_facet(fields.hierarchy, hierarchy);

writer.add_document(doc)?;

for (prop, val) in resource.get_propvals() {
match val {
atomic_lib::Value::AtomicUrl(_) | atomic_lib::Value::ResourceArray(_) => continue,
_ => {
add_triple(
&writer,
subject.into(),
prop.into(),
val.to_string(),
Some(hierarchy.clone()),
&fields,
)?;
}
};
}
Ok(())
}

Expand All @@ -175,30 +191,6 @@ pub fn remove_resource(search_state: &SearchState, subject: &str) -> AtomicServe
Ok(())
}

/// Indexes one subject / property / value triple as its own search document.
/// The document is only queued on the writer; this function does _not_ commit,
/// so the caller has to commit the writer afterwards.
/// If a `hierarchy` facet is passed, it is attached to the document as well.
#[tracing::instrument(skip(writer, fields))]
pub fn add_triple(
    writer: &IndexWriter,
    subject: String,
    property: String,
    value: String,
    hierarchy: Option<Facet>,
    fields: &Fields,
) -> AtomicServerResult<()> {
    let mut entry = Document::default();

    // Text fields are added in the same order as before: property, value, subject.
    entry.add_text(fields.property, property);
    entry.add_text(fields.value, value);
    entry.add_text(fields.subject, subject);

    // The facet is optional; documents without one simply lack the field.
    if let Some(facet) = hierarchy {
        entry.add_facet(fields.hierarchy, facet);
    }

    writer.add_document(entry)?;

    Ok(())
}

// For a search server you will typically create one reader for the entire lifetime of your program, and acquire a new searcher for every single request.
pub fn get_reader(index: &tantivy::Index) -> AtomicServerResult<tantivy::IndexReader> {
Ok(index
Expand Down Expand Up @@ -242,6 +234,24 @@ pub fn resource_to_facet(resource: &Resource, store: &Db) -> AtomicServerResult<
Ok(result)
}

/// Picks a human-readable title for a resource.
///
/// The first property that is present wins, in the order name, shortname,
/// filename. If that value is a `String` or `Slug` its text is returned;
/// in every other case (no property set, or a value of another type) the
/// resource's subject is used instead.
fn get_resource_title(resource: &Resource) -> String {
    let candidate = resource
        .get(atomic_lib::urls::NAME)
        .or_else(|_| resource.get(atomic_lib::urls::SHORTNAME))
        .or_else(|_| resource.get(atomic_lib::urls::FILENAME));

    match candidate {
        Ok(atomic_lib::Value::String(text)) | Ok(atomic_lib::Value::Slug(text)) => text.clone(),
        // Nothing found, or the found value is not textual: fall back to the subject.
        _ => resource.get_subject().to_string(),
    }
}

#[cfg(test)]
mod tests {
use atomic_lib::{urls, Resource, Storelike};
Expand Down

0 comments on commit edd35d0

Please sign in to comment.