Skip to content

Commit

Permalink
improve persons searching support (#148)
Browse files Browse the repository at this point in the history
* ensure correct id for items on elasticsearch
* normalize data when updating ES
* ensure_required_metadata script improvements
* added advanced persons search
* improvements to api docs
  • Loading branch information
OriHoch authored Apr 20, 2017
2 parents fe81cbf + f62d234 commit 62e36cc
Show file tree
Hide file tree
Showing 33 changed files with 1,141 additions and 1,342 deletions.
2 changes: 1 addition & 1 deletion bhs_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def create_app(testing=False, live=False):

# Create the elasticsearch connection
app.es = elasticsearch.Elasticsearch(conf.elasticsearch_host)
app.es_data_db_index_name = conf.elasticsearch_data_index if getattr(conf, "elasticsearch_data_index") else app.data_db.name
app.es_data_db_index_name = getattr(conf, "elasticsearch_data_index", app.data_db.name)

# Add the user's endpoints
from bhs_api.user import user_endpoints
Expand Down
67 changes: 29 additions & 38 deletions bhs_api/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ def enrich_item(item, db=None, collection_name=None):


def get_item_by_id(id, collection_name, db=None):
if collection_name == "persons":
raise Exception("persons collection does not support getting item by id, you need to search person using multiple fields")
if not db:
db = current_app.data_db
id_field = get_collection_id_field(collection_name)
Expand Down Expand Up @@ -346,16 +348,15 @@ def get_image_url(image_id, bucket):
return 'https://storage.googleapis.com/{}/{}.jpg'.format(bucket, image_id)


def get_collection_id_field(collection_name, is_elasticsearch=False):
def get_collection_id_field(collection_name):
doc_id = 'UnitId'
if collection_name == 'photos':
doc_id = 'PictureId'
# TODO: remove references to the genTreeIndividuals collection - it is irrelevant and not in use
elif collection_name == 'genTreeIndividuals':
doc_id = 'ID'
elif collection_name == 'persons':
# elasticsearch cannot have "id" attribute
doc_id = "PID" if is_elasticsearch else "id"
raise Exception("persons collection does not support plain id field, but a combination of fields")
elif collection_name == 'synonyms':
doc_id = '_id'
elif collection_name == 'trees':
Expand Down Expand Up @@ -412,46 +413,41 @@ def create_slug(document, collection_name):
return ret

def get_doc_id(collection_name, doc):
mongo_id_field = get_collection_id_field(collection_name, is_elasticsearch=False)
elasticsearch_id_field = get_collection_id_field(collection_name, is_elasticsearch=True)
if mongo_id_field == elasticsearch_id_field:
return doc.get(mongo_id_field)
else:
mongo_id = doc.get(mongo_id_field)
elasticsearch_id = doc.get(elasticsearch_id_field)
if mongo_id == elasticsearch_id:
return mongo_id
elif elasticsearch_id is None:
return mongo_id
elif mongo_id is None:
return elasticsearch_id
else:
raise Exception("could not find doc_id for collection {} doc {}".format(collection_name, doc))
if collection_name == "persons":
raise Exception("persons collection items don't have a single doc_id, you must match on multiple fields")
id_field = get_collection_id_field(collection_name)
return doc[id_field]


def update_es(collection_name, doc, is_new, es_index_name=None, es=None, data_db=None, app=None):
def update_es(collection_name, doc, is_new, es_index_name=None, es=None, app=None):
app = current_app if not app else app
es_index_name = app.es_data_db_index_name if not es_index_name else es_index_name
es = app.es if not es else es
data_db = app.data_db if not data_db else data_db
# index only the docs that are publicly available
if doc_show_filter(collection_name, doc):
body = deepcopy(doc)
# the given doc might come either from mongo or from elasticsearch
# here we try to get the doc id from one of them and save it in elasticsearch
elasticsearch_id_field = get_collection_id_field(collection_name, is_elasticsearch=True)
doc_id = get_doc_id(collection_name, doc)
body[elasticsearch_id_field] = doc_id
# adjust attributes for elasticsearch
if collection_name == "persons":
body["person_id"] = body.get("id", body.get("ID"))
body["first_name_lc"] = body["name_lc"][0]
body["last_name_lc"] = body["name_lc"][1]
# maps all known SEX values to normalized gender value
body["gender"] = {"F": "F", "M": "M",
None: "U", "": "U", "U": "U", "?": "U", "P": "U"}[body.get("SEX", "").strip()]
# _id field is internal to mongo
if '_id' in body:
del body['_id']
# id field has special meaning in elasticsearch (it is copied to correct attribute above in the id_field handling
# id field has special meaning in elasticsearch
if 'id' in body:
del body['id']
if "thumbnail" in body and "data" in body["thumbnail"]:
# no need to have thumbnail data in elasticsearch
# TODO: ensure we only store and use thumbnail from filesystem
del body["thumbnail"]["data"]
# persons collection gets a fake header to support searching
if collection_name == "persons":
name = " ".join(body["name"]) if isinstance(body["name"], list) else body["name"]
body["Header"] = {"En": name, "He": name}
# elasticsearch uses the header for completion field
# this field does not support empty values, so we put a string with space here
# this is most likely wrong, but works for now
Expand All @@ -460,21 +456,16 @@ def update_es(collection_name, doc, is_new, es_index_name=None, es=None, data_db
for lang in ("He", "En"):
if body["Header"].get(lang) is None:
body["Header"][lang] = '_'
if collection_name == "persons":
doc_id = "{}_{}_{}".format(body["tree_num"], body["tree_version"], body["person_id"])
else:
doc_id = get_doc_id(collection_name, body)
if is_new:
uuids_to_str(body)
es.index(index=es_index_name, doc_type=collection_name, id=doc_id, body=body)
return True, "indexed successfully"
return True, "indexed successfully (inserted)"
else:
try:
es.update(index=es_index_name, doc_type=collection_name, id=doc_id, body={"doc": body})
return True, "indexed successfully"
except elasticsearch.exceptions.NotFoundError as e:
# So it's in the DB, passes the SHOW_FILTER and not found in ES
# weird, but that's what we have.
# let's index it.
item = data_db[collection_name].find_one({'_id': doc_id})
del item['_id']
es.index(index=es_index_name, doc_type=collection_name, id=doc_id, body=item)
return True, "indexed successfully, by resorting to ES index function for {}:{} with {}".format(collection_name, doc_id, e)
es.update(index=es_index_name, doc_type=collection_name, id=doc_id, body=body)
return True, "indexed successfully (updated)"
else:
return True, "item should not be shown - so not indexed"
26 changes: 26 additions & 0 deletions bhs_api/persons.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,32 @@
"deceased",
"tree_version"]

# Query-string parameter names for the advanced persons search.
# A persons search must supply at least one of these parameters.
PERSONS_SEARCH_REQUIRES_ONE_OF = [
    "first", "last", "sex",
    "pob", "pom", "pod",
    "yob", "yom", "yod",
    "treenum",
]

# Default value for every supported persons-search parameter.
# "<name>_t" entries select the match type (default "exact");
# "<name>_v" entries hold the +/- range value used by year searches.
PERSONS_SEARCH_DEFAULT_PARAMETERS = dict(
    first=None, first_t="exact",
    last=None, last_t="exact",
    sex=None,
    pob=None, pob_t="exact",
    pom=None, pom_t="exact",
    pod=None, pod_t="exact",
    yob=None, yob_t="exact", yob_v=None,
    yom=None, yom_t="exact", yom_v=None,
    yod=None, yod_t="exact", yod_v=None,
    treenum=None,
)

# (query parameter, indexed attribute) pairs for year-based searches.
PERSONS_SEARCH_YEAR_PARAMS = (
    ("yob", "birth_year"),
    ("yod", "death_year"),
    ("yom", "marriage_years"),
)

# (query parameter, indexed attribute) pairs for text searches.
PERSONS_SEARCH_TEXT_PARAMS = (
    ("first", "first_name_lc"),
    ("last", "last_name_lc"),
    ("pob", "BIRT_PLAC_lc"),
    ("pom", "MARR_PLAC_lc"),
    ("pod", "DEAT_PLAC_lc"),
)

# (query parameter, indexed attribute) pairs matched by exact term only.
PERSONS_SEARCH_EXACT_PARAMS = (
    ("sex", "gender"),
    ("treenum", "tree_num"),
)


def is_living_person(is_deceased, birth_year):
if is_deceased:
Expand Down
114 changes: 90 additions & 24 deletions bhs_api/v1_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import pymongo
import jinja2
import requests
import traceback

from bhs_api import SEARCH_CHUNK_SIZE
from bhs_api.utils import (get_conf, gen_missing_keys_error, binarize_image,
Expand All @@ -31,6 +32,8 @@
from bhs_api.user import get_user

from bhs_api import phonetic
from bhs_api.persons import (PERSONS_SEARCH_DEFAULT_PARAMETERS, PERSONS_SEARCH_REQUIRES_ONE_OF,
PERSONS_SEARCH_YEAR_PARAMS, PERSONS_SEARCH_TEXT_PARAMS, PERSONS_SEARCH_EXACT_PARAMS)

v1_endpoints = Blueprint('v1', __name__)

Expand All @@ -51,19 +54,79 @@ def custom_error(error):
'''


def es_search(q, size, collection=None, from_=0, sort=None, with_persons=False):
def es_search(q, size, collection=None, from_=0, sort=None, with_persons=False, **kwargs):
if collection:
# if user requested specific collections - we don't filter for persons (that's what user asked for!)
collections = collection.split(",")
else:
# we consider the with_persons to decide whether to include persons collection or not
collections = [collection for collection in SEARCHABLE_COLLECTIONS
if with_persons or collection != "persons"]
body = {"query": {"query_string": {
"fields": ['Header.En^2', 'Header.He^2', 'UnitText1.En', 'UnitText1.He'],
"query": q,
"default_operator": "and"
}}}
default_query = {
"query_string": {
"fields": ["Header.En^2", "Header.He^2", "UnitText1.En", "UnitText1.He"],
"query": q,
"default_operator": "and"
}
}

if collection == "persons":
must_queries = []
if q:
must_queries.append(default_query)
for year_param, year_attr in PERSONS_SEARCH_YEAR_PARAMS:
if kwargs[year_param]:
try:
year_value = int(kwargs[year_param])
except Exception as e:
raise Exception("invalid value for {} ({}): {}".format(year_param, year_attr, kwargs[year_param]))
year_type_param = "{}_t".format(year_param)
year_type = kwargs[year_type_param]
if year_type == "pmyears":
year_type_value_param = "{}_v".format(year_param)
try:
year_type_value = int(kwargs[year_type_value_param])
except Exception as e:
raise Exception("invalid value for {} ({}): {}".format(year_type_value_param, year_attr, kwargs[year_type_value_param]))
must_queries.append({"range": {year_attr: {"gte": year_value - year_type_value, "lte": year_value + year_type_value,}}})
elif year_type == "exact":
must_queries.append({"term": {year_attr: year_value}})
else:
raise Exception("invalid value for {} ({}): {}".format(year_type_param, year_attr, year_type))
for text_param, text_attr in PERSONS_SEARCH_TEXT_PARAMS:
if kwargs[text_param]:
text_value = kwargs[text_param]
text_type_param = "{}_t".format(text_param)
text_type = kwargs[text_type_param]
if text_type == "exact":
must_queries.append({"term": {text_attr: text_value}})
elif text_type == "like":
must_queries.append({"match": {text_attr: {"query": text_value,
"fuzziness": "AUTO"}}})
elif text_type == "starts":
must_queries.append({"prefix": {text_attr: text_value}})
else:
raise Exception("invalid value for {} ({}): {}".format(text_type, text_attr, text_type))
for exact_param, exact_attr in PERSONS_SEARCH_EXACT_PARAMS:
if kwargs[exact_param]:
exact_value = kwargs[exact_param]
if exact_param == "sex" and exact_value not in ("F", "M", "U"):
raise Exception ("invalid value for {} ({}): {}".format(exact_param, exact_attr, exact_value))
elif exact_param == "treenum":
try:
exact_value = int(exact_value)
except Exception as e:
raise Exception("invalid value for {} ({}): {}".format(exact_param, exact_attr, exact_value))
must_queries.append({"term": {exact_attr: exact_value}})
body = {
"query": {
"bool": {
"must": must_queries
}
}
}
else:
body = {"query": default_query}
if sort == "abc":
if phonetic.is_hebrew(q.strip()):
# hebrew alphabetical sort
Expand All @@ -77,15 +140,13 @@ def es_search(q, size, collection=None, from_=0, sort=None, with_persons=False):
elif sort == "year" and collection == "photoUnits":
body["sort"] = [{"UnitPeriod.PeriodStartDate.keyword": "asc"}, "_score"]
try:
try:
collection = collection.split(',')
except:
pass
current_app.logger.debug("es.search index={}, doc_type={} body={}".format(current_app.es_data_db_index_name, collections, json.dumps(body)))
results = current_app.es.search(index=current_app.es_data_db_index_name, body=body, doc_type=collections, size=size, from_=from_)
except elasticsearch.exceptions.ConnectionError as e:
current_app.logger.error('Error connecting to Elasticsearch: {}'.format(e.error))
return None
current_app.logger.error('Error connecting to Elasticsearch: {}'.format(e))
raise Exception("Error connecting to Elasticsearch: {}".format(e))
except Exception as e:
raise Exception("Elasticsearch error: {}".format(e))
return results

def _generate_credits(fn='credits.html'):
Expand Down Expand Up @@ -373,22 +434,26 @@ def save_user_content():
def general_search():
args = request.args
parameters = {'collection': None, 'size': SEARCH_CHUNK_SIZE, 'from_': 0, 'q': None, 'sort': None, "with_persons": False}
parameters.update(PERSONS_SEARCH_DEFAULT_PARAMETERS)
got_one_of_required_persons_params = False
for param in parameters.keys():
if param in args:
if param == "with_persons":
parameters[param] = args[param].lower() in ["1", "yes", "true"]
else:
parameters[param] = args[param]
if not parameters['q']:
abort(400, 'You must specify a search query')
else:
rv = es_search(**parameters)
if not rv:
abort(500, 'Sorry, the search cluster appears to be down')
else:
for item in rv['hits']['hits']:
enrich_item(item['_source'], collection_name=item['_type'])
if param in PERSONS_SEARCH_REQUIRES_ONE_OF and parameters[param]:
got_one_of_required_persons_params = True
if parameters["q"] or (parameters["collection"] == "persons" and got_one_of_required_persons_params):
try:
rv = es_search(**parameters)
except Exception as e:
return humanify({"error": e.message}, 500)
for item in rv['hits']['hits']:
enrich_item(item['_source'], collection_name=item['_type'])
return humanify(rv)
else:
return humanify({"error": "You must specify a search query"}, 400)

@v1_endpoints.route('/wsearch')
def wizard_search():
Expand Down Expand Up @@ -449,20 +514,21 @@ def get_suggestions(collection,string):
'''
rv = {}
try:
unlistify_item = lambda i: " ".join(i) if isinstance(i, (tuple, list)) else i
if collection == "*":
rv['starts_with'], rv['phonetic'] = get_completion_all_collections(string)
rv['contains'] = {}
# make all the words in the suggestion start with a capital letter
rv = {k: {kk: [i.title() for i in vv] for kk, vv in v.items()} for k, v in rv.items()}
rv = {k: {kk: [unlistify_item(i).title() for i in vv] for kk, vv in v.items()} for k, v in rv.items()}
return humanify(rv)
else:
rv['starts_with'], rv['phonetic'] = get_completion(collection, string)
rv['contains'] = []
# make all the words in the suggestion start with a capital letter
rv = {k: [i.title() for i in v] for k, v in rv.items()}
rv = {k: [unlistify_item(i).title() for i in v] for k, v in rv.items()}
return humanify(rv)
except Exception, e:
return humanify({"error": "unexpected exception getting completion data: {}".format(e)}, 500)
return humanify({"error": "unexpected exception getting completion data: {}".format(e), "traceback": traceback.format_exc()}, 500)



Expand Down
4 changes: 3 additions & 1 deletion conf/app_server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@ image_bucket: bhs-flat-pics
thumbnail_bucket: bhs-thumbnails
ftree_bucket_url: https://storage.googleapis.com/bhs-familytrees
video_bucket_url: https://storage.googleapis.com/bhs-videos

# elastic search
elasticsearch_host: localhost
elasticsearch_data_index:
elasticsearch_data_index: bhdata

# redis
redis_host: localhost
redis_port: 6379
Expand Down
2 changes: 1 addition & 1 deletion conf/bhs_api_site
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ server {

location /v1/docs {
default_type text/html;
alias /home/bhs/api/docs/index.html;
alias /home/bhs/api/docs/_build/index.html;
}

location / {
Expand Down
5 changes: 5 additions & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# dbs-back documentation hub

## API documentation

The API documentation is served at http://devapi.dbs.bh.org.il/v1/docs
7 changes: 7 additions & 0 deletions docs/_build/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# built documentation files

This directory contains the api documentation files built from the _sources files

**Important Notice**

If you just want to read the documentation, go to https://github.com/Beit-Hatfutsot/dbs-back/tree/dev/docs
Loading

0 comments on commit 62e36cc

Please sign in to comment.