Skip to content

Commit

Permalink
improve persons searching support (#148)
Browse files Browse the repository at this point in the history
* ensure correct id for items on elasticsearch
* normalize data when updating ES
* ensure_required_metadata script improvements
* added advanced persons search
* improvements to api docs
  • Loading branch information
OriHoch authored Apr 20, 2017
2 parents fe81cbf + f62d234 commit 62e36cc
Show file tree
Hide file tree
Showing 33 changed files with 1,141 additions and 1,342 deletions.
2 changes: 1 addition & 1 deletion bhs_api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def create_app(testing=False, live=False):

# Create the elasticsearch connection
app.es = elasticsearch.Elasticsearch(conf.elasticsearch_host)
app.es_data_db_index_name = conf.elasticsearch_data_index if getattr(conf, "elasticsearch_data_index") else app.data_db.name
app.es_data_db_index_name = getattr(conf, "elasticsearch_data_index", app.data_db.name)

# Add the user's endpoints
from bhs_api.user import user_endpoints
Expand Down
67 changes: 29 additions & 38 deletions bhs_api/item.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ def enrich_item(item, db=None, collection_name=None):


def get_item_by_id(id, collection_name, db=None):
if collection_name == "persons":
raise Exception("persons collection does not support getting item by id, you need to search person using multiple fields")
if not db:
db = current_app.data_db
id_field = get_collection_id_field(collection_name)
Expand Down Expand Up @@ -346,16 +348,15 @@ def get_image_url(image_id, bucket):
return 'https://storage.googleapis.com/{}/{}.jpg'.format(bucket, image_id)


def get_collection_id_field(collection_name, is_elasticsearch=False):
def get_collection_id_field(collection_name):
doc_id = 'UnitId'
if collection_name == 'photos':
doc_id = 'PictureId'
# TODO: remove references to the genTreeIndividuals collection - it is irrelevant and not in use
elif collection_name == 'genTreeIndividuals':
doc_id = 'ID'
elif collection_name == 'persons':
# elasticsearch cannot have "id" attribute
doc_id = "PID" if is_elasticsearch else "id"
raise Exception("persons collection does not support plain id field, but a combination of fields")
elif collection_name == 'synonyms':
doc_id = '_id'
elif collection_name == 'trees':
Expand Down Expand Up @@ -412,46 +413,41 @@ def create_slug(document, collection_name):
return ret

def get_doc_id(collection_name, doc):
mongo_id_field = get_collection_id_field(collection_name, is_elasticsearch=False)
elasticsearch_id_field = get_collection_id_field(collection_name, is_elasticsearch=True)
if mongo_id_field == elasticsearch_id_field:
return doc.get(mongo_id_field)
else:
mongo_id = doc.get(mongo_id_field)
elasticsearch_id = doc.get(elasticsearch_id_field)
if mongo_id == elasticsearch_id:
return mongo_id
elif elasticsearch_id is None:
return mongo_id
elif mongo_id is None:
return elasticsearch_id
else:
raise Exception("could not find doc_id for collection {} doc {}".format(collection_name, doc))
if collection_name == "persons":
raise Exception("persons collection items don't have a single doc_id, you must match on multiple fields")
id_field = get_collection_id_field(collection_name)
return doc[id_field]


def update_es(collection_name, doc, is_new, es_index_name=None, es=None, data_db=None, app=None):
def update_es(collection_name, doc, is_new, es_index_name=None, es=None, app=None):
app = current_app if not app else app
es_index_name = app.es_data_db_index_name if not es_index_name else es_index_name
es = app.es if not es else es
data_db = app.data_db if not data_db else data_db
# index only the docs that are publicly available
if doc_show_filter(collection_name, doc):
body = deepcopy(doc)
# the given doc might come either from mongo or from elasticsearch
# here we try to get the doc id from one of them and save it in elasticsearch
elasticsearch_id_field = get_collection_id_field(collection_name, is_elasticsearch=True)
doc_id = get_doc_id(collection_name, doc)
body[elasticsearch_id_field] = doc_id
# adjust attributes for elasticsearch
if collection_name == "persons":
body["person_id"] = body.get("id", body.get("ID"))
body["first_name_lc"] = body["name_lc"][0]
body["last_name_lc"] = body["name_lc"][1]
# maps all known SEX values to normalized gender value
body["gender"] = {"F": "F", "M": "M",
None: "U", "": "U", "U": "U", "?": "U", "P": "U"}[body.get("SEX", "").strip()]
# _id field is internal to mongo
if '_id' in body:
del body['_id']
# id field has special meaning in elasticsearch (it is copied to correct attribute above in the id_field handling
# id field has special meaning in elasticsearch
if 'id' in body:
del body['id']
if "thumbnail" in body and "data" in body["thumbnail"]:
# no need to have thumbnail data in elasticsearch
# TODO: ensure we only store and use thumbnail from filesystem
del body["thumbnail"]["data"]
# persons collection gets a fake header to support searching
if collection_name == "persons":
name = " ".join(body["name"]) if isinstance(body["name"], list) else body["name"]
body["Header"] = {"En": name, "He": name}
# elasticsearch uses the header for completion field
# this field does not support empty values, so we put a string with space here
# this is most likely wrong, but works for now
Expand All @@ -460,21 +456,16 @@ def update_es(collection_name, doc, is_new, es_index_name=None, es=None, data_db
for lang in ("He", "En"):
if body["Header"].get(lang) is None:
body["Header"][lang] = '_'
if collection_name == "persons":
doc_id = "{}_{}_{}".format(body["tree_num"], body["tree_version"], body["person_id"])
else:
doc_id = get_doc_id(collection_name, body)
if is_new:
uuids_to_str(body)
es.index(index=es_index_name, doc_type=collection_name, id=doc_id, body=body)
return True, "indexed successfully"
return True, "indexed successfully (inserted)"
else:
try:
es.update(index=es_index_name, doc_type=collection_name, id=doc_id, body={"doc": body})
return True, "indexed successfully"
except elasticsearch.exceptions.NotFoundError as e:
# So it's in the DB, passes the SHOW_FILTER and not found in ES
# weird, but that's what we have.
# let's index it.
item = data_db[collection_name].find_one({'_id': doc_id})
del item['_id']
es.index(index=es_index_name, doc_type=collection_name, id=doc_id, body=item)
return True, "indexed successfully, by resorting to ES index function for {}:{} with {}".format(collection_name, doc_id, e)
es.update(index=es_index_name, doc_type=collection_name, id=doc_id, body=body)
return True, "indexed successfully (updated)"
else:
return True, "item should not be shown - so not indexed"
26 changes: 26 additions & 0 deletions bhs_api/persons.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,32 @@
"deceased",
"tree_version"]

# Query-string parameter names for the advanced persons search.
# A persons search must supply at least one of these parameters.
PERSONS_SEARCH_REQUIRES_ONE_OF = [
    "first", "last", "sex",
    "pob", "pom", "pod",
    "yob", "yom", "yod",
    "treenum",
]

# Default value for every supported persons-search parameter.
# "<name>_t" entries select the match type (default "exact");
# "<name>_v" entries hold the +/- range value used by year searches.
PERSONS_SEARCH_DEFAULT_PARAMETERS = dict(
    first=None, first_t="exact",
    last=None, last_t="exact",
    sex=None,
    pob=None, pob_t="exact",
    pom=None, pom_t="exact",
    pod=None, pod_t="exact",
    yob=None, yob_t="exact", yob_v=None,
    yom=None, yom_t="exact", yom_v=None,
    yod=None, yod_t="exact", yod_v=None,
    treenum=None,
)

# (query parameter, indexed attribute) pairs for year-based searches.
PERSONS_SEARCH_YEAR_PARAMS = (
    ("yob", "birth_year"),
    ("yod", "death_year"),
    ("yom", "marriage_years"),
)

# (query parameter, indexed attribute) pairs for text searches.
PERSONS_SEARCH_TEXT_PARAMS = (
    ("first", "first_name_lc"),
    ("last", "last_name_lc"),
    ("pob", "BIRT_PLAC_lc"),
    ("pom", "MARR_PLAC_lc"),
    ("pod", "DEAT_PLAC_lc"),
)

# (query parameter, indexed attribute) pairs matched by exact term only.
PERSONS_SEARCH_EXACT_PARAMS = (
    ("sex", "gender"),
    ("treenum", "tree_num"),
)


def is_living_person(is_deceased, birth_year):
if is_deceased:
Expand Down
114 changes: 90 additions & 24 deletions bhs_api/v1_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import pymongo
import jinja2
import requests
import traceback

from bhs_api import SEARCH_CHUNK_SIZE
from bhs_api.utils import (get_conf, gen_missing_keys_error, binarize_image,
Expand All @@ -31,6 +32,8 @@
from bhs_api.user import get_user

from bhs_api import phonetic
from bhs_api.persons import (PERSONS_SEARCH_DEFAULT_PARAMETERS, PERSONS_SEARCH_REQUIRES_ONE_OF,
PERSONS_SEARCH_YEAR_PARAMS, PERSONS_SEARCH_TEXT_PARAMS, PERSONS_SEARCH_EXACT_PARAMS)

v1_endpoints = Blueprint('v1', __name__)

Expand All @@ -51,19 +54,79 @@ def custom_error(error):
'''


def es_search(q, size, collection=None, from_=0, sort=None, with_persons=False):
def es_search(q, size, collection=None, from_=0, sort=None, with_persons=False, **kwargs):
if collection:
# if user requested specific collections - we don't filter for persons (that's what user asked for!)
collections = collection.split(",")
else:
# we consider the with_persons to decide whether to include persons collection or not
collections = [collection for collection in SEARCHABLE_COLLECTIONS
if with_persons or collection != "persons"]
body = {"query": {"query_string": {
"fields": ['Header.En^2', 'Header.He^2', 'UnitText1.En', 'UnitText1.He'],
"query": q,
"default_operator": "and"
}}}
default_query = {
"query_string": {
"fields": ["Header.En^2", "Header.He^2", "UnitText1.En", "UnitText1.He"],
"query": q,
"default_operator": "and"
}
}

if collection == "persons":
must_queries = []
if q:
must_queries.append(default_query)
for year_param, year_attr in PERSONS_SEARCH_YEAR_PARAMS:
if kwargs[year_param]:
try:
year_value = int(kwargs[year_param])
except Exception as e:
raise Exception("invalid value for {} ({}): {}".format(year_param, year_attr, kwargs[year_param]))
year_type_param = "{}_t".format(year_param)
year_type = kwargs[year_type_param]
if year_type == "pmyears":
year_type_value_param = "{}_v".format(year_param)
try:
year_type_value = int(kwargs[year_type_value_param])
except Exception as e:
raise Exception("invalid value for {} ({}): {}".format(year_type_value_param, year_attr, kwargs[year_type_value_param]))
must_queries.append({"range": {year_attr: {"gte": year_value - year_type_value, "lte": year_value + year_type_value,}}})
elif year_type == "exact":
must_queries.append({"term": {year_attr: year_value}})
else:
raise Exception("invalid value for {} ({}): {}".format(year_type_param, year_attr, year_type))
for text_param, text_attr in PERSONS_SEARCH_TEXT_PARAMS:
if kwargs[text_param]:
text_value = kwargs[text_param]
text_type_param = "{}_t".format(text_param)
text_type = kwargs[text_type_param]
if text_type == "exact":
must_queries.append({"term": {text_attr: text_value}})
elif text_type == "like":
must_queries.append({"match": {text_attr: {"query": text_value,
"fuzziness": "AUTO"}}})
elif text_type == "starts":
must_queries.append({"prefix": {text_attr: text_value}})
else:
raise Exception("invalid value for {} ({}): {}".format(text_type, text_attr, text_type))
for exact_param, exact_attr in PERSONS_SEARCH_EXACT_PARAMS:
if kwargs[exact_param]:
exact_value = kwargs[exact_param]
if exact_param == "sex" and exact_value not in ("F", "M", "U"):
raise Exception ("invalid value for {} ({}): {}".format(exact_param, exact_attr, exact_value))
elif exact_param == "treenum":
try:
exact_value = int(exact_value)
except Exception as e:
raise Exception("invalid value for {} ({}): {}".format(exact_param, exact_attr, exact_value))
must_queries.append({"term": {exact_attr: exact_value}})
body = {
"query": {
"bool": {
"must": must_queries
}
}
}
else:
body = {"query": default_query}
if sort == "abc":
if phonetic.is_hebrew(q.strip()):
# hebrew alphabetical sort
Expand All @@ -77,15 +140,13 @@ def es_search(q, size, collection=None, from_=0, sort=None, with_persons=False):
elif sort == "year" and collection == "photoUnits":
body["sort"] = [{"UnitPeriod.PeriodStartDate.keyword": "asc"}, "_score"]
try:
try:
collection = collection.split(',')
except:
pass
current_app.logger.debug("es.search index={}, doc_type={} body={}".format(current_app.es_data_db_index_name, collections, json.dumps(body)))
results = current_app.es.search(index=current_app.es_data_db_index_name, body=body, doc_type=collections, size=size, from_=from_)
except elasticsearch.exceptions.ConnectionError as e:
current_app.logger.error('Error connecting to Elasticsearch: {}'.format(e.error))
return None
current_app.logger.error('Error connecting to Elasticsearch: {}'.format(e))
raise Exception("Error connecting to Elasticsearch: {}".format(e))
except Exception as e:
raise Exception("Elasticsearch error: {}".format(e))
return results

def _generate_credits(fn='credits.html'):
Expand Down Expand Up @@ -373,22 +434,26 @@ def save_user_content():
def general_search():
args = request.args
parameters = {'collection': None, 'size': SEARCH_CHUNK_SIZE, 'from_': 0, 'q': None, 'sort': None, "with_persons": False}
parameters.update(PERSONS_SEARCH_DEFAULT_PARAMETERS)
got_one_of_required_persons_params = False
for param in parameters.keys():
if param in args:
if param == "with_persons":
parameters[param] = args[param].lower() in ["1", "yes", "true"]
else:
parameters[param] = args[param]
if not parameters['q']:
abort(400, 'You must specify a search query')
else:
rv = es_search(**parameters)
if not rv:
abort(500, 'Sorry, the search cluster appears to be down')
else:
for item in rv['hits']['hits']:
enrich_item(item['_source'], collection_name=item['_type'])
if param in PERSONS_SEARCH_REQUIRES_ONE_OF and parameters[param]:
got_one_of_required_persons_params = True
if parameters["q"] or (parameters["collection"] == "persons" and got_one_of_required_persons_params):
try:
rv = es_search(**parameters)
except Exception as e:
return humanify({"error": e.message}, 500)
for item in rv['hits']['hits']:
enrich_item(item['_source'], collection_name=item['_type'])
return humanify(rv)
else:
return humanify({"error": "You must specify a search query"}, 400)

@v1_endpoints.route('/wsearch')
def wizard_search():
Expand Down Expand Up @@ -449,20 +514,21 @@ def get_suggestions(collection,string):
'''
rv = {}
try:
unlistify_item = lambda i: " ".join(i) if isinstance(i, (tuple, list)) else i
if collection == "*":
rv['starts_with'], rv['phonetic'] = get_completion_all_collections(string)
rv['contains'] = {}
# make all the words in the suggestion start with a capital letter
rv = {k: {kk: [i.title() for i in vv] for kk, vv in v.items()} for k, v in rv.items()}
rv = {k: {kk: [unlistify_item(i).title() for i in vv] for kk, vv in v.items()} for k, v in rv.items()}
return humanify(rv)
else:
rv['starts_with'], rv['phonetic'] = get_completion(collection, string)
rv['contains'] = []
# make all the words in the suggestion start with a capital letter
rv = {k: [i.title() for i in v] for k, v in rv.items()}
rv = {k: [unlistify_item(i).title() for i in v] for k, v in rv.items()}
return humanify(rv)
except Exception, e:
return humanify({"error": "unexpected exception getting completion data: {}".format(e)}, 500)
return humanify({"error": "unexpected exception getting completion data: {}".format(e), "traceback": traceback.format_exc()}, 500)



Expand Down
4 changes: 3 additions & 1 deletion conf/app_server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@ image_bucket: bhs-flat-pics
thumbnail_bucket: bhs-thumbnails
ftree_bucket_url: https://storage.googleapis.com/bhs-familytrees
video_bucket_url: https://storage.googleapis.com/bhs-videos

# elastic search
elasticsearch_host: localhost
elasticsearch_data_index:
elasticsearch_data_index: bhdata

# redis
redis_host: localhost
redis_port: 6379
Expand Down
2 changes: 1 addition & 1 deletion conf/bhs_api_site
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ server {

location /v1/docs {
default_type text/html;
alias /home/bhs/api/docs/index.html;
alias /home/bhs/api/docs/_build/index.html;
}

location / {
Expand Down
5 changes: 5 additions & 0 deletions docs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# dbs-back documentation hub

## API documentation

The API documentation is served at http://devapi.dbs.bh.org.il/v1/docs
7 changes: 7 additions & 0 deletions docs/_build/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# built documentation files

This directory contains the api documentation files built from the _sources files

**Important Notice**

If you just want to read the documentation, go to https://github.com/Beit-Hatfutsot/dbs-back/tree/dev/docs
Loading

0 comments on commit 62e36cc

Please sign in to comment.