From 6db77f3203c1f0f0352fc44f46898d0bf44601e1 Mon Sep 17 00:00:00 2001 From: Ben Silverman Date: Wed, 31 Jan 2024 17:12:35 -0500 Subject: [PATCH] Rewrite single volume search API backend (#956) --- apps/readux/search.py | 242 +++++++++++++++++++++---------- apps/readux/tests/test_search.py | 104 ++++++++----- 2 files changed, 237 insertions(+), 109 deletions(-) diff --git a/apps/readux/search.py b/apps/readux/search.py index b847e8dc8..b3d46dd3c 100644 --- a/apps/readux/search.py +++ b/apps/readux/search.py @@ -1,12 +1,14 @@ """Search Endpoint(s)""" -import json +from itertools import groupby +import re from django.http import JsonResponse from django.views import View -from django.db.models import Count, Q -from django.contrib.postgres.search import SearchVector, SearchQuery, SearchRank -from ..iiif.annotations.models import Annotation -from ..iiif.manifests.models import Manifest -from .models import UserAnnotation +from elasticsearch_dsl import Q +from more_itertools import flatten + +from apps.iiif.manifests.documents import ManifestDocument +from apps.readux.documents import UserAnnotationDocument +from apps.readux.templatetags.readux_extras import has_inner_hits, group_by_canvas class SearchManifestCanvas(View): @@ -14,82 +16,179 @@ class SearchManifestCanvas(View): Endpoint for text search of manifest :rtype json """ + + # regex to match exact search terms in doublequotes + re_exact_match = re.compile(r'\B(".+?")\B') + def get_queryresults(self): """ Build query results. 
:return: [description] :rtype: JSON """ - manifest = Manifest.objects.get(pid=self.request.GET['volume']) - annotations = Annotation.objects.filter( - canvas__manifest__label=manifest.label - ) - user_annotations = UserAnnotation.objects.filter( - owner_id=self.request.user.id - ).filter( - canvas__manifest__label=manifest.label + # get search params + volume_pid = self.request.GET.get("volume_id") or "" + canvas_pid = self.request.GET.get("canvas_id") or "" + search_query = self.request.GET.get("keyword") or "" + + # find exact match queries (words or phrases in double quotes) + exact_queries = self.re_exact_match.findall(search_query) + # remove exact queries from the original search query to search separately + search_query = re.sub(self.re_exact_match, "", search_query).strip() + + # set up elasticsearch queries for searching the volume full text + vol_queries = [] + vol_queries_exact = [] + + # technically we are using the search for all volumes here, + # but we are filtering to only one volume and then getting inner hits + volumes = ManifestDocument.search() + # filter to only volume matching pid + volumes = volumes.filter("term", pid=volume_pid) + + # build query for nested fields (i.e. 
canvas position and text) + nested_kwargs = { + "path": "canvas_set", + "score_mode": "sum", + } + inner_hits_dict = { + "size": 100, # max number of pages shown in full-text results + "highlight": {"fields": {"canvas_set.result": {}}}, + } + + # get nested inner hits on canvas (for partial match portion of query) + nested_query = Q( + "nested", + query=Q( + "multi_match", + query=search_query, + fields=["canvas_set.result"], + ), + inner_hits={**inner_hits_dict, "name": "canvases"}, + **nested_kwargs, ) - fuzzy_search = Q() - query = SearchQuery('') - vector = SearchVector('content') - search_type = self.request.GET['type'] - search_strings = self.request.GET['query'].split() + vol_queries.append(nested_query) + + for i, exq in enumerate(exact_queries): + # separate exact searches so we can put them in "must" boolean query + nested_exact = Q( + "nested", + query=Q( + "multi_match", + query=exq.replace('"', "").strip(), + fields=["canvas_set.result"], + type="phrase", + ), + # each inner_hits set needs to have a different name in elasticsearch + inner_hits={**inner_hits_dict, "name": f"canvases_{i}"}, + **nested_kwargs, + ) + vol_queries_exact.append({"bool": {"should": [nested_exact]}}) + + # combine exact and partial with bool: { should, must } + q = Q("bool", should=vol_queries, must=vol_queries_exact) + volumes = volumes.query(q) + + # execute the search + response = volumes.execute() + + # group inner hits by canvas and collect highlighted context + volume_matches = [] + total_matches_on_canvas = 0 + total_matches_in_volume = 0 + if int(response.hits.total.value): + volume = response.hits[0] + if has_inner_hits(volume): + for canvas in group_by_canvas(volume.meta.inner_hits, limit=100): + volume_matches.append( + { + "canvas_index": canvas["position"], + "canvas_match_count": len(canvas["highlights"]), + "canvas_pid": canvas["pid"], + "context": canvas["highlights"], + } + ) + total_matches_in_volume += len(canvas["highlights"]) + if canvas_pid and 
canvas["pid"] == canvas_pid: + total_matches_on_canvas = len(canvas["highlights"]) + + # JSON-serializable results results = { - 'search_terms': search_strings, - 'ocr_annotations': [], - 'user_annotations': [] + "matches_in_text": { + "total_matches_on_canvas": total_matches_on_canvas, + "total_matches_in_volume": total_matches_in_volume, + "volume_matches": volume_matches, + } } - if search_strings: - if search_type == 'partial': - for search_string in search_strings: - query = query | SearchQuery(search_string) - fuzzy_search |= Q(content__icontains=search_string) - annotations = annotations.filter(fuzzy_search) - user_annotations = user_annotations.filter(fuzzy_search) - else: - for search_string in search_strings: - query = query | SearchQuery(search_string) - - annotations = annotations.annotate( - search=vector - ).filter( - search=query + # ------------------------------------------------------------ + # Now, search for UserAnnotations + annotations = UserAnnotationDocument.search() + + # filter to only owner matching user, volume matching pid + annotations = annotations.filter( + "term", owner_username=self.request.user.username + ).filter("term", manifest_pid=volume_pid) + + # search for partial matches + anno_queries = [Q("multi_match", query=search_query, fields=["content"])] + + # search for exact matches + anno_queries_exact = [] + for i, exq in enumerate(exact_queries): + # separate exact searches so we can put them in "must" boolean query + eq = exq.replace('"', "").strip() + nested_exact = Q("multi_match", query=eq, fields=["content"], type="phrase") + anno_queries_exact.append({"bool": {"should": [nested_exact]}}) + + # combine exact and partial with bool: { should, must } + q = Q("bool", should=anno_queries, must=anno_queries_exact) + annotations = annotations.query(q) + annotations = annotations.highlight("content") + + # execute the search + anno_response = annotations.execute() + + # collect metadata and highlighted context from hits + 
annotation_match_count = int(anno_response.hits.total.value) + annotation_matches = [] + if annotation_match_count: + for anno in anno_response.hits: + annotation_matches.append( + { + "canvas_index": anno["canvas_index"], + "canvas_pid": anno["canvas_pid"], + "context": list(anno.meta.highlight.content), + } ) - user_annotations = user_annotations.annotate( - search=vector - ).filter( - search=query - ) + # group annotations by canvas: key on (pid, index) so each group reports its own canvas_index + # TODO: is there a way to do this without iterating over the same data multiple times? + # maybe some kind of aggregation in elastic? + group_key = lambda a: (a["canvas_pid"], a["canvas_index"]) + annotation_matches.sort(key=group_key) + annotation_matches_grouped = [] + annotation_matches_on_canvas = 0 + annotation_matches_in_volume = 0 + for (pid, index), v in groupby(annotation_matches, key=group_key): + canvas_matches = list(flatten([match["context"] for match in v])) + annotation_matches_grouped.append( + { + "canvas_index": index, + "canvas_match_count": len(canvas_matches), + "canvas_pid": pid, + "context": canvas_matches, + } + ) + annotation_matches_in_volume += len(canvas_matches) + if canvas_pid and pid == canvas_pid: + annotation_matches_on_canvas = len(canvas_matches) - annotation_results = annotations.values( - 'canvas__position', - 'canvas__manifest__pid', - 'canvas__pid' - ).annotate( - Count('canvas__position') - ).order_by( - 'canvas__position' - ).exclude( - resource_type='dctypes:Text' - ).distinct() - - for annotation in annotation_results: - results['ocr_annotations'].append(json.dumps(annotation)) - - user_annotation_results = user_annotations.values( - 'canvas__position', - 'canvas__manifest__pid', - 'canvas__pid' - ).annotate( - rank=SearchRank(vector, query) - ).order_by( - '-rank' - ).distinct() - - for ua_annotation in user_annotation_results: - results['user_annotations'].append(json.dumps(ua_annotation)) + results["matches_in_annotations"] = { + "total_matches_on_canvas": annotation_matches_on_canvas, + 
"total_matches_in_volume": annotation_matches_in_volume, + "volume_matches": annotation_matches_grouped, + } return results @@ -98,7 +197,4 @@ def get(self, request, *args, **kwargs): # pylint: disable = unused-argument Respond to GET requests for search queries. :rtype: JsonResponse """ - return JsonResponse( - status=200, - data=self.get_queryresults() - ) + return JsonResponse(status=200, data=self.get_queryresults()) diff --git a/apps/readux/tests/test_search.py b/apps/readux/tests/test_search.py index cfe8b1a63..4d8c8d9c5 100644 --- a/apps/readux/tests/test_search.py +++ b/apps/readux/tests/test_search.py @@ -5,6 +5,7 @@ from django.test import TestCase, Client from django.test import RequestFactory from django.urls import reverse +from django_elasticsearch_dsl.test import ESTestCase from apps.users.tests.factories import UserFactory from ...iiif.manifests.tests.factories import ManifestFactory from ...iiif.canvases.tests.factories import CanvasFactory @@ -13,11 +14,12 @@ from .factories import UserAnnotationFactory -class TestReaduxPageDetailSearch(TestCase): +class TestReaduxPageDetailSearch(ESTestCase, TestCase): """ Test page search. """ def setUp(self): + super().setUp() self.search_manifest_view = SearchManifestCanvas.as_view() self.request = RequestFactory() self.volume = ManifestFactory.create() @@ -33,9 +35,13 @@ def setUp(self): # # Delete the canvas created by the ManifestFactory to ensure a clean set. 
original_canvas.delete() for _ in [1, 2]: - self.add_annotations(self.volume.canvas_set.get(position=1)) + canvas = self.volume.canvas_set.get(position=1) + self.add_annotations(canvas) + canvas.save() for _ in [1, 2, 3]: - self.add_annotations(self.volume.canvas_set.get(position=2)) + canvas = self.volume.canvas_set.get(position=2) + self.add_annotations(canvas) + canvas.save() # pylint: enable = unused-variable @@ -46,12 +52,12 @@ def add_annotations(self, canvas): """Add OCR and User annotations to a canvas.""" AnnotationFactory.create( canvas=canvas, - content='stankonia', + content='stinking', owner=self.ocr_user ) UserAnnotationFactory.create( canvas=canvas, - content='Aquemini', + content='outcasts', owner=self.user ) @@ -65,80 +71,106 @@ def load_results(self, response): return json.loads(response.content.decode('UTF-8-sig')) def test_manifest_canvas_ocr_partial_search(self): - query_params = {'volume': self.volume.pid, 'type': 'partial', 'query': 'stank'} + query_params = {'volume_id': self.volume.pid, 'keyword': 'stink'} request = self.request.get( self.url, query_params ) request.user = UserFactory.create() response = self.search_manifest_view(request) search_results = self.load_results(response) - assert len(search_results['ocr_annotations']) == 2 - assert len(search_results['user_annotations']) == 0 - assert search_results['search_terms'] == 'stank'.split() - assert json.loads(search_results['ocr_annotations'][0])['canvas__position'] == 1 - assert json.loads(search_results['ocr_annotations'][1])['canvas__position'] == 2 - assert json.loads(search_results['ocr_annotations'][0])['canvas__position__count'] == 2 - assert json.loads(search_results['ocr_annotations'][1])['canvas__position__count'] == 3 + + # two hits in text, no hits in annotations + assert search_results['matches_in_text']['total_matches_in_volume'] == 2 + assert search_results['matches_in_annotations']['total_matches_in_volume'] == 0 + for match in 
search_results['matches_in_text']['volume_matches']: + # should be in canvas indices 1 and 2 + assert match["canvas_index"] in [1, 2] + if match["canvas_index"] == 1: + # 2 matches in first canvas + # NOTE: OCR annotations are indexed as a single block of text per canvas. in this + # case, the terms appeared so close together they are grouped into one result, but + # still highlighted individually, thus two <em>s. + assert match["context"][0].count("<em>") == 2 + elif match["canvas_index"] == 2: + # 3 matches in second canvas + assert match["context"][0].count("<em>") == 3 def test_manifest_canvas_ocr_exact_search(self): - query_params = {'volume': self.volume.pid, 'type': 'exact', 'query': 'stankonia'} + query_params = {'volume_id': self.volume.pid, 'keyword': '"stinking"'} request = self.request.get( self.url, query_params ) request.user = UserFactory.create() response = self.search_manifest_view(request) search_results = self.load_results(response) - assert len(search_results['ocr_annotations']) == 2 - assert len(search_results['user_annotations']) == 0 - assert json.loads(search_results['ocr_annotations'][0])['canvas__position'] == 1 - assert json.loads(search_results['ocr_annotations'][1])['canvas__position'] == 2 - assert json.loads(search_results['ocr_annotations'][0])['canvas__position__count'] == 2 - assert json.loads(search_results['ocr_annotations'][1])['canvas__position__count'] == 3 + # two hits in text, no hits in annotations + assert search_results['matches_in_text']['total_matches_in_volume'] == 2 + assert search_results['matches_in_annotations']['total_matches_in_volume'] == 0 + for match in search_results['matches_in_text']['volume_matches']: + # should be in canvas indices 1 and 2 + assert match["canvas_index"] in [1, 2] + if match["canvas_index"] == 1: + # 2 matches in first canvas; so close together they are grouped into one result + assert match["context"][0].count("<em>") == 2 + elif match["canvas_index"] == 2: + # 3 matches in second canvas; so close 
together they are grouped into one result + assert match["context"][0].count("<em>") == 3 def test_manifest_canvas_ocr_exact_search_no_results(self): - query_params = {'volume': self.volume.pid, 'type': 'exact', 'query': 'Idlewild'} + query_params = {'volume_id': self.volume.pid, 'keyword': '"Idlewild"'} request = self.request.get( self.url, query_params ) request.user = UserFactory.create() response = self.search_manifest_view(request) search_results = self.load_results(response) - assert len(search_results['ocr_annotations']) == 0 - assert len(search_results['user_annotations']) == 0 + assert search_results['matches_in_text']['total_matches_in_volume'] == 0 + assert search_results['matches_in_annotations']['total_matches_in_volume'] == 0 def test_manifest_canvas_user_annotation_partial_search(self): - query_params = {'volume': self.volume.pid, 'type': 'partial', 'query': 'Aqu'} + query_params = {'volume_id': self.volume.pid, 'keyword': 'outcast'} request = self.request.get( self.url, query_params ) request.user = self.user response = self.search_manifest_view(request) search_results = self.load_results(response) - assert len(search_results['ocr_annotations']) == 0 - assert len(search_results['user_annotations']) == 2 - assert json.loads(search_results['user_annotations'][0])['canvas__position'] == 1 - assert json.loads(search_results['user_annotations'][1])['canvas__position'] == 2 + print(search_results) + assert search_results['matches_in_text']['total_matches_in_volume'] == 0 + # NOTE: since user annotations are indexed individually (unlike OCR annotations which + # are grouped by canvas), each one gets a separate hit, so the total matches is 5. + # however, in the search results, they will be grouped by canvas. thus, len(matches) is 2, + # one per canvas, while total matches remains 5. 
+ assert search_results['matches_in_annotations']['total_matches_in_volume'] == 5 + assert len(search_results['matches_in_annotations']['volume_matches']) == 2 + for match in search_results['matches_in_annotations']['volume_matches']: + # should be in canvas indices 1 and 2 + assert match["canvas_index"] in [1, 2] def test_manifest_canvas_user_annotation_exact_search(self): - query_params = {'volume': self.volume.pid, 'type': 'exact', 'query': 'Aquemini'} + query_params = {'volume_id': self.volume.pid, 'keyword': '"outcasts"'} request = self.request.get( self.url, query_params ) request.user = self.user response = self.search_manifest_view(request) search_results = self.load_results(response) - assert len(search_results['ocr_annotations']) == 0 - assert len(search_results['user_annotations']) == 2 - assert json.loads(search_results['user_annotations'][0])['canvas__position'] == 1 - assert json.loads(search_results['user_annotations'][1])['canvas__position'] == 2 + print(search_results) + assert search_results['matches_in_text']['total_matches_in_volume'] == 0 + # NOTE: see above note about user annotations vs OCR annotations. 
+ assert search_results['matches_in_annotations']['total_matches_in_volume'] == 5 + assert len(search_results['matches_in_annotations']['volume_matches']) == 2 + for match in search_results['matches_in_annotations']['volume_matches']: + # should be in canvas indices 1 and 2 + assert match["canvas_index"] in [1, 2] def test_manifest_canvas_user_annotation_exact_search_no_results(self): - query_params = {'volume': self.volume.pid, 'type': 'exact', 'query': 'Idlewild'} + query_params = {'volume_id': self.volume.pid, 'keyword': '"Idlewild"'} request = self.request.get( self.url, query_params ) request.user = self.user response = self.search_manifest_view(request) search_results = self.load_results(response) - assert len(search_results['ocr_annotations']) == 0 - assert len(search_results['user_annotations']) == 0 + assert search_results['matches_in_text']['total_matches_in_volume'] == 0 + assert search_results['matches_in_annotations']['total_matches_in_volume'] == 0