diff --git a/webhook/document_extract_test.py b/webhook/document_extract_test.py index 01da34e..3ffc570 100644 --- a/webhook/document_extract_test.py +++ b/webhook/document_extract_test.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import pytest import backoff import os from google.cloud import vision @@ -27,6 +28,7 @@ # System / integration test +@pytest.mark.skip(reason="#55 : OCR takes too long and is flaky") @backoff.on_exception(backoff.expo, Exception, max_tries=3) def test_async_document_extract_system(capsys): out = document_extract.async_document_extract( diff --git a/webhook/main.py b/webhook/main.py index c9a9949..1f5738d 100644 --- a/webhook/main.py +++ b/webhook/main.py @@ -129,15 +129,23 @@ def cloud_event_entrypoint(event_id, bucket, name, time_created): def summarization_entrypoint( - name, - extracted_text, - time_created, - bucket=None, - event_id=None, -): + name: str, + extracted_text: str, + time_created: datetime.time, + bucket: str = None, + event_id: str = None, +) -> str: logging_client = logging.Client() logger = logging_client.logger(_FUNCTIONS_VERTEX_EVENT_LOGGER) + if len(extracted_text) == 0: + logger.log(f"""cloud_event_id({event_id}): BAD INPUT +No characters recognized from PDF and so the PDF cannot be +summarized. Be sure to upload a high-quality PDF that contains 'Abstract' and +'Conclusion' sections. + """, severity="ERROR") + return "" + complete_text_filename = f'summaries/{name.replace(".pdf", "")}_fulltext.txt' upload_to_gcs( _OUTPUT_BUCKET, @@ -149,7 +157,7 @@ def summarization_entrypoint( severity="INFO", ) - extracted_text_trunc = truncate_complete_text(extracted_text) + extracted_text_trunc = truncate_complete_text(extracted_text, _FUNCTIONS_VERTEX_EVENT_LOGGER) summary = predict_large_language_model( project_id=_PROJECT_ID, model_name=_MODEL_NAME, diff --git a/webhook/services_test.py b/webhook/services_test.py index f09d528..91f4203 100644 --- a/webhook/services_test.py +++ b/webhook/services_test.py @@ -60,7 +60,7 @@ def test_up16_services(): assert check_blob_exists(_OUTPUT_BUCKET, complete_text_filename) # TODO(erschmid): replace truncate with better solution - extracted_text_ = truncate_complete_text(extracted_text) + extracted_text_ = truncate_complete_text(extracted_text, "test_logger") summary = predict_large_language_model( project_id=_PROJECT_ID, model_name=_MODEL_NAME, diff --git a/webhook/utils.py b/webhook/utils.py index 6d1d3ac..1989fb6 100644 --- a/webhook/utils.py +++ b/webhook/utils.py @@ -12,14 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. +from google.cloud import logging import datetime import re -ABSTRACT_LENGTH = 150 * 10 # Abstract recommended max word length * avg 10 letters long -CONCLUSION_LENGTH = 200 * 10 # Conclusion max word legnth * avg 10 letters long +ABSTRACT_LENGTH = 150 * 8 # Abstract recommended max word length * avg 8 letters long +CONCLUSION_LENGTH = 200 * 8 # Conclusion max word length * avg 8 letters long ABSTRACT_H1 = "abstract" CONCLUSION_H1 = "conclusion" +CONTENT_ERROR_MESSAGE = """ +Uploaded PDF doesn't contain an abstract or conclusion paragraph. The +document summarization pipeline will attempt a best effort at summarizing +the PDF. Your results might vary in quality. + +For best results, use a single-column, academic paper that contains both +a labeled 'Abstract' and 'Conclusion' section. +""" + def coerce_datetime_zulu(input_datetime: datetime.datetime): """Force datetime into specific format. @@ -40,13 +50,15 @@ def coerce_datetime_zulu(input_datetime: datetime.datetime): ) -def truncate_complete_text(complete_text: str) -> str: +def truncate_complete_text(complete_text: str, logger_name: str) -> str: """Extracts the abstract and conclusion from an academic paper. Uses a heuristics to approximate the extent of the abstract and conclusion. For abstract: assumes beginning after the string `abstract` and extends for 6-7 sentences For conclusion: assumes beginning after the string `conclusion` and extends for 7-9 sentences + #56 : Improve this function + Args: complete_text (str): the complete text of the academic paper @@ -55,8 +67,20 @@ def truncate_complete_text(complete_text: str) -> str: """ complete_text = complete_text.lower() abstract_start = complete_text.find(ABSTRACT_H1) + + # If no "Abstract" heading found, produce the entire text + if abstract_start == -1: + abstract_start = 0 + log_content_error(logger_name=logger_name) + conclusion_start = complete_text.find(CONCLUSION_H1) + # If no "Conclusion" heading found, produce the last little bit + # of the text + if conclusion_start == -1: + conclusion_start = len(complete_text) - (CONCLUSION_LENGTH) + log_content_error(logger_name=logger_name) + abstract = complete_text[abstract_start:ABSTRACT_LENGTH] conclusion = complete_text[conclusion_start:] if len(conclusion) > CONCLUSION_LENGTH: @@ -67,3 +91,9 @@ def truncate_complete_text(complete_text: str) -> str: Conclusion: {conclusion} """ + + +def log_content_error(logger_name: str): + logging_client = logging.Client() + logger = logging_client.logger(logger_name) + logger.log(CONTENT_ERROR_MESSAGE, severity="WARNING") diff --git a/webhook/utils_test.py b/webhook/utils_test.py new file mode 100644 index 0000000..7625a0d --- /dev/null +++ b/webhook/utils_test.py @@ -0,0 +1,143 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from utils import truncate_complete_text +from unittest.mock import MagicMock, patch + +from google.cloud import logging + + +def test_truncate_complete_test(): + complete_text = """ +This is a test paper + +Abstract + +This is an abstract. An abstract provides an overview of the +academic paper. A good abstract is usually about 150 words long. They can +sometimes be longer. They can sometimes be shorter. An abstract should help the +reader get the gist of the academic paper without having to read the entire +paper. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + +Conclusion + +This is a conclusion. It describes the results of the academic paper that +precedes it. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. + """ + + extracted_text = truncate_complete_text(complete_text=complete_text, logger_name="fake_logger") + + assert "this is an abstract" in extracted_text + assert "this is a conclusion" in extracted_text + + +@patch.object(logging, "Client") +def test_truncate_complete_text_no_abstract_or_conclusion(mock_logging): + + # Set up + mock_logging_client = MagicMock(spec=logging.Client) + mock_logging.return_value = mock_logging_client + + mock_logger = MagicMock(spec=logging.Logger) + mock_logging_client.logger = mock_logger + + # Act + extracted_text = truncate_complete_text("This is a bad input", "fake_logger") + + # Assert + mock_logger.assert_called() + assert "this is a bad input" in extracted_text