Skip to content

Commit

Permalink
fix: handle empty string case in extraction util (#54)
Browse files Browse the repository at this point in the history
  • Loading branch information
telpirion authored Jul 27, 2023
1 parent 7aa2990 commit cb958b5
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 11 deletions.
2 changes: 2 additions & 0 deletions webhook/document_extract_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import backoff
import os
from google.cloud import vision
Expand All @@ -27,6 +28,7 @@


# System / integration test
@pytest.mark.skip(reason="#55 : OCR takes too long and is flaky")
@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def test_async_document_extract_system(capsys):
out = document_extract.async_document_extract(
Expand Down
22 changes: 15 additions & 7 deletions webhook/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,23 @@ def cloud_event_entrypoint(event_id, bucket, name, time_created):


def summarization_entrypoint(
name,
extracted_text,
time_created,
bucket=None,
event_id=None,
):
name: str,
extracted_text: str,
time_created: datetime.time,
bucket: str = None,
event_id: str = None,
) -> str:
logging_client = logging.Client()
logger = logging_client.logger(_FUNCTIONS_VERTEX_EVENT_LOGGER)

if len(extracted_text) == 0:
logger.log(f"""cloud_event_id({event_id}): BAD INPUT
No characters recognized from PDF and so the PDF cannot be
summarized. Be sure to upload a high-quality PDF that contains 'Abstract' and
'Conclusion' sections.
""", severity="ERROR")
return ""

complete_text_filename = f'summaries/{name.replace(".pdf", "")}_fulltext.txt'
upload_to_gcs(
_OUTPUT_BUCKET,
Expand All @@ -149,7 +157,7 @@ def summarization_entrypoint(
severity="INFO",
)

extracted_text_trunc = truncate_complete_text(extracted_text)
extracted_text_trunc = truncate_complete_text(extracted_text, _FUNCTIONS_VERTEX_EVENT_LOGGER)
summary = predict_large_language_model(
project_id=_PROJECT_ID,
model_name=_MODEL_NAME,
Expand Down
2 changes: 1 addition & 1 deletion webhook/services_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_up16_services():
assert check_blob_exists(_OUTPUT_BUCKET, complete_text_filename)

# TODO(erschmid): replace truncate with better solution
extracted_text_ = truncate_complete_text(extracted_text)
extracted_text_ = truncate_complete_text(extracted_text, "test_logger")
summary = predict_large_language_model(
project_id=_PROJECT_ID,
model_name=_MODEL_NAME,
Expand Down
36 changes: 33 additions & 3 deletions webhook/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud import logging
import datetime
import re

ABSTRACT_LENGTH = 150 * 10 # Abstract recommended max word length * avg 10 letters long
CONCLUSION_LENGTH = 200 * 10 # Conclusion max word legnth * avg 10 letters long
ABSTRACT_LENGTH = 150 * 8 # Abstract recommended max word length * avg 8 letters long
CONCLUSION_LENGTH = 200 * 8 # Conclusion max word length * avg 8 letters long
ABSTRACT_H1 = "abstract"
CONCLUSION_H1 = "conclusion"

CONTENT_ERROR_MESSAGE = """
Uploaded PDF doesn't contain an abstract or conclusion paragraph. The
document summarization pipeline will attempt a best effort at summarizing
the PDF. Your results might vary in quality.
For best results, use a single-column, academic paper that contains both
a labeled 'Abstract' and 'Conclusion' section.
"""


def coerce_datetime_zulu(input_datetime: datetime.datetime):
"""Force datetime into specific format.
Expand All @@ -40,13 +50,15 @@ def coerce_datetime_zulu(input_datetime: datetime.datetime):
)


def truncate_complete_text(complete_text: str) -> str:
def truncate_complete_text(complete_text: str, logger_name: str) -> str:
"""Extracts the abstract and conclusion from an academic paper.
Uses heuristics to approximate the extent of the abstract and conclusion.
For abstract: assumes beginning after the string `abstract` and extends for 6-7 sentences
For conclusion: assumes beginning after the string `conclusion` and extends for 7-9 sentences
#56 : Improve this function
Args:
complete_text (str): the complete text of the academic paper
Expand All @@ -55,8 +67,20 @@ def truncate_complete_text(complete_text: str) -> str:
"""
complete_text = complete_text.lower()
abstract_start = complete_text.find(ABSTRACT_H1)

# If no "Abstract" heading found, produce the entire text
if abstract_start == -1:
abstract_start = 0
log_content_error(logger_name=logger_name)

conclusion_start = complete_text.find(CONCLUSION_H1)

# If no "Conclusion" heading found, produce the last little bit
# of the text
if conclusion_start == -1:
conclusion_start = len(complete_text) - (CONCLUSION_LENGTH)
log_content_error(logger_name=logger_name)

abstract = complete_text[abstract_start:ABSTRACT_LENGTH]
conclusion = complete_text[conclusion_start:]
if len(conclusion) > CONCLUSION_LENGTH:
Expand All @@ -67,3 +91,9 @@ def truncate_complete_text(complete_text: str) -> str:
Conclusion: {conclusion}
"""


def log_content_error(logger_name: str):
    """Write CONTENT_ERROR_MESSAGE to the named logger at WARNING severity.

    A new logging client is constructed on every call; the logger is looked
    up from that client by name.

    Args:
        logger_name (str): name of the logger that receives the warning
    """
    content_logger = logging.Client().logger(logger_name)
    content_logger.log(CONTENT_ERROR_MESSAGE, severity="WARNING")
143 changes: 143 additions & 0 deletions webhook/utils_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from utils import truncate_complete_text
from unittest.mock import MagicMock, patch

from google.cloud import logging


def test_truncate_complete_test():
    """Check that a labeled abstract and conclusion both survive truncation.

    The fixture contains an 'Abstract' heading followed by two filler
    paragraphs and a 'Conclusion' heading followed by ten filler paragraphs.
    truncate_complete_text lower-cases its input, so the expected phrases
    are written in lower case.
    """
    paper_text = """
This is a test paper
Abstract
This is an abstract. An abstract provides an overview of the
academic paper. A good abstract is usually about 150 words long. They can
sometimes be longer. They can sometimes be shorter. An abstract should help the
reader get the gist of the academic paper without having to read the entire
paper.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Conclusion
This is a conclusion. It describes the results of the academic paper that
precedes it.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
"""

    truncated = truncate_complete_text(complete_text=paper_text, logger_name="fake_logger")

    # Both section openers must appear in the truncated output.
    for expected_phrase in ("this is an abstract", "this is a conclusion"):
        assert expected_phrase in truncated


@patch.object(logging, "Client")
def test_truncate_complete_text_no_abstract_or_conclusion(mock_logging):
    """Check the fallback path for text with no 'abstract'/'conclusion' heading.

    With neither heading present, truncate_complete_text is expected to log a
    content warning through the named logger and still return the
    (lower-cased) input text.

    Args:
        mock_logging: the patched ``logging.Client`` class injected by
            ``patch.object``.
    """

    # Set up
    # logging.Client is already replaced by a mock inside this test, so it is
    # not used as a ``spec`` here (a mock is not a meaningful spec, and recent
    # unittest.mock versions reject it). The instance the code under test
    # receives is simply the patched class's return value.
    mock_logging_client = mock_logging.return_value

    # Wire the logger as the *return value* of ``client.logger(name)`` so the
    # assertions below observe the actual ``log`` call, not just the lookup.
    mock_logger = MagicMock(spec=logging.Logger)
    mock_logging_client.logger.return_value = mock_logger

    # Act
    extracted_text = truncate_complete_text("This is a bad input", "fake_logger")

    # Assert
    mock_logging_client.logger.assert_called_with("fake_logger")
    mock_logger.log.assert_called()
    assert mock_logger.log.call_args.kwargs["severity"] == "WARNING"
    assert "this is a bad input" in extracted_text

0 comments on commit cb958b5

Please sign in to comment.