Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: handle empty string case in extraction util #54

Merged
merged 4 commits into from
Jul 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions webhook/document_extract_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import backoff
import os
from google.cloud import vision
Expand All @@ -27,6 +28,7 @@


# System / integration test
@pytest.mark.skip(reason="#55 : OCR takes too long and is flaky")
@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def test_async_document_extract_system(capsys):
out = document_extract.async_document_extract(
Expand Down
22 changes: 15 additions & 7 deletions webhook/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,23 @@ def cloud_event_entrypoint(event_id, bucket, name, time_created):


def summarization_entrypoint(
name,
extracted_text,
time_created,
bucket=None,
event_id=None,
):
name: str,
extracted_text: str,
time_created: datetime.time,
bucket: str = None,
event_id: str = None,
) -> str:
logging_client = logging.Client()
logger = logging_client.logger(_FUNCTIONS_VERTEX_EVENT_LOGGER)

if len(extracted_text) == 0:
logger.log(f"""cloud_event_id({event_id}): BAD INPUT
No characters recognized from PDF and so the PDF cannot be
summarized. Be sure to upload a high-quality PDF that contains 'Abstract' and
'Conclusion' sections.
""", severity="ERROR")
return ""

complete_text_filename = f'summaries/{name.replace(".pdf", "")}_fulltext.txt'
upload_to_gcs(
_OUTPUT_BUCKET,
Expand All @@ -149,7 +157,7 @@ def summarization_entrypoint(
severity="INFO",
)

extracted_text_trunc = truncate_complete_text(extracted_text)
extracted_text_trunc = truncate_complete_text(extracted_text, _FUNCTIONS_VERTEX_EVENT_LOGGER)
summary = predict_large_language_model(
project_id=_PROJECT_ID,
model_name=_MODEL_NAME,
Expand Down
2 changes: 1 addition & 1 deletion webhook/services_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def test_up16_services():
assert check_blob_exists(_OUTPUT_BUCKET, complete_text_filename)

# TODO(erschmid): replace truncate with better solution
extracted_text_ = truncate_complete_text(extracted_text)
extracted_text_ = truncate_complete_text(extracted_text, "test_logger")
summary = predict_large_language_model(
project_id=_PROJECT_ID,
model_name=_MODEL_NAME,
Expand Down
36 changes: 33 additions & 3 deletions webhook/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from google.cloud import logging
import datetime
import re

ABSTRACT_LENGTH = 150 * 10 # Abstract recommended max word length * avg 10 letters long
CONCLUSION_LENGTH = 200 * 10 # Conclusion max word legnth * avg 10 letters long
ABSTRACT_LENGTH = 150 * 8 # Abstract recommended max word length * avg 8 letters long
CONCLUSION_LENGTH = 200 * 8 # Conclusion max word length * avg 8 letters long
ABSTRACT_H1 = "abstract"
CONCLUSION_H1 = "conclusion"

CONTENT_ERROR_MESSAGE = """
Uploaded PDF doesn't contain an abstract or conclusion paragraph. The
document summarization pipeline will attempt a best effort at summarizing
the PDF. Your results might vary in quality.

For best results, use a single-column, academic paper that contains both
a labeled 'Abstract' and 'Conclusion' section.
"""


def coerce_datetime_zulu(input_datetime: datetime.datetime):
"""Force datetime into specific format.
Expand All @@ -40,13 +50,15 @@ def coerce_datetime_zulu(input_datetime: datetime.datetime):
)


def truncate_complete_text(complete_text: str) -> str:
def truncate_complete_text(complete_text: str, logger_name: str) -> str:
"""Extracts the abstract and conclusion from an academic paper.

Uses heuristics to approximate the extent of the abstract and conclusion.
For abstract: assumes beginning after the string `abstract` and extends for 6-7 sentences
For conclusion: assumes beginning after the string `conclusion` and extends for 7-9 sentences

#56 : Improve this function

Args:
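complete_text (str): the complete text of the academic paper
logger_name (str): name of the Cloud Logging logger that receives a warning when the abstract or conclusion heading is missing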

Expand All @@ -55,8 +67,20 @@ def truncate_complete_text(complete_text: str) -> str:
"""
complete_text = complete_text.lower()
abstract_start = complete_text.find(ABSTRACT_H1)

# If no "Abstract" heading found, produce the entire text
if abstract_start == -1:
abstract_start = 0
log_content_error(logger_name=logger_name)

conclusion_start = complete_text.find(CONCLUSION_H1)

# If no "Conclusion" heading found, produce the last little bit
# of the text
if conclusion_start == -1:
conclusion_start = len(complete_text) - (CONCLUSION_LENGTH)
log_content_error(logger_name=logger_name)

abstract = complete_text[abstract_start:ABSTRACT_LENGTH]
conclusion = complete_text[conclusion_start:]
if len(conclusion) > CONCLUSION_LENGTH:
Expand All @@ -67,3 +91,9 @@ def truncate_complete_text(complete_text: str) -> str:

Conclusion: {conclusion}
"""


def log_content_error(logger_name: str):
    """Write CONTENT_ERROR_MESSAGE as a WARNING entry to a Cloud Logging logger.

    Args:
        logger_name (str): name of the logger the entry is written to
    """
    content_logger = logging.Client().logger(logger_name)
    content_logger.log(CONTENT_ERROR_MESSAGE, severity="WARNING")
143 changes: 143 additions & 0 deletions webhook/utils_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from utils import truncate_complete_text
from unittest.mock import MagicMock, patch

from google.cloud import logging


def test_truncate_complete_test():
    """Check that text following both section headings survives truncation."""
    # Fixture: a paper with labeled 'Abstract' and 'Conclusion' headings,
    # followed by a long run of filler paragraphs after the conclusion.
    complete_text = """
This is a test paper

Abstract

This is an abstract. An abstract provides an overview of the
academic paper. A good abstract is usually about 150 words long. They can
sometimes be longer. They can sometimes be shorter. An abstract should help the
reader get the gist of the academic paper without having to read the entire
paper.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.

Conclusion

This is a conclusion. It describes the results of the academic paper that
precedes it.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.

Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
culpa qui officia deserunt mollit anim id est laborum.
"""

    extracted_text = truncate_complete_text(complete_text=complete_text, logger_name="fake_logger")

    # Expected fragments are lowercase because truncate_complete_text
    # lowercases the text before searching for the headings.
    assert "this is an abstract" in extracted_text
    assert "this is a conclusion" in extracted_text


@patch.object(logging, "Client")
def test_truncate_complete_text_no_abstract_or_conclusion(mock_logging):
    """Check the fallback path when neither section heading is present.

    With no 'abstract' or 'conclusion' heading in the input, the function is
    expected to log a content warning through the named logger and still
    return the (lowercased) input text.

    Args:
        mock_logging: the mock that replaces `logging.Client` for this test
    """
    # Set up
    # Use the patched class's own child mocks. `logging.Client` is itself a
    # mock inside this test, so it must not be used as a `spec` for new mocks.
    mock_logging_client = mock_logging.return_value
    mock_logger = mock_logging_client.logger.return_value

    # Act
    extracted_text = truncate_complete_text("This is a bad input", "fake_logger")

    # Assert
    # The logger must be looked up by the name passed in, and an entry must
    # actually be written to it (not merely the logger factory invoked).
    mock_logging_client.logger.assert_called_with("fake_logger")
    mock_logger.log.assert_called()
    assert "this is a bad input" in extracted_text