Skip to content

Commit

Permalink
Merge pull request #579 from freelawproject/577-stop-sending-full-doc…
Browse files Browse the repository at this point in the history
…ument-to-redis

feat(subscription): Optimize document handling in subscription tasks
  • Loading branch information
mlissner authored Aug 22, 2024
2 parents 4a285ba + f3562b2 commit 387153f
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 31 deletions.
46 changes: 32 additions & 14 deletions bc/subscription/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

def enqueue_posts_for_new_case(
subscription: Subscription,
document: bytes | None = None,
document: bytes | str | None = None,
check_sponsor_message: bool = False,
initial_document: DocumentDict | None = None,
) -> None:
Expand All @@ -49,13 +49,21 @@ def enqueue_posts_for_new_case(
Args:
subscription (Subscription): the new subscription object.
document (bytes | None, optional): document content(if available) as bytes.
check_sponsor_message (bool, optional): designates whether this method should check
the sponsorships field and compute the sponsor_message for each channel. Defaults to False.
document (bytes | str | None): The document, either as raw bytes or as
a URL path to download the file.
check_sponsor_message (bool, optional): designates whether this method
should check the sponsorships field and compute the sponsor_message
for each channel. Defaults to False.
"""

files = None
if document:
# TODO: Refactor to remove isinstance check in the if statements
# This code was added to handle potential data already in the queue
# during deployment.
if isinstance(document, str):
document = download_pdf_from_cl(document)

if document is not None and isinstance(document, bytes):
files = get_thumbnails_from_range(document, "[1,2,3,4]")

docket: DocketDict | None = None
Expand Down Expand Up @@ -131,7 +139,7 @@ def enqueue_posts_for_new_case(

def enqueue_posts_for_docket_alert(
webhook_event: FilingWebhookEvent,
document: bytes | None = None,
document: bytes | str | None = None,
check_sponsor_message: bool = False,
) -> None:
"""
Expand All @@ -140,9 +148,12 @@ def enqueue_posts_for_docket_alert(
Args:
webhook_event (FilingWebhookEvent): The FilingWebhookEvent record.
document (bytes | None, optional): document content(if available) as bytes.
check_sponsor_message (bool, optional): designates whether this method should check
the sponsorships field and compute the sponsor_message for each channel. Defaults to False.
document (bytes | None, optional): document as bytes or the URL to
download the file.
check_sponsor_message (bool, optional): designates whether this method
should check.
the sponsorships field and compute the sponsor_message for each channel.
Defaults to False.
"""
if not webhook_event.subscription:
return
Expand Down Expand Up @@ -236,7 +247,7 @@ def check_webhook_before_posting(fwe_pk: int):
document = None
cl_document = lookup_document_by_doc_id(filing_webhook_event.doc_id)
if cl_document["filepath_local"]:
document = download_pdf_from_cl(cl_document["filepath_local"])
document = cl_document["filepath_local"]
else:
sponsorship = check_active_sponsorships(
filing_webhook_event.subscription.pk
Expand Down Expand Up @@ -341,7 +352,7 @@ def process_fetch_webhook_event(
"The RECAP document doesn't have a path to download the file"
)

pdf_data = download_pdf_from_cl(cl_document["filepath_local"])
pdf_data = cl_document["filepath_local"]

sponsor_groups = get_sponsored_groups_per_subscription(subscription.pk)

Expand Down Expand Up @@ -371,7 +382,7 @@ def process_fetch_webhook_event(
def make_post_for_webhook_event(
channel_pk: int,
fwe_pk: int,
document: bytes | None,
document: bytes | str | None,
sponsor_text: str | None = None,
) -> Post:
"""Post a new status in the given channel using the data of the given webhook
Expand All @@ -380,7 +391,8 @@ def make_post_for_webhook_event(
Args:
channel_pk (int): The pk of the channel where the post will be created.
fwe_pk (int): The PK of the FilingWebhookEvent record.
document (bytes | None): document content(if available) as bytes.
document (bytes | str | None): The document, either as raw bytes or as
a URL path to download the file.
sponsor_text (str | None): sponsor message to include in the thumbnails.
Returns:
Expand Down Expand Up @@ -412,7 +424,13 @@ def make_post_for_webhook_event(
)

files = None
if document:
# TODO: Refactor to remove isinstance check in the if statements
# This code was added to handle potential data already in the queue
# during deployment.
if isinstance(document, str):
document = download_pdf_from_cl(document)

if document is not None and isinstance(document, bytes):
thumbnail_range = "[1,2,3]" if image else "[1,2,3,4]"
files = get_thumbnails_from_range(document, thumbnail_range)

Expand Down
67 changes: 50 additions & 17 deletions bc/subscription/tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,19 +88,15 @@ def test_ignores_webhook_for_junk_entries(self, mock_lookup):
mock_lookup.assert_not_called()

@patch("bc.subscription.tasks.enqueue_posts_for_docket_alert")
@patch("bc.subscription.tasks.download_pdf_from_cl")
def test_can_create_post_for_webhook_w_document(
self, mock_download, mock_enqueue, mock_lookup
self, mock_enqueue, mock_lookup
):
filepath = "recap/gov.uscourts.mied.365816/gov.uscourts.mied.365816.1.0_12.pdf"
mock_lookup.return_value = {"filepath_local": filepath}
mock_download.return_value = b"\x68\x65\x6c\x6c\x6f"

check_webhook_before_posting(self.webhook_event.id)

mock_lookup.assert_called_with(self.webhook_event.doc_id)
mock_download.assert_called_with(filepath)
mock_enqueue.assert_called_with(self.webhook_event, mock_download())
mock_enqueue.assert_called_with(self.webhook_event, filepath)

@patch("bc.subscription.tasks.enqueue_posts_for_docket_alert")
@patch("bc.subscription.tasks.download_pdf_from_cl")
Expand Down Expand Up @@ -160,29 +156,22 @@ def test_raise_exception_for_webhook_no_subscription(self):
process_fetch_webhook_event(self.webhook_no_subscription.id)

@patch("bc.subscription.tasks.enqueue_posts_for_docket_alert")
@patch("bc.subscription.tasks.download_pdf_from_cl")
@patch("bc.subscription.tasks.lookup_document_by_doc_id")
def test_can_create_post_after_purchase(
self, mock_lookup, mock_download, mock_enqueue
):
def test_can_create_post_after_purchase(self, mock_lookup, mock_enqueue):
filepath = (
"recap/gov.uscourts.dcd.178502/gov.uscourts.dcd.178502.2.0_18.pdf"
)
mock_lookup.return_value = {
"filepath_local": filepath,
"page_count": 1,
}
mock_download.return_value = b"\x68\x65\x6c\x6c\x6f"

process_fetch_webhook_event(self.webhook_event.id)
webhook = FilingWebhookEvent.objects.get(id=self.webhook_event.id)

self.assertEqual(webhook.status, FilingWebhookEvent.SUCCESSFUL)
mock_lookup.assert_called_with(self.webhook_event.doc_id)
mock_download.assert_called_with(filepath)
mock_enqueue.assert_called_with(
self.webhook_event, mock_download(), True
)
mock_enqueue.assert_called_with(self.webhook_event, filepath, True)


@patch("bc.subscription.tasks.add_sponsored_text_to_thumbnails")
Expand All @@ -193,6 +182,7 @@ class MakePostForWebhookEventTest(TestCase):
webhook_minute_entry = None
channel = None
bin_object = None
fake_document_path = None

@classmethod
def setUpTestData(cls) -> None:
Expand All @@ -208,6 +198,7 @@ def setUpTestData(cls) -> None:
)
cls.channel = ChannelFactory(mastodon=True)
cls.bin_object = b"\x68\x65\x6c\x6c\x6f"
cls.fake_document_path = faker.url()

def setUp(self) -> None:
self.status_id = str(
Expand Down Expand Up @@ -251,22 +242,37 @@ def test_can_create_minute_entry_no_document(
mock_add_sponsor_text.assert_not_called()
mock_thumbnails.assert_not_called()

@patch("bc.subscription.tasks.download_pdf_from_cl")
def test_get_three_thumbnails_from_documents(
self, mock_api, mock_thumbnails, mock_add_sponsor_text
self, mock_download, mock_api, mock_thumbnails, mock_add_sponsor_text
):
mock_api.return_value = self.mock_api_wrapper(self.status_id)

# Remove the short description to make sure the template will use the long one
self.webhook_event.short_description = ""
self.webhook_event.save(update_fields=["short_description"])

# This test case handles scenarios where the document is provided
# directly as a binary object.
make_post_for_webhook_event(
self.channel.pk, self.webhook_event.pk, self.bin_object
)

mock_thumbnails.assert_called_with(self.bin_object, "[1,2,3]")
mock_add_sponsor_text.assert_not_called()

# This test case verifies that `make_post_for_webhook_event` can handle
# document URLs as input, in contrast to the previous test that used
# URLs.
mock_download.return_value = self.bin_object
make_post_for_webhook_event(
self.channel.pk, self.webhook_event.pk, self.fake_document_path
)

mock_download.assert_called_once_with(self.fake_document_path)
mock_thumbnails.assert_called_with(self.bin_object, "[1,2,3]")
mock_add_sponsor_text.assert_not_called()

def test_get_four_thumbnails_from_document(
self, mock_api, mock_thumbnails, mock_add_sponsor_text
):
Expand Down Expand Up @@ -464,8 +470,10 @@ def test_can_post_new_case_w_link(
)

@patch("bc.subscription.tasks.get_thumbnails_from_range")
@patch("bc.subscription.tasks.download_pdf_from_cl")
def test_can_post_new_case_w_thumbnails(
self,
mock_download_pdf,
mock_thumbnails,
mock_api,
mock_queue,
Expand All @@ -484,16 +492,23 @@ def test_can_post_new_case_w_thumbnails(
thumb_1 = faker.binary(4)
thumb_2 = faker.binary(6)
mock_thumbnails.return_value = [thumb_1, thumb_2]

mock_download_pdf.return_value = document
message, _ = TWITTER_FOLLOW_A_NEW_CASE.format(
docket=self.subscription_w_link.name_with_summary,
docket_link=self.subscription_w_link.cl_url,
docket_id=self.subscription_w_link.cl_docket_id,
article_url=self.subscription_w_link.article_url,
)

# This test case ensures backward compatibility with legacy tasks using
# the old bytes document format, which is scheduled for deprecation. It
# verifies that the system can still process tasks enqueued before the
# format change.
enqueue_posts_for_new_case(self.subscription_w_link, document)

# Since we passed the PDF as a bytes object, we don't expect the
# download function to be called.
mock_download_pdf.assert_not_called()
mock_thumbnails.assert_called_once_with(document, "[1,2,3,4]")
mock_queue.enqueue.assert_called_once_with(
api_wrapper.add_status,
Expand All @@ -503,6 +518,24 @@ def test_can_post_new_case_w_thumbnails(
retry=mock_retry(),
)

# This test case verifies the handling of document URLs.
# We provide a fake URL to the enqueue function and expect:
# - The download PDF function to be called with the URL.
# - The thumbnails function to be called with the downloaded PDF content and the specified range.
# - The queue's enqueue function to be called with the expected arguments for adding a status.
fake_path = faker.url()
enqueue_posts_for_new_case(self.subscription_w_link, fake_path)

mock_download_pdf.assert_called_once_with(fake_path)
mock_thumbnails.assert_called_with(document, "[1,2,3,4]")
mock_queue.enqueue.assert_called_with(
api_wrapper.add_status,
message,
None,
[thumb_1, thumb_2],
retry=mock_retry(),
)

@patch("bc.subscription.tasks.add_sponsored_text_to_thumbnails")
@patch("bc.subscription.tasks.get_thumbnails_from_range")
def test_can_create_post_w_sponsored_thumbnails(
Expand Down

0 comments on commit 387153f

Please sign in to comment.