Skip to content

Commit

Permalink
Merge pull request #2805 from ttys0dev/optimize-file-memory
Browse files — browse the repository at this point in the history
Avoid reading pdf files into memory
  • Loading branch information
mlissner authored Jun 7, 2023
2 parents 40bdf43 + f56df3b commit c179897
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
- uses: actions/setup-python@v2
with:
# Once codebase is updated, this can easily be changed to any specific version.
-        python-version: "3.10"
+        python-version: "3.11"

- name: Install Poetry
uses: snok/install-poetry@v1
Expand Down
16 changes: 6 additions & 10 deletions cl/recap/tasks.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import hashlib
import logging
from dataclasses import dataclass
from datetime import datetime
from io import BytesIO
from typing import List, Optional, Tuple
from zipfile import ZipFile

Expand All @@ -12,7 +12,7 @@
from django.conf import settings
from django.contrib.auth.models import User
from django.core.exceptions import ValidationError
-from django.core.files.base import ContentFile
+from django.core.files.base import ContentFile, File
from django.core.files.uploadedfile import SimpleUploadedFile
from django.db import IntegrityError, transaction
from django.utils.timezone import now
Expand Down Expand Up @@ -53,7 +53,6 @@
)
from cl.corpus_importer.utils import mark_ia_upload_needed
from cl.custom_filters.templatetags.text_filters import oxford_join
-from cl.lib.crypto import sha1
from cl.lib.filesizes import convert_size_to_bytes
from cl.lib.microservice_utils import microservice
from cl.lib.pacer import is_pacer_court_accessible, map_cl_to_pacer_id
Expand Down Expand Up @@ -316,7 +315,8 @@ def process_recap_pdf(self, pk):

# Do the file, finally.
try:
-        file_contents = pq.filepath_local.read()
+        with pq.filepath_local.open("rb") as f:
+            new_sha1 = hashlib.file_digest(f, "sha1").hexdigest()
except IOError as exc:
msg = f"Internal processing error ({exc.errno}: {exc.strerror})."
if (self.request.retries == self.max_retries) or pq.debug:
Expand All @@ -326,10 +326,6 @@ def process_recap_pdf(self, pk):
mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY)
raise self.retry(exc=exc)

-    if not file_contents:
-        return None
-
-    new_sha1 = sha1(file_contents)
existing_document = all(
[
rd.sha1 == new_sha1,
Expand All @@ -340,15 +336,15 @@ def process_recap_pdf(self, pk):
if not existing_document:
# Different sha1, it wasn't available, or it's missing from disk. Move
# the new file over from the processing queue storage.
-        cf = ContentFile(file_contents)
file_name = get_document_filename(
rd.docket_entry.docket.court_id,
rd.docket_entry.docket.pacer_case_id,
rd.document_number,
rd.attachment_number,
)
if not pq.debug:
-            rd.filepath_local.save(file_name, cf, save=False)
+            with pq.filepath_local.open("rb") as f:
+                rd.filepath_local.save(file_name, File(f), save=False)

# Do page count and extraction
response = microservice(
Expand Down
1 change: 1 addition & 0 deletions cl/settings/third_party/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
AWS_S3_CUSTOM_DOMAIN = "storage.courtlistener.com"
AWS_DEFAULT_ACL = "public-read"
AWS_QUERYSTRING_AUTH = False
AWS_S3_MAX_MEMORY_SIZE = 16 * 1024 * 1024

if DEVELOPMENT:
AWS_STORAGE_BUCKET_NAME = "dev-com-courtlistener-storage"
Expand Down
47 changes: 4 additions & 43 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ psycopg2 = "^2.9.5"
pycparser = "^2.21"
pyopenssl = "*"
pyparsing = "^2.4.2"
python = ">=3.10, <3.12"
python = ">=3.11, <3.12"
python-dateutil = "^2.8.1"
python-magic = "^0.4.21"
pytz = "*"
Expand Down

0 comments on commit c179897

Please sign in to comment.