From f56df3b248ff749fcfe1b449f229f282748004af Mon Sep 17 00:00:00 2001 From: ttys0dev <126845556+ttys0dev@users.noreply.github.com> Date: Tue, 6 Jun 2023 23:12:42 -0600 Subject: [PATCH] Avoid reading pdf files into memory --- .github/workflows/lint.yml | 2 +- cl/recap/tasks.py | 16 +++++------- cl/settings/third_party/aws.py | 1 + poetry.lock | 47 +++------------------------------- pyproject.toml | 2 +- 5 files changed, 13 insertions(+), 55 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 34d4dbc7a8..b4c8511f56 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -17,7 +17,7 @@ jobs: - uses: actions/setup-python@v2 with: # Once codebase is updated, this can easily be changed to any specific version. - python-version: "3.10" + python-version: "3.11" - name: Install Poetry uses: snok/install-poetry@v1 diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 4d48d057c5..f882eca460 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -1,7 +1,7 @@ +import hashlib import logging from dataclasses import dataclass from datetime import datetime -from io import BytesIO from typing import List, Optional, Tuple from zipfile import ZipFile @@ -12,7 +12,7 @@ from django.conf import settings from django.contrib.auth.models import User from django.core.exceptions import ValidationError -from django.core.files.base import ContentFile +from django.core.files.base import ContentFile, File from django.core.files.uploadedfile import SimpleUploadedFile from django.db import IntegrityError, transaction from django.utils.timezone import now @@ -53,7 +53,6 @@ ) from cl.corpus_importer.utils import mark_ia_upload_needed from cl.custom_filters.templatetags.text_filters import oxford_join -from cl.lib.crypto import sha1 from cl.lib.filesizes import convert_size_to_bytes from cl.lib.microservice_utils import microservice from cl.lib.pacer import is_pacer_court_accessible, map_cl_to_pacer_id @@ -316,7 +315,8 @@ def process_recap_pdf(self, pk): # Do the file, finally. try: - file_contents = pq.filepath_local.read() + with pq.filepath_local.open("rb") as f: + new_sha1 = hashlib.file_digest(f, "sha1").hexdigest() except IOError as exc: msg = f"Internal processing error ({exc.errno}: {exc.strerror})." if (self.request.retries == self.max_retries) or pq.debug: @@ -326,10 +326,6 @@ def process_recap_pdf(self, pk): mark_pq_status(pq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) raise self.retry(exc=exc) - if not file_contents: - return None - - new_sha1 = sha1(file_contents) existing_document = all( [ rd.sha1 == new_sha1, @@ -340,7 +336,6 @@ def process_recap_pdf(self, pk): if not existing_document: # Different sha1, it wasn't available, or it's missing from disk. Move # the new file over from the processing queue storage. - cf = ContentFile(file_contents) file_name = get_document_filename( rd.docket_entry.docket.court_id, rd.docket_entry.docket.pacer_case_id, @@ -348,7 +343,8 @@ def process_recap_pdf(self, pk): rd.attachment_number, ) if not pq.debug: - rd.filepath_local.save(file_name, cf, save=False) + with pq.filepath_local.open("rb") as f: + rd.filepath_local.save(file_name, File(f), save=False) # Do page count and extraction response = microservice( diff --git a/cl/settings/third_party/aws.py b/cl/settings/third_party/aws.py index 5b943f8ce2..f5350367a8 100644 --- a/cl/settings/third_party/aws.py +++ b/cl/settings/third_party/aws.py @@ -18,6 +18,7 @@ AWS_S3_CUSTOM_DOMAIN = "storage.courtlistener.com" AWS_DEFAULT_ACL = "public-read" AWS_QUERYSTRING_AUTH = False +AWS_S3_MAX_MEMORY_SIZE = 16 * 1024 * 1024 if DEVELOPMENT: AWS_STORAGE_BUCKET_NAME = "dev-com-courtlistener-storage" diff --git a/poetry.lock b/poetry.lock index d923189ec5..7f91780e6a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -94,11 +94,7 @@ files = [ [package.dependencies] lazy-object-proxy = ">=1.4.0" -typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} -wrapt = [ - {version = ">=1.11,<2", markers = "python_version < \"3.11\""}, - {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, -] +wrapt = {version = ">=1.14,<2", markers = "python_version >= \"3.11\""} [[package]] name = "asttokens" @@ -271,7 +267,6 @@ mypy-extensions = ">=0.4.3" packaging = ">=22.0" pathspec = ">=0.9.0" platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} [package.extras] colorama = ["colorama (>=0.4.3)"] @@ -1169,7 +1164,6 @@ files = [ django = "*" django-stubs-ext = ">=4.2.0" mypy = ">=0.980" -tomli = {version = "*", markers = "python_version < \"3.11\""} types-pytz = "*" types-PyYAML = "*" typing-extensions = "*" @@ -1337,20 +1331,6 @@ six = "*" [package.extras] develop = ["coverage (<5.0.0)", "mock", "pytest (>=3.0.0)", "pytest-cov", "pytest-mock (<3.0.0)", "pytz", "sphinx", "sphinx-rtd-theme"] -[[package]] -name = "exceptiongroup" -version = "1.1.1" -description = "Backport of PEP 654 (exception groups)" -optional = false -python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, - {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, -] - -[package.extras] -test = ["pytest (>=6)"] - [[package]] name = "executing" version = "1.1.0" @@ -1560,7 +1540,6 @@ files = [ [package.dependencies] astor = "*" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} [package.extras] dev = ["build", "pre-commit", "pytest", "pytest-cov", "twine"] @@ -2375,7 +2354,6 @@ files = [ [package.dependencies] mypy-extensions = ">=1.0.0" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} typing-extensions = ">=3.10" [package.extras] @@ -2940,14 +2918,10 @@ files = [ [package.dependencies] astroid = ">=2.15.4,<=2.17.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} -dill = [ - {version = ">=0.2", markers = "python_version < \"3.11\""}, - {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, -] +dill = {version = ">=0.3.6", markers = "python_version >= \"3.11\""} isort = ">=4.2.5,<6" mccabe = ">=0.6,<0.8" platformdirs = ">=2.2.0" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} tomlkit = ">=0.10.1" [package.extras] @@ -3019,11 +2993,9 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} iniconfig = "*" packaging = "*" pluggy = ">=0.12,<2.0" -tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] @@ -3856,17 +3828,6 @@ idna = "*" requests = ">=2.1.0" requests-file = ">=1.4" -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, -] - [[package]] name = "tomlkit" version = "0.11.8" @@ -4614,5 +4575,5 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" -python-versions = ">=3.10, <3.12" -content-hash = "d435d4c7dce4af1c659d4dbe2d712e2091a1514a80439b4e4009404f69bef595" +python-versions = ">=3.11, <3.12" +content-hash = "674af32861e1e5bf9c31401f02a3af0b698be8b60b9492cd89ab5464218efd3e" diff --git a/pyproject.toml b/pyproject.toml index 5ec87e6e65..91020cf1e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ psycopg2 = "^2.9.5" pycparser = "^2.21" pyopenssl = "*" pyparsing = "^2.4.2" -python = ">=3.10, <3.12" +python = ">=3.11, <3.12" python-dateutil = "^2.8.1" python-magic = "^0.4.21" pytz = "*"