Skip to content

Commit

Permalink
update
Browse files: browse the repository at this point in the history
  • Loading branch information
ccurme committed Mar 22, 2024
1 parent 72b25cb commit 310e734
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 18 deletions.
1 change: 0 additions & 1 deletion backend/extraction/parsing.py
Original file line number Diff line number Diff line change
@@ -26,7 +26,6 @@
SUPPORTED_MIMETYPES = sorted(HANDLERS.keys())

MAX_FILE_SIZE_MB = 10 # in MB
MAX_CHUNK_COUNT = 50


def _guess_mimetype(file_bytes: bytes) -> str:
Expand Down
6 changes: 0 additions & 6 deletions backend/server/extraction_runnable.py
Original file line number Diff line number Diff line change
@@ -14,7 +14,6 @@
from typing_extensions import TypedDict

from db.models import Example, Extractor
from extraction.parsing import MAX_CHUNK_COUNT
from extraction.utils import update_json_schema
from server import settings
from server.models import DEFAULT_MODEL, get_chunk_size, get_model
@@ -193,11 +192,6 @@ async def extract_entire_document(
model_name=DEFAULT_MODEL,
)
texts = text_splitter.split_text(content)
if len(texts) > MAX_CHUNK_COUNT:
raise HTTPException(
status_code=413,
detail=f"Text exceeds the maximum limit of {MAX_CHUNK_COUNT} chunks.",
)
extraction_requests = [
ExtractRequest(
text=text,
Expand Down
26 changes: 15 additions & 11 deletions backend/tests/unit_tests/api/test_api_extract.py
Original file line number Diff line number Diff line change
@@ -177,14 +177,18 @@ async def test_extract_from_large_file() -> None:
f.write("This is a named temporary file.")
f.seek(0)
f.flush()
with patch("server.extraction_runnable.MAX_CHUNK_COUNT", 0):
response = await client.post(
"/extract",
data={
"extractor_id": extractor_id,
"mode": "entire_document",
},
files={"file": f},
headers=headers,
)
assert response.status_code == 413
with patch("server.extraction_runnable.settings.MAX_CHUNKS", 1):
with patch.object(
CharacterTextSplitter, "split_text", return_value=["a", "b"]
):
response = await client.post(
"/extract",
data={
"extractor_id": extractor_id,
"mode": "entire_document",
},
files={"file": f},
headers=headers,
)
assert response.status_code == 200
assert response.json() == {"data": ["a"]}

0 comments on commit 310e734

Please sign in to comment.