Skip to content

Commit

Permalink
Fix file utils imports
Browse files Browse the repository at this point in the history
  • Loading branch information
evamaxfield authored Jun 16, 2023
1 parent 69ff775 commit d6d71d1
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions cdp_backend/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,8 @@

import fireo
import fsspec
import pypdf
import requests
from fsspec.core import url_to_fs
from tika import parser

from ..database import models as db_models

Expand Down Expand Up @@ -854,6 +852,8 @@ def parse_doc_file(document_raw: bytes) -> str:
str:
A str of all text in the .doc file.
"""
from tika import parser

parsed_content = parser.from_buffer(document_raw)["content"]
return remove_duplicate_space(parsed_content)

Expand All @@ -872,6 +872,8 @@ def parse_pdf_file(document_raw: bytes) -> str:
str:
A str of all text in the .pdf file.
"""
import pypdf

pdf_reader = pypdf.PdfReader(io.BytesIO(document_raw))
text = ""

Expand All @@ -898,6 +900,8 @@ def parse_pptx_file(document_raw: bytes) -> str:
str:
A str of all text in the .pptx file.
"""
from tika import parser

parsed_pptx = parser.from_buffer(document_raw)["content"]
return remove_duplicate_space(parsed_pptx)

Expand Down

0 comments on commit d6d71d1

Please sign in to comment.