diff --git a/application/api/user/routes.py b/application/api/user/routes.py
index 794c69d4a..84a678639 100644
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -343,6 +343,7 @@ def post(self):
                     ".mdx",
                     ".json",
                     ".xlsx",
+                    ".pptx",
                 ],
                 job_name,
                 final_filename,
diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py
index bb63aa61b..3b8fbca86 100644
--- a/application/parser/file/bulk.py
+++ b/application/parser/file/bulk.py
@@ -12,6 +12,7 @@
 from application.parser.file.rst_parser import RstParser
 from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
 from application.parser.file.json_parser import JSONParser
+from application.parser.file.pptx_parser import PPTXParser
 from application.parser.schema.base import Document
 
 DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
@@ -25,6 +26,7 @@
     ".html": HTMLParser(),
     ".mdx": MarkdownParser(),
     ".json":JSONParser(),
+    ".pptx":PPTXParser(),
 }
 
 
diff --git a/application/parser/file/pptx_parser.py b/application/parser/file/pptx_parser.py
new file mode 100644
index 000000000..00cb3698f
--- /dev/null
+++ b/application/parser/file/pptx_parser.py
@@ -0,0 +1,76 @@
+"""PPTX parser.
+
+Contains a parser for presentation (.pptx) files that extracts slide text.
+"""
+from pathlib import Path
+from typing import Any, Dict, List, Union
+
+from application.parser.file.base_parser import BaseParser
+
+
+class PPTXParser(BaseParser):
+    r"""PPTX (.pptx) parser for extracting text from PowerPoint slides.
+
+    Args:
+        concat_slides (bool): Specifies whether to concatenate all slide text into one document.
+            - If True, slide texts will be joined together as a single string.
+            - If False, each slide's text will be stored as a separate entry in a list.
+            Set to True by default.
+        slide_separator (str): Separator used to join slides' text content.
+            Only used when `concat_slides=True`. Default is "\n".
+
+    Refer to https://python-pptx.readthedocs.io/en/latest/ for more information.
+    """
+
+    def __init__(
+        self,
+        *args: Any,
+        concat_slides: bool = True,
+        slide_separator: str = "\n",
+        **kwargs: Any
+    ) -> None:
+        """Init params."""
+        super().__init__(*args, **kwargs)
+        self._concat_slides = concat_slides
+        self._slide_separator = slide_separator
+
+    def _init_parser(self) -> Dict:
+        """Init parser."""
+        return {}
+
+    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
+        """Parse a .pptx file and extract the text of each slide.
+
+        Args:
+            file (Path): Path to the .pptx file.
+            errors (str): Error handling policy ('ignore' by default).
+                Currently unused by this parser.
+
+        Returns:
+            Union[str, List[str]]: Concatenated text if concat_slides is True,
+                otherwise a list of slide texts.
+
+        Raises:
+            ImportError: If the pptx module cannot be imported.
+        """
+        try:
+            from pptx import Presentation
+        except ImportError as err:
+            raise ImportError("pptx module is required to read .PPTX files.") from err
+
+        # Errors raised while opening or reading the file propagate unchanged.
+        presentation = Presentation(file)
+        slide_texts = []
+
+        for slide in presentation.slides:
+            # Only shapes exposing a `text` attribute are read. Their texts are
+            # joined with a newline so that the words of adjacent shapes do not
+            # run together, and shapes with empty text are skipped.
+            shape_texts = [
+                shape.text for shape in slide.shapes if hasattr(shape, "text")
+            ]
+            slide_texts.append("\n".join(t for t in shape_texts if t).strip())
+
+        if self._concat_slides:
+            return self._slide_separator.join(slide_texts)
+        return slide_texts
diff --git a/application/requirements.txt b/application/requirements.txt
index aad629f14..5325a8499 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -14,6 +14,7 @@ esutils==1.0.1
 Flask==3.0.3
 faiss-cpu==1.8.0.post1
 flask-restx==1.3.0
+gTTS==2.3.2
 gunicorn==23.0.0
 html2text==2024.2.26
 javalang==0.13.0
@@ -65,6 +66,7 @@ pymongo==4.8.0
 pypdf2==3.0.1
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
+python-pptx==1.0.2
 qdrant-client==1.11.0
 redis==5.0.1
 referencing==0.30.2
@@ -84,5 +86,4 @@ urllib3==2.2.3
 vine==5.1.0
 wcwidth==0.2.13
 werkzeug==3.0.4
-yarl==1.11.1
-gTTS==2.3.2
\ No newline at end of file
+yarl==1.11.1
\ No newline at end of file
diff --git a/frontend/src/locale/en.json b/frontend/src/locale/en.json
index a1f254ac9..52d3b50e5 100644
--- a/frontend/src/locale/en.json
+++ b/frontend/src/locale/en.json
@@ -86,7 +86,7 @@
       "start": "Start Chatting",
       "name": "Name",
       "choose": "Choose Files",
-      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limited to 25mb",
+      "info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip limited to 25mb",
       "uploadedFiles": "Uploaded Files",
       "cancel": "Cancel",
       "train": "Train",
diff --git a/frontend/src/locale/es.json b/frontend/src/locale/es.json
index 6a096ffd9..44fafed70 100644
--- a/frontend/src/locale/es.json
+++ b/frontend/src/locale/es.json
@@ -86,7 +86,7 @@
       "start": "Empezar a chatear",
       "name": "Nombre",
       "choose": "Seleccionar Archivos",
-      "info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limitados a 25 MB",
+      "info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip limitados a 25 MB",
       "uploadedFiles": "Archivos Subidos",
       "cancel": "Cancelar",
       "train": "Entrenar",
diff --git a/frontend/src/locale/jp.json b/frontend/src/locale/jp.json
index 841a477b8..a69ae31f3 100644
--- a/frontend/src/locale/jp.json
+++ b/frontend/src/locale/jp.json
@@ -86,7 +86,7 @@
       "start": "チャットを開始する",
       "name": "名前",
       "choose": "ファイルを選択",
-      "info": ".pdf, .txt, .rst, .docx, .md, .json, .zipファイルを25MBまでアップロードしてください",
+      "info": ".pdf, .txt, .rst, .docx, .md, .json, .pptx, .zipファイルを25MBまでアップロードしてください",
       "uploadedFiles": "アップロードされたファイル",
       "cancel": "キャンセル",
       "train": "トレーニング",
diff --git a/frontend/src/locale/zh-TW.json b/frontend/src/locale/zh-TW.json
index 35df818b0..fa0638f44 100644
--- a/frontend/src/locale/zh-TW.json
+++ b/frontend/src/locale/zh-TW.json
@@ -80,7 +80,7 @@
       "remote": "遠端",
       "name": "名稱",
       "choose": "選擇檔案",
-      "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .zip 檔案,大小限制為 25MB",
+      "info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .pptx, .zip 檔案,大小限制為 25MB",
       "uploadedFiles": "已上傳的檔案",
       "cancel": "取消",
       "train": "訓練",
diff --git a/frontend/src/locale/zh.json b/frontend/src/locale/zh.json
index 710c5e3e2..51f8bfe90 100644
--- a/frontend/src/locale/zh.json
+++ b/frontend/src/locale/zh.json
@@ -86,7 +86,7 @@
       "start": "开始聊天",
       "name": "名称",
       "choose": "选择文件",
-      "info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip 文件,限 25MB",
+      "info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip 文件,限 25MB",
       "uploadedFiles": "已上传文件",
       "cancel": "取消",
       "train": "训练",
diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx
index 2da284c3c..81ce9f2b0 100644
--- a/frontend/src/upload/Upload.tsx
+++ b/frontend/src/upload/Upload.tsx
@@ -321,6 +321,8 @@ function Upload({
       'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': [
         '.xlsx',
       ],
+      'application/vnd.openxmlformats-officedocument.presentationml.presentation':
+        ['.pptx'],
     },
   });
 