Skip to content

Commit

Permalink
feat: Presentation parser implementation
Browse files Browse the repository at this point in the history
Signed-off-by: JeevaRamanathan M <[email protected]>
  • Loading branch information
JeevaRamanathan committed Oct 31, 2024
1 parent 45e14bc commit 5c75634
Show file tree
Hide file tree
Showing 10 changed files with 88 additions and 7 deletions.
1 change: 1 addition & 0 deletions application/api/user/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,7 @@ def post(self):
".mdx",
".json",
".xlsx",
".pptx",
],
job_name,
final_filename,
Expand Down
2 changes: 2 additions & 0 deletions application/parser/file/bulk.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from application.parser.file.rst_parser import RstParser
from application.parser.file.tabular_parser import PandasCSVParser,ExcelParser
from application.parser.file.json_parser import JSONParser
from application.parser.file.pptx_parser import PPTXParser
from application.parser.schema.base import Document

DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
Expand All @@ -25,6 +26,7 @@
".html": HTMLParser(),
".mdx": MarkdownParser(),
".json":JSONParser(),
".pptx":PPTXParser(),
}


Expand Down
75 changes: 75 additions & 0 deletions application/parser/file/pptx_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""PPTX parser.
Contains a parser for presentation (.pptx) files that extracts slide text.
"""
from pathlib import Path
from typing import Any, Dict, List, Union

from application.parser.file.base_parser import BaseParser

class PPTXParser(BaseParser):
r"""PPTX (.pptx) parser for extracting text from PowerPoint slides.
Args:
concat_slides (bool): Specifies whether to concatenate all slide text into one document.
- If True, slide texts will be joined together as a single string.
- If False, each slide's text will be stored as a separate entry in a list.
Set to True by default.
slide_separator (str): Separator used to join slides' text content.
Only used when `concat_slides=True`. Default is "\n".
Refer to https://python-pptx.readthedocs.io/en/latest/ for more information.
"""

def __init__(
    self,
    *args: Any,
    concat_slides: bool = True,
    slide_separator: str = "\n",
    **kwargs: Any
) -> None:
    """Initialise the base parser and record the slide-joining options.

    Args:
        *args: Positional arguments passed through to the parent initialiser.
        concat_slides (bool): Stored as ``self._concat_slides``.
        slide_separator (str): Stored as ``self._slide_separator``.
        **kwargs: Keyword arguments passed through to the parent initialiser.
    """
    # The parent initialiser runs first; the options are recorded afterwards.
    super().__init__(*args, **kwargs)
    self._concat_slides, self._slide_separator = concat_slides, slide_separator

def _init_parser(self) -> Dict:
"""Init parser."""
return {}

Check warning on line 35 in application/parser/file/pptx_parser.py

View check run for this annotation

Codecov / codecov/patch

application/parser/file/pptx_parser.py#L35

Added line #L35 was not covered by tests

def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
    """Parse a .pptx file and extract the text of each slide.

    Every shape on a slide that exposes a ``text`` attribute contributes its
    text; shapes without one are skipped. The texts of one slide are joined
    with a newline so that adjacent shapes (for example a title and its body)
    do not run together into a single word.

    Args:
        file (Path): Path to the .pptx file.
        errors (str): Error handling policy ('ignore' by default). Accepted
            for interface compatibility; this method does not read it.

    Returns:
        Union[str, List[str]]: When ``self._concat_slides`` is true, a single
        string with the slide texts joined by ``self._slide_separator``;
        otherwise a list with one (possibly empty) string per slide.

    Raises:
        ImportError: If the ``pptx`` module cannot be imported.
    """
    try:
        # Imported lazily so the dependency is only needed when a .pptx
        # file is actually parsed.
        from pptx import Presentation
    except ImportError as exc:
        raise ImportError("pptx module is required to read .PPTX files.") from exc

    presentation = Presentation(file)
    slide_texts = []

    for slide in presentation.slides:
        # Keep only shapes that carry non-blank text.
        shape_texts = [
            shape.text
            for shape in slide.shapes
            if hasattr(shape, "text") and shape.text.strip()
        ]
        # One entry per slide, even when empty, so list positions keep
        # matching slide order.
        slide_texts.append("\n".join(shape_texts).strip())

    if self._concat_slides:
        return self._slide_separator.join(slide_texts)
    return slide_texts

Check warning on line 75 in application/parser/file/pptx_parser.py

View check run for this annotation

Codecov / codecov/patch

application/parser/file/pptx_parser.py#L74-L75

Added lines #L74 - L75 were not covered by tests
5 changes: 3 additions & 2 deletions application/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ esutils==1.0.1
Flask==3.0.3
faiss-cpu==1.8.0.post1
flask-restx==1.3.0
gTTS==2.3.2
gunicorn==23.0.0
html2text==2024.2.26
javalang==0.13.0
Expand Down Expand Up @@ -65,6 +66,7 @@ pymongo==4.8.0
pypdf2==3.0.1
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-pptx==0.4.1
qdrant-client==1.11.0
redis==5.0.1
referencing==0.30.2
Expand All @@ -84,5 +86,4 @@ urllib3==2.2.3
vine==5.1.0
wcwidth==0.2.13
werkzeug==3.0.4
yarl==1.11.1
gTTS==2.3.2
yarl==1.11.1
2 changes: 1 addition & 1 deletion frontend/src/locale/en.json
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
"start": "Start Chatting",
"name": "Name",
"choose": "Choose Files",
"info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limited to 25mb",
"info": "Please upload .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip limited to 25mb",
"uploadedFiles": "Uploaded Files",
"cancel": "Cancel",
"train": "Train",
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/locale/es.json
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
"start": "Empezar a chatear",
"name": "Nombre",
"choose": "Seleccionar Archivos",
"info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip limitados a 25 MB",
"info": "Por favor, suba archivos .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip limitados a 25 MB",
"uploadedFiles": "Archivos Subidos",
"cancel": "Cancelar",
"train": "Entrenar",
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/locale/jp.json
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
"start": "チャットを開始する",
"name": "名前",
"choose": "ファイルを選択",
"info": ".pdf, .txt, .rst, .docx, .md, .json, .zipファイルを25MBまでアップロードしてください",
"info": ".pdf, .txt, .rst, .docx, .md, .json, .pptx, .zipファイルを25MBまでアップロードしてください",
"uploadedFiles": "アップロードされたファイル",
"cancel": "キャンセル",
"train": "トレーニング",
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/locale/zh-TW.json
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
"remote": "遠端",
"name": "名稱",
"choose": "選擇檔案",
"info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .zip 檔案,大小限制為 25MB",
"info": "請上傳 .pdf, .txt, .rst, .docx, .md, .json, .pptx, .zip 檔案,大小限制為 25MB",
"uploadedFiles": "已上傳的檔案",
"cancel": "取消",
"train": "訓練",
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/locale/zh.json
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
"start": "开始聊天",
"name": "名称",
"choose": "选择文件",
"info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .zip 文件,限 25MB",
"info": "请上传 .pdf, .txt, .rst, .csv, .xlsx, .docx, .md, .html, .epub, .json, .pptx, .zip 文件,限 25MB",
"uploadedFiles": "已上传文件",
"cancel": "取消",
"train": "训练",
Expand Down
2 changes: 2 additions & 0 deletions frontend/src/upload/Upload.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,8 @@ function Upload({
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': [
'.xlsx',
],
'application/vnd.openxmlformats-officedocument.presentationml.presentation':
['.pptx'],
},
});

Expand Down

0 comments on commit 5c75634

Please sign in to comment.