diff --git a/docker/prepare.sh b/docker/prepare.sh index ce367c6..e443d30 100644 --- a/docker/prepare.sh +++ b/docker/prepare.sh @@ -1,7 +1,7 @@ #!/bin/bash function start_docker() { - docker run --gpus=all --shm-size 2g --net=host -itd --name bisheng_unstr_dev1 \ + docker run --net=host -itd --name bisheng_unstr_dev1 \ -v /home/hanfeng:/home/hanfeng -v /home/public:/home/public ubuntu:20.04 bash } diff --git a/src/unstructured/__init__.py b/src/bisheng_unstructured/__init__.py similarity index 100% rename from src/unstructured/__init__.py rename to src/bisheng_unstructured/__init__.py diff --git a/src/unstructured/__version__.py b/src/bisheng_unstructured/__version__.py similarity index 100% rename from src/unstructured/__version__.py rename to src/bisheng_unstructured/__version__.py diff --git a/src/unstructured/cleaners/__init__.py b/src/bisheng_unstructured/cleaners/__init__.py similarity index 100% rename from src/unstructured/cleaners/__init__.py rename to src/bisheng_unstructured/cleaners/__init__.py diff --git a/src/unstructured/cleaners/core.py b/src/bisheng_unstructured/cleaners/core.py similarity index 99% rename from src/unstructured/cleaners/core.py rename to src/bisheng_unstructured/cleaners/core.py index 49c2068..050050f 100644 --- a/src/unstructured/cleaners/core.py +++ b/src/bisheng_unstructured/cleaners/core.py @@ -3,10 +3,10 @@ import sys import unicodedata -from unstructured.file_utils.encoding import ( +from bisheng_unstructured.file_utils.encoding import ( format_encoding_str, ) -from unstructured.nlp.patterns import ( +from bisheng_unstructured.nlp.patterns import ( DOUBLE_PARAGRAPH_PATTERN_RE, E_BULLET_PATTERN, LINE_BREAK_RE, diff --git a/src/unstructured/cleaners/extract.py b/src/bisheng_unstructured/cleaners/extract.py similarity index 98% rename from src/unstructured/cleaners/extract.py rename to src/bisheng_unstructured/cleaners/extract.py index 69c3999..e28c0cf 100644 --- a/src/unstructured/cleaners/extract.py +++ b/src/bisheng_unstructured/cleaners/extract.py @@ -2,7 +2,7 @@ import re from typing import List, Optional -from unstructured.nlp.patterns import ( +from bisheng_unstructured.nlp.patterns import ( EMAIL_ADDRESS_PATTERN, EMAIL_DATETIMETZ_PATTERN, IP_ADDRESS_NAME_PATTERN, diff --git a/src/unstructured/cleaners/translate.py b/src/bisheng_unstructured/cleaners/translate.py similarity index 95% rename from src/unstructured/cleaners/translate.py rename to src/bisheng_unstructured/cleaners/translate.py index 8cb16a1..69907cd 100644 --- a/src/unstructured/cleaners/translate.py +++ b/src/bisheng_unstructured/cleaners/translate.py @@ -4,8 +4,8 @@ import langdetect from transformers import MarianMTModel, MarianTokenizer -from unstructured.nlp.tokenize import sent_tokenize -from unstructured.staging.huggingface import chunk_by_attention_window +from bisheng_unstructured.nlp.tokenize import sent_tokenize +from bisheng_unstructured.staging.huggingface import chunk_by_attention_window def _get_opus_mt_model_name(source_lang: str, target_lang: str): diff --git a/src/unstructured/documents/__init__.py b/src/bisheng_unstructured/documents/__init__.py similarity index 100% rename from src/unstructured/documents/__init__.py rename to src/bisheng_unstructured/documents/__init__.py diff --git a/src/unstructured/documents/base.py b/src/bisheng_unstructured/documents/base.py similarity index 97% rename from src/unstructured/documents/base.py rename to src/bisheng_unstructured/documents/base.py index a2c729b..54994d3 100644 --- a/src/unstructured/documents/base.py +++ 
b/src/bisheng_unstructured/documents/base.py @@ -3,7 +3,7 @@ from abc import ABC from typing import List, Optional -from unstructured.documents.elements import Element, NarrativeText +from bisheng_unstructured.documents.elements import Element, NarrativeText class Document(ABC): diff --git a/src/unstructured/documents/coordinates.py b/src/bisheng_unstructured/documents/coordinates.py similarity index 100% rename from src/unstructured/documents/coordinates.py rename to src/bisheng_unstructured/documents/coordinates.py diff --git a/src/unstructured/documents/elements.py b/src/bisheng_unstructured/documents/elements.py similarity index 99% rename from src/unstructured/documents/elements.py rename to src/bisheng_unstructured/documents/elements.py index c0cb1ee..8827697 100644 --- a/src/unstructured/documents/elements.py +++ b/src/bisheng_unstructured/documents/elements.py @@ -13,7 +13,7 @@ from functools import wraps from typing import Any, Callable, Dict, List, Optional, Tuple, TypedDict, Union, cast -from unstructured.documents.coordinates import ( +from bisheng_unstructured.documents.coordinates import ( TYPE_TO_COORDINATE_SYSTEM_MAP, CoordinateSystem, RelativeCoordinateSystem, diff --git a/src/unstructured/documents/email_elements.py b/src/bisheng_unstructured/documents/email_elements.py similarity index 97% rename from src/unstructured/documents/email_elements.py rename to src/bisheng_unstructured/documents/email_elements.py index ae449a6..0931c26 100644 --- a/src/unstructured/documents/email_elements.py +++ b/src/bisheng_unstructured/documents/email_elements.py @@ -4,7 +4,7 @@ from datetime import datetime from typing import Callable, List, Union -from unstructured.documents.elements import UUID, Element, NoID, Text +from bisheng_unstructured.documents.elements import UUID, Element, NoID, Text class NoDatestamp(ABC): diff --git a/src/unstructured/documents/html.py b/src/bisheng_unstructured/documents/html.py similarity index 98% rename from src/unstructured/documents/html.py rename to src/bisheng_unstructured/documents/html.py index 22ade7e..ae0ce80 100644 --- a/src/unstructured/documents/html.py +++ b/src/bisheng_unstructured/documents/html.py @@ -12,9 +12,11 @@ -from unstructured.cleaners.core import clean_bullets, replace_unicode_quotes -from unstructured.documents.base import Page -from unstructured.documents.elements import ( +from bisheng_unstructured.cleaners.core import ( + clean_bullets, replace_unicode_quotes +) +from bisheng_unstructured.documents.base import Page +from bisheng_unstructured.documents.elements import ( Address, Element, EmailAddress, @@ -26,9 +28,9 @@ Table, ElementMetadata ) -from unstructured.documents.xml import VALID_PARSERS, XMLDocument -from unstructured.logger import logger -from unstructured.partition.text_type import ( +from bisheng_unstructured.documents.xml import VALID_PARSERS, XMLDocument +from bisheng_unstructured.logger import logger +from bisheng_unstructured.partition.text_type import ( is_bulleted_text, is_email_address, is_possible_narrative_text, diff --git a/src/unstructured/documents/html_utils.py b/src/bisheng_unstructured/documents/html_utils.py similarity index 96% rename from src/unstructured/documents/html_utils.py rename to src/bisheng_unstructured/documents/html_utils.py index 3992436..763e111 100644 --- a/src/unstructured/documents/html_utils.py +++ b/src/bisheng_unstructured/documents/html_utils.py @@ -1,4 +1,4 @@ -from unstructured.documents.markdown import transform_html_table_to_md +from bisheng_unstructured.documents.markdown 
import transform_html_table_to_md


def visualize_html(elements, output_file):
diff --git a/src/bisheng_unstructured/documents/layout.py b/src/bisheng_unstructured/documents/layout.py
new file mode 100644
index 0000000..2bd7021
--- /dev/null
+++ b/src/bisheng_unstructured/documents/layout.py
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+import os
+import tempfile
+from pathlib import PurePath
+from typing import Any, BinaryIO, Collection, List, Optional, Tuple, Union, cast
+
+import numpy as np
+from PIL import Image
+
+from bisheng_unstructured.logger import logger
+
+
+class DocumentLayout:
+    """Class for handling documents that are saved as .pdf files. For .pdf files, a
+    document image analysis (DIA) model detects the layout of the page prior to extracting
+    elements."""
+
+    def __init__(self, pages=None):
+        self._pages = pages
+
+    def __str__(self) -> str:
+        return "\n\n".join([str(page) for page in self.pages])
+
+    @property
+    def pages(self) -> List[PageLayout]:
+        """Gets all elements from pages in sequential order."""
+        return self._pages
+
+    @classmethod
+    def from_pages(cls, pages: List[PageLayout]) -> DocumentLayout:
+        """Generates a new instance of the class from a list of `PageLayout`s."""
+        doc_layout = cls()
+        doc_layout._pages = pages
+        return doc_layout
+
+    @classmethod
+    def from_file(
+        cls,
+        filename: str,
+        detection_model: Optional[Any] = None,
+        element_extraction_model: Optional[Any] = None,
+        fixed_layouts: Optional[List[Optional[List[Any]]]] = None,
+        ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
+        extract_tables: bool = False,
+        pdf_image_dpi: int = 200,
+    ) -> DocumentLayout:
+        """Creates a DocumentLayout from a pdf file."""
+        logger.info(f"Reading PDF for file: {filename} ...")
+        pages: List[PageLayout] = []
+        return cls.from_pages(pages)
+
+    @classmethod
+    def from_image_file(
+        cls,
+        filename: str,
+        detection_model: Optional[Any] = None,
+        element_extraction_model: Optional[Any] = None,
+        ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
+        fixed_layout: Optional[List[Any]] = None,
+        extract_tables: bool = False,
+    ) -> DocumentLayout:
+        """Creates a DocumentLayout from an image file."""
+        logger.info(f"Reading image file: {filename} ...")
+        return cls.from_pages([])
+
+
+class PageLayout:
+    """Class for an individual PDF page."""
+
+    def __init__(
+        self,
+        number: int,
+        image: Image.Image,
+        layout: Optional[List[Any]],
+        image_metadata: Optional[dict] = None,
+        image_path: Optional[Union[str, PurePath]] = None,
+        detection_model: Optional[Any] = None,
+        element_extraction_model: Optional[Any] = None,
+        ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
+        extract_tables: bool = False,
+    ):
+        # Keep the page context on the instance; __str__ and _get_image_array
+        # below read these attributes.
+        self.number = number
+        self.image = image
+        self.image_array: Union[np.ndarray, None] = None
+        self.image_metadata = image_metadata
+        self.image_path = image_path
+        self.layout = layout
+        self.detection_model = detection_model
+        self.element_extraction_model = element_extraction_model
+        self.ocr_strategy = ocr_strategy
+        self.ocr_languages = ocr_languages
+        self.extract_tables = extract_tables
+        self.elements: Collection[Any] = []
+
+    def __str__(self) -> str:
+        return "\n\n".join([str(element) for element in self.elements])
+
+    def get_elements_using_image_extraction(
+        self,
+        inplace=True,
+    ) -> Optional[List[Any]]:
+        """Uses end-to-end text element extraction model to extract the elements on the page."""
+        return []
+
+    def get_elements_with_detection_model(self, inplace=True) -> Optional[List[Any]]:
+        """Uses specified model to detect the elements on the page."""
+        elements = []
+        if inplace:
+            self.elements = elements
+            return None
+        return elements
+
+    def get_elements_from_layout(self, layout: List[Any]) -> List[Any]:
+        """Uses the given Layout to separate the page text into elements, either extracting the
+        text from the discovered layout blocks or from the image using OCR."""
+        return []
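+
+    # The page image is converted lazily: `image_array` starts out as None (set in
+    # __init__ above) and is only materialized from the in-memory PIL image, or
+    # loaded from `image_path`, the first time the pixel data is actually needed.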
+    def _get_image_array(self) -> Union[np.ndarray, None]:
+        """Converts the raw image into a numpy array."""
+        if self.image_array is None:
+            if self.image:
+                self.image_array = np.array(self.image)
+            else:
+                image = Image.open(self.image_path)
+                self.image_array = np.array(image)
+        return self.image_array
+
+    @classmethod
+    def from_image(
+        cls,
+        image: Image.Image,
+        image_path: Optional[Union[str, PurePath]],
+        number: int = 1,
+        detection_model: Optional[Any] = None,
+        element_extraction_model: Optional[Any] = None,
+        layout: Optional[List[Any]] = None,
+        ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
+        extract_tables: bool = False,
+        fixed_layout: Optional[List[Any]] = None,
+    ):
+        """Creates a PageLayout from an already-loaded PIL Image."""
+        page = cls(
+            number=number,
+            image=image,
+            layout=layout,
+            image_path=image_path,
+            detection_model=detection_model,
+            element_extraction_model=element_extraction_model,
+            ocr_strategy=ocr_strategy,
+            ocr_languages=ocr_languages,
+            extract_tables=extract_tables,
+        )
+        return page
diff --git a/src/unstructured/documents/markdown.py b/src/bisheng_unstructured/documents/markdown.py
similarity index 100%
rename from src/unstructured/documents/markdown.py
rename to src/bisheng_unstructured/documents/markdown.py
diff --git a/src/unstructured/documents/pdf_parser/__init__.py b/src/bisheng_unstructured/documents/pdf_parser/__init__.py
similarity index 100%
rename from src/unstructured/documents/pdf_parser/__init__.py
rename to src/bisheng_unstructured/documents/pdf_parser/__init__.py
diff --git a/src/unstructured/documents/pdf_parser/blob.py b/src/bisheng_unstructured/documents/pdf_parser/blob.py
similarity index 100%
rename from src/unstructured/documents/pdf_parser/blob.py
rename to src/bisheng_unstructured/documents/pdf_parser/blob.py
diff --git a/src/unstructured/documents/pdf_parser/image.py b/src/bisheng_unstructured/documents/pdf_parser/image.py
similarity index 95%
rename from src/unstructured/documents/pdf_parser/image.py
rename to src/bisheng_unstructured/documents/pdf_parser/image.py
index 66f0f5f..32fc168 100644
--- a/src/unstructured/documents/pdf_parser/image.py
+++ b/src/bisheng_unstructured/documents/pdf_parser/image.py
@@ -1,9 +1,9 @@
 from typing import Any, Iterator, List, Mapping, Optional, Union
 import base64
-from unstructured.documents.base import Page
+from bisheng_unstructured.documents.base import Page
-from unstructured.models import (
+from bisheng_unstructured.models import (
     LayoutAgent, TableAgent, OCRAgent, TableDetAgent)
 from .blob import Blob
diff --git a/src/unstructured/documents/pdf_parser/pdf.py b/src/bisheng_unstructured/documents/pdf_parser/pdf.py
similarity index 99%
rename from src/unstructured/documents/pdf_parser/pdf.py
rename to src/bisheng_unstructured/documents/pdf_parser/pdf.py
index ec4f49c..ad566df 100644
--- a/src/unstructured/documents/pdf_parser/pdf.py
+++ b/src/bisheng_unstructured/documents/pdf_parser/pdf.py
@@ -22,11 +22,11 @@
 import pypdfium2
 import fitz
-from unstructured.models import (
+from bisheng_unstructured.models import (
     LayoutAgent, TableAgent, OCRAgent, TableDetAgent)
-from unstructured.documents.base import Document, Page
-from unstructured.documents.markdown import (
+from bisheng_unstructured.documents.base import Document, Page
+from bisheng_unstructured.documents.markdown import (
     transform_html_table_to_md,
     merge_md_tables,
     merge_html_tables,
@@ -34,7 +34,7 @@
     clean_html_table
 )
-from unstructured.documents.elements import (
+from bisheng_unstructured.documents.elements import (
     ListItem,
     NarrativeText,
     Text,
diff --git 
a/src/unstructured/documents/pdf_parser/test_pdf.py b/src/bisheng_unstructured/documents/pdf_parser/test_pdf.py similarity index 98% rename from src/unstructured/documents/pdf_parser/test_pdf.py rename to src/bisheng_unstructured/documents/pdf_parser/test_pdf.py index f950f47..b245b7d 100644 --- a/src/unstructured/documents/pdf_parser/test_pdf.py +++ b/src/bisheng_unstructured/documents/pdf_parser/test_pdf.py @@ -8,8 +8,8 @@ import pypdfium2 import fitz -from unstructured.models import LayoutAgent, TableAgent, OCRAgent -from unstructured.documents.pdf_parser.blob import Blob +from bisheng_unstructured.models import LayoutAgent, TableAgent, OCRAgent +from bisheng_unstructured.documents.pdf_parser.blob import Blob def draw_polygon(image, bbox, text=None, color=(255, 0, 0), thickness=1): diff --git a/src/unstructured/documents/xml.py b/src/bisheng_unstructured/documents/xml.py similarity index 95% rename from src/unstructured/documents/xml.py rename to src/bisheng_unstructured/documents/xml.py index c85ea5e..4014bb9 100644 --- a/src/unstructured/documents/xml.py +++ b/src/bisheng_unstructured/documents/xml.py @@ -2,10 +2,10 @@ from lxml import etree -from unstructured.documents.base import Document, Page -from unstructured.file_utils.encoding import read_txt_file -from unstructured.logger import logger -from unstructured.partition.text import ( +from bisheng_unstructured.documents.base import Document, Page +from bisheng_unstructured.file_utils.encoding import read_txt_file +from bisheng_unstructured.logger import logger +from bisheng_unstructured.partition.text import ( element_from_text, partition_text, ) diff --git a/src/unstructured/file_utils/__init__.py b/src/bisheng_unstructured/file_utils/__init__.py similarity index 100% rename from src/unstructured/file_utils/__init__.py rename to src/bisheng_unstructured/file_utils/__init__.py diff --git a/src/unstructured/file_utils/encoding.py b/src/bisheng_unstructured/file_utils/encoding.py similarity index 98% rename from src/unstructured/file_utils/encoding.py rename to src/bisheng_unstructured/file_utils/encoding.py index 01d57ff..c59bf77 100644 --- a/src/unstructured/file_utils/encoding.py +++ b/src/bisheng_unstructured/file_utils/encoding.py @@ -2,7 +2,7 @@ import chardet -from unstructured.partition.common import convert_to_bytes +from bisheng_unstructured.partition.common import convert_to_bytes ENCODE_REC_THRESHOLD = 0.8 diff --git a/src/unstructured/file_utils/exploration.py b/src/bisheng_unstructured/file_utils/exploration.py similarity index 97% rename from src/unstructured/file_utils/exploration.py rename to src/bisheng_unstructured/file_utils/exploration.py index 55d9719..ede00a4 100644 --- a/src/unstructured/file_utils/exploration.py +++ b/src/bisheng_unstructured/file_utils/exploration.py @@ -5,7 +5,7 @@ import pandas as pd -from unstructured.file_utils.filetype import detect_filetype +from bisheng_unstructured.file_utils.filetype import detect_filetype def get_directory_file_info(directory: str) -> pd.DataFrame: diff --git a/src/unstructured/file_utils/file_conversion.py b/src/bisheng_unstructured/file_utils/file_conversion.py similarity index 91% rename from src/unstructured/file_utils/file_conversion.py rename to src/bisheng_unstructured/file_utils/file_conversion.py index 23b803e..e072a99 100644 --- a/src/unstructured/file_utils/file_conversion.py +++ b/src/bisheng_unstructured/file_utils/file_conversion.py @@ -1,8 +1,8 @@ import tempfile from typing import IO, Optional -from unstructured.partition.common import exactly_one 
-from unstructured.utils import dependency_exists, requires_dependencies +from bisheng_unstructured.partition.common import exactly_one +from bisheng_unstructured.utils import dependency_exists, requires_dependencies if dependency_exists("pypandoc"): import pypandoc diff --git a/src/unstructured/file_utils/filetype.py b/src/bisheng_unstructured/file_utils/filetype.py similarity index 97% rename from src/unstructured/file_utils/filetype.py rename to src/bisheng_unstructured/file_utils/filetype.py index a4dd120..9ecdd24 100644 --- a/src/unstructured/file_utils/filetype.py +++ b/src/bisheng_unstructured/file_utils/filetype.py @@ -7,21 +7,20 @@ import zipfile from enum import Enum from functools import wraps -from typing import IO, TYPE_CHECKING, Callable, List, Optional +from typing import IO, Callable, List, Optional -from unstructured.documents.coordinates import PixelSpace -from unstructured.documents.elements import Element, PageBreak -from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str -from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN -from unstructured.partition.common import ( +from bisheng_unstructured.documents.coordinates import PixelSpace +from bisheng_unstructured.documents.elements import Element, PageBreak +from bisheng_unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str +from bisheng_unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN +from bisheng_unstructured.partition.common import ( _add_element_metadata, _remove_element_metadata, exactly_one, normalize_layout_element, ) -if TYPE_CHECKING: - from unstructured_inference.inference.layout import DocumentLayout, PageLayout +from bisheng_unstructured.documents.layout import DocumentLayout, PageLayout try: import magic @@ -30,8 +29,8 @@ except ImportError: # pragma: nocover LIBMAGIC_AVAILABLE = False # pragma: nocover -from unstructured.logger import logger -from unstructured.nlp.patterns import EMAIL_HEAD_RE +from bisheng_unstructured.logger import logger +from bisheng_unstructured.nlp.patterns import EMAIL_HEAD_RE TXT_MIME_TYPES = [ "text/plain", diff --git a/src/unstructured/file_utils/google_filetype.py b/src/bisheng_unstructured/file_utils/google_filetype.py similarity index 100% rename from src/unstructured/file_utils/google_filetype.py rename to src/bisheng_unstructured/file_utils/google_filetype.py diff --git a/src/unstructured/file_utils/metadata.py b/src/bisheng_unstructured/file_utils/metadata.py similarity index 100% rename from src/unstructured/file_utils/metadata.py rename to src/bisheng_unstructured/file_utils/metadata.py diff --git a/src/unstructured/logger.py b/src/bisheng_unstructured/logger.py similarity index 100% rename from src/unstructured/logger.py rename to src/bisheng_unstructured/logger.py diff --git a/src/unstructured/models/__init__.py b/src/bisheng_unstructured/models/__init__.py similarity index 100% rename from src/unstructured/models/__init__.py rename to src/bisheng_unstructured/models/__init__.py diff --git a/src/unstructured/models/layout_agent.py b/src/bisheng_unstructured/models/layout_agent.py similarity index 100% rename from src/unstructured/models/layout_agent.py rename to src/bisheng_unstructured/models/layout_agent.py diff --git a/src/unstructured/models/ocr_agent.py b/src/bisheng_unstructured/models/ocr_agent.py similarity index 100% rename from src/unstructured/models/ocr_agent.py rename to src/bisheng_unstructured/models/ocr_agent.py diff --git a/src/unstructured/models/table_agent.py 
b/src/bisheng_unstructured/models/table_agent.py similarity index 100% rename from src/unstructured/models/table_agent.py rename to src/bisheng_unstructured/models/table_agent.py diff --git a/src/unstructured/ingest/__init__.py b/src/bisheng_unstructured/nlp/__init__.py similarity index 100% rename from src/unstructured/ingest/__init__.py rename to src/bisheng_unstructured/nlp/__init__.py diff --git a/src/unstructured/nlp/english-words.txt b/src/bisheng_unstructured/nlp/english-words.txt similarity index 100% rename from src/unstructured/nlp/english-words.txt rename to src/bisheng_unstructured/nlp/english-words.txt diff --git a/src/unstructured/nlp/english_words.py b/src/bisheng_unstructured/nlp/english_words.py similarity index 100% rename from src/unstructured/nlp/english_words.py rename to src/bisheng_unstructured/nlp/english_words.py diff --git a/src/bisheng_unstructured/nlp/partition.py b/src/bisheng_unstructured/nlp/partition.py new file mode 100644 index 0000000..5621365 --- /dev/null +++ b/src/bisheng_unstructured/nlp/partition.py @@ -0,0 +1,7 @@ +# flake8: noqa +from bisheng_unstructured.partition.pdf import partition_pdf +from bisheng_unstructured.partition.text_type import ( + is_bulleted_text, + is_possible_narrative_text, + is_possible_title, +) diff --git a/src/unstructured/nlp/patterns.py b/src/bisheng_unstructured/nlp/patterns.py similarity index 100% rename from src/unstructured/nlp/patterns.py rename to src/bisheng_unstructured/nlp/patterns.py diff --git a/src/unstructured/nlp/tokenize.py b/src/bisheng_unstructured/nlp/tokenize.py similarity index 100% rename from src/unstructured/nlp/tokenize.py rename to src/bisheng_unstructured/nlp/tokenize.py diff --git a/src/unstructured/ingest/cli/__init__.py b/src/bisheng_unstructured/partition/__init__.py similarity index 100% rename from src/unstructured/ingest/cli/__init__.py rename to src/bisheng_unstructured/partition/__init__.py diff --git a/src/unstructured/partition/api.py b/src/bisheng_unstructured/partition/api.py similarity index 96% rename from src/unstructured/partition/api.py rename to src/bisheng_unstructured/partition/api.py index 1064c06..db81c67 100644 --- a/src/unstructured/partition/api.py +++ b/src/bisheng_unstructured/partition/api.py @@ -7,9 +7,9 @@ import requests -from unstructured.documents.elements import Element -from unstructured.partition.common import exactly_one -from unstructured.staging.base import dict_to_elements, elements_from_json +from bisheng_unstructured.documents.elements import Element +from bisheng_unstructured.partition.common import exactly_one +from bisheng_unstructured.staging.base import dict_to_elements, elements_from_json def partition_via_api( diff --git a/src/unstructured/partition/auto.py b/src/bisheng_unstructured/partition/auto.py similarity index 86% rename from src/unstructured/partition/auto.py rename to src/bisheng_unstructured/partition/auto.py index d15649b..c14e07b 100644 --- a/src/unstructured/partition/auto.py +++ b/src/bisheng_unstructured/partition/auto.py @@ -3,71 +3,71 @@ import requests -from unstructured.documents.elements import DataSourceMetadata -from unstructured.file_utils.filetype import ( +from bisheng_unstructured.documents.elements import DataSourceMetadata +from bisheng_unstructured.file_utils.filetype import ( FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType, detect_filetype, is_json_processable, ) -from unstructured.logger import logger -from unstructured.partition.common import exactly_one -from unstructured.partition.email import partition_email 
-from unstructured.partition.html import partition_html -from unstructured.partition.json import partition_json -from unstructured.partition.text import partition_text -from unstructured.partition.xml import partition_xml -from unstructured.utils import dependency_exists +from bisheng_unstructured.logger import logger +from bisheng_unstructured.partition.common import exactly_one +from bisheng_unstructured.partition.email import partition_email +from bisheng_unstructured.partition.html import partition_html +from bisheng_unstructured.partition.json import partition_json +from bisheng_unstructured.partition.text import partition_text +from bisheng_unstructured.partition.xml import partition_xml +from bisheng_unstructured.utils import dependency_exists if dependency_exists("pandas"): - from unstructured.partition.csv import partition_csv - from unstructured.partition.tsv import partition_tsv + from bisheng_unstructured.partition.csv import partition_csv + from bisheng_unstructured.partition.tsv import partition_tsv if dependency_exists("docx"): - from unstructured.partition.doc import partition_doc - from unstructured.partition.docx import partition_docx + from bisheng_unstructured.partition.doc import partition_doc + from bisheng_unstructured.partition.docx import partition_docx if dependency_exists("docx") and dependency_exists("pypandoc"): - from unstructured.partition.odt import partition_odt + from bisheng_unstructured.partition.odt import partition_odt if dependency_exists("ebooklib"): - from unstructured.partition.epub import partition_epub + from bisheng_unstructured.partition.epub import partition_epub if dependency_exists("pypandoc"): - from unstructured.partition.org import partition_org - from unstructured.partition.rst import partition_rst - from unstructured.partition.rtf import partition_rtf + from bisheng_unstructured.partition.org import partition_org + from bisheng_unstructured.partition.rst import partition_rst + from bisheng_unstructured.partition.rtf import partition_rtf if dependency_exists("markdown"): - from unstructured.partition.md import partition_md + from bisheng_unstructured.partition.md import partition_md if dependency_exists("msg_parser"): - from unstructured.partition.msg import partition_msg + from bisheng_unstructured.partition.msg import partition_msg pdf_imports = ["pdf2image", "pdfminer", "PIL"] if all(dependency_exists(dep) for dep in pdf_imports): - from unstructured.partition.pdf import partition_pdf + from bisheng_unstructured.partition.pdf import partition_pdf if dependency_exists("unstructured_inference"): - from unstructured.partition.image import partition_image + from bisheng_unstructured.partition.image import partition_image if dependency_exists("pptx"): - from unstructured.partition.ppt import partition_ppt - from unstructured.partition.pptx import partition_pptx + from bisheng_unstructured.partition.ppt import partition_ppt + from bisheng_unstructured.partition.pptx import partition_pptx if dependency_exists("pandas") and dependency_exists("openpyxl"): - from unstructured.partition.xlsx import partition_xlsx + from bisheng_unstructured.partition.xlsx import partition_xlsx def partition( diff --git a/src/unstructured/partition/common.py b/src/bisheng_unstructured/partition/common.py similarity index 96% rename from src/unstructured/partition/common.py rename to src/bisheng_unstructured/partition/common.py index 8975aa4..915da72 100644 --- a/src/unstructured/partition/common.py +++ b/src/bisheng_unstructured/partition/common.py @@ -10,8 +10,8 
@@ import emoji from tabulate import tabulate -from unstructured.documents.coordinates import CoordinateSystem -from unstructured.documents.elements import ( +from bisheng_unstructured.documents.coordinates import CoordinateSystem +from bisheng_unstructured.documents.elements import ( TYPE_TO_TEXT_ELEMENT_MAP, CheckBox, CoordinatesMetadata, @@ -21,15 +21,15 @@ PageBreak, Text, ) -from unstructured.logger import logger -from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE -from unstructured.utils import dependency_exists +from bisheng_unstructured.logger import logger +from bisheng_unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE +from bisheng_unstructured.utils import dependency_exists if dependency_exists("docx") and dependency_exists("docx.table"): from docx.table import Table as docxtable if TYPE_CHECKING: - from unstructured_inference.inference.layoutelement import ( + from bisheng_unstructured.inference.inference.layoutelement import ( LayoutElement, LocationlessLayoutElement, ) diff --git a/src/unstructured/partition/csv.py b/src/bisheng_unstructured/partition/csv.py similarity index 91% rename from src/unstructured/partition/csv.py rename to src/bisheng_unstructured/partition/csv.py index 6a7314d..1cd5e95 100644 --- a/src/unstructured/partition/csv.py +++ b/src/bisheng_unstructured/partition/csv.py @@ -4,14 +4,14 @@ import pandas as pd from lxml.html.soupparser import fromstring as soupparser_fromstring -from unstructured.documents.elements import ( +from bisheng_unstructured.documents.elements import ( Element, ElementMetadata, Table, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import ( +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( exactly_one, get_last_modified_date, get_last_modified_date_from_file, diff --git a/src/unstructured/partition/doc.py b/src/bisheng_unstructured/partition/doc.py similarity index 90% rename from src/unstructured/partition/doc.py rename to src/bisheng_unstructured/partition/doc.py index ffa8ccc..ba141be 100644 --- a/src/unstructured/partition/doc.py +++ b/src/bisheng_unstructured/partition/doc.py @@ -2,15 +2,15 @@ import tempfile from typing import IO, List, Optional -from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import ( +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( convert_office_doc, exactly_one, get_last_modified_date, get_last_modified_date_from_file, ) -from unstructured.partition.docx import partition_docx +from bisheng_unstructured.partition.docx import partition_docx @process_metadata() diff --git a/src/unstructured/partition/docx.py b/src/bisheng_unstructured/partition/docx.py similarity index 97% rename from src/unstructured/partition/docx.py rename to src/bisheng_unstructured/partition/docx.py index fbe817d..3f9dec4 100644 --- a/src/unstructured/partition/docx.py +++ b/src/bisheng_unstructured/partition/docx.py @@ -9,8 +9,8 @@ from docx.text.paragraph import Paragraph from docx.text.run import Run -from unstructured.cleaners.core import clean_bullets -from unstructured.documents.elements 
import ( +from bisheng_unstructured.cleaners.core import clean_bullets +from bisheng_unstructured.documents.elements import ( Address, Element, ElementMetadata, @@ -25,22 +25,22 @@ Title, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import ( +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( convert_ms_office_table_to_text, exactly_one, get_last_modified_date, get_last_modified_date_from_file, spooled_to_bytes_io_if_needed, ) -from unstructured.partition.text_type import ( +from bisheng_unstructured.partition.text_type import ( is_bulleted_text, is_email_address, is_possible_narrative_text, is_possible_title, is_us_city_state_zip, ) -from unstructured.utils import dependency_exists +from bisheng_unstructured.utils import dependency_exists if dependency_exists("pypandoc"): import pypandoc diff --git a/src/unstructured/partition/email.py b/src/bisheng_unstructured/partition/email.py similarity index 95% rename from src/unstructured/partition/email.py rename to src/bisheng_unstructured/partition/email.py index 12a4452..4150bd3 100644 --- a/src/unstructured/partition/email.py +++ b/src/bisheng_unstructured/partition/email.py @@ -8,13 +8,13 @@ from tempfile import NamedTemporaryFile, SpooledTemporaryFile, TemporaryDirectory from typing import IO, Callable, Dict, List, Optional, Tuple, Union -from unstructured.file_utils.encoding import ( +from bisheng_unstructured.file_utils.encoding import ( COMMON_ENCODINGS, format_encoding_str, read_txt_file, validate_encoding, ) -from unstructured.partition.common import ( +from bisheng_unstructured.partition.common import ( convert_to_bytes, exactly_one, ) @@ -24,15 +24,15 @@ else: from typing import Final -from unstructured.cleaners.core import clean_extra_whitespace, replace_mime_encodings -from unstructured.cleaners.extract import ( +from bisheng_unstructured.cleaners.core import clean_extra_whitespace, replace_mime_encodings +from bisheng_unstructured.cleaners.extract import ( extract_datetimetz, extract_email_address, extract_ip_address, extract_ip_address_name, extract_mapi_id, ) -from unstructured.documents.elements import ( +from bisheng_unstructured.documents.elements import ( Element, ElementMetadata, Image, @@ -41,18 +41,18 @@ Title, process_metadata, ) -from unstructured.documents.email_elements import ( +from bisheng_unstructured.documents.email_elements import ( MetaData, ReceivedInfo, Recipient, Sender, Subject, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.logger import logger -from unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE -from unstructured.partition.html import partition_html -from unstructured.partition.text import partition_text +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.logger import logger +from bisheng_unstructured.nlp.patterns import EMAIL_DATETIMETZ_PATTERN_RE +from bisheng_unstructured.partition.html import partition_html +from bisheng_unstructured.partition.text import partition_text VALID_CONTENT_SOURCES: Final[List[str]] = ["text/html", "text/plain"] diff --git a/src/unstructured/partition/epub.py b/src/bisheng_unstructured/partition/epub.py similarity index 91% rename from src/unstructured/partition/epub.py rename to src/bisheng_unstructured/partition/epub.py index 9a108bb..12b0298 100644 --- 
a/src/unstructured/partition/epub.py +++ b/src/bisheng_unstructured/partition/epub.py @@ -3,14 +3,14 @@ from ebooklib import epub -from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import ( +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( exactly_one, get_last_modified_date, get_last_modified_date_from_file, ) -from unstructured.partition.html import partition_html +from bisheng_unstructured.partition.html import partition_html @process_metadata() diff --git a/src/unstructured/partition/html.py b/src/bisheng_unstructured/partition/html.py similarity index 91% rename from src/unstructured/partition/html.py rename to src/bisheng_unstructured/partition/html.py index 8996529..049ff0a 100644 --- a/src/unstructured/partition/html.py +++ b/src/bisheng_unstructured/partition/html.py @@ -2,24 +2,24 @@ import requests -from unstructured.documents.elements import Element, process_metadata -from unstructured.documents.html import HTMLDocument -from unstructured.documents.xml import VALID_PARSERS -from unstructured.file_utils.encoding import read_txt_file -from unstructured.file_utils.file_conversion import convert_file_to_html_text -from unstructured.file_utils.filetype import ( +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.documents.html import HTMLDocument +from bisheng_unstructured.documents.xml import VALID_PARSERS +from bisheng_unstructured.file_utils.encoding import read_txt_file +from bisheng_unstructured.file_utils.file_conversion import convert_file_to_html_text +from bisheng_unstructured.file_utils.filetype import ( FileType, add_metadata_with_filetype, document_to_element_list, ) -from unstructured.partition.common import ( +from bisheng_unstructured.partition.common import ( exactly_one, get_last_modified_date, get_last_modified_date_from_file, ) if TYPE_CHECKING: - from unstructured_inference.inference.layout import DocumentLayout + from bisheng_unstructured.documents.layout import DocumentLayout @process_metadata() diff --git a/src/unstructured/partition/image.py b/src/bisheng_unstructured/partition/image.py similarity index 91% rename from src/unstructured/partition/image.py rename to src/bisheng_unstructured/partition/image.py index 65bf87a..41b1a1b 100644 --- a/src/unstructured/partition/image.py +++ b/src/bisheng_unstructured/partition/image.py @@ -1,8 +1,8 @@ from typing import List, Optional -from unstructured.documents.elements import Element, process_metadata -from unstructured.partition.common import exactly_one -from unstructured.partition.pdf import partition_pdf_or_image +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.partition.common import exactly_one +from bisheng_unstructured.partition.pdf import partition_pdf_or_image @process_metadata() diff --git a/src/unstructured/partition/json.py b/src/bisheng_unstructured/partition/json.py similarity index 90% rename from src/unstructured/partition/json.py rename to src/bisheng_unstructured/partition/json.py index f4771da..fd1f012 100644 --- a/src/unstructured/partition/json.py +++ b/src/bisheng_unstructured/partition/json.py @@ -1,18 +1,18 @@ import json from typing import IO, List, Optional -from 
unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import ( +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.file_utils.filetype import ( FileType, add_metadata_with_filetype, is_json_processable, ) -from unstructured.partition.common import ( +from bisheng_unstructured.partition.common import ( exactly_one, get_last_modified_date, get_last_modified_date_from_file, ) -from unstructured.staging.base import dict_to_elements +from bisheng_unstructured.staging.base import dict_to_elements @process_metadata() diff --git a/src/unstructured/partition/md.py b/src/bisheng_unstructured/partition/md.py similarity index 89% rename from src/unstructured/partition/md.py rename to src/bisheng_unstructured/partition/md.py index cb19bbb..bf36768 100644 --- a/src/unstructured/partition/md.py +++ b/src/bisheng_unstructured/partition/md.py @@ -3,15 +3,15 @@ import markdown import requests -from unstructured.documents.elements import Element, process_metadata -from unstructured.documents.xml import VALID_PARSERS -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import ( +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.documents.xml import VALID_PARSERS +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( exactly_one, get_last_modified_date, get_last_modified_date_from_file, ) -from unstructured.partition.html import partition_html +from bisheng_unstructured.partition.html import partition_html def optional_decode(contents: Union[str, bytes]) -> str: diff --git a/src/unstructured/partition/msg.py b/src/bisheng_unstructured/partition/msg.py similarity index 92% rename from src/unstructured/partition/msg.py rename to src/bisheng_unstructured/partition/msg.py index 414cc18..4321b16 100644 --- a/src/unstructured/partition/msg.py +++ b/src/bisheng_unstructured/partition/msg.py @@ -4,12 +4,12 @@ import msg_parser -from unstructured.documents.elements import Element, ElementMetadata, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import exactly_one -from unstructured.partition.email import convert_to_iso_8601 -from unstructured.partition.html import partition_html -from unstructured.partition.text import partition_text +from bisheng_unstructured.documents.elements import Element, ElementMetadata, process_metadata +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import exactly_one +from bisheng_unstructured.partition.email import convert_to_iso_8601 +from bisheng_unstructured.partition.html import partition_html +from bisheng_unstructured.partition.text import partition_text @process_metadata() diff --git a/src/unstructured/partition/odt.py b/src/bisheng_unstructured/partition/odt.py similarity index 80% rename from src/unstructured/partition/odt.py rename to src/bisheng_unstructured/partition/odt.py index c6e1f9e..457e73d 100644 --- a/src/unstructured/partition/odt.py +++ b/src/bisheng_unstructured/partition/odt.py @@ -1,12 +1,12 @@ from typing import IO, List, Optional -from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype 
-from unstructured.partition.common import ( +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( get_last_modified_date, get_last_modified_date_from_file, ) -from unstructured.partition.docx import convert_and_partition_docx +from bisheng_unstructured.partition.docx import convert_and_partition_docx @process_metadata() diff --git a/src/unstructured/partition/org.py b/src/bisheng_unstructured/partition/org.py similarity index 83% rename from src/unstructured/partition/org.py rename to src/bisheng_unstructured/partition/org.py index 9e12d28..ae77a22 100644 --- a/src/unstructured/partition/org.py +++ b/src/bisheng_unstructured/partition/org.py @@ -1,8 +1,8 @@ from typing import IO, List, Optional -from unstructured.documents.elements import Element -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.html import convert_and_partition_html +from bisheng_unstructured.documents.elements import Element +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.html import convert_and_partition_html @add_metadata_with_filetype(FileType.ORG) diff --git a/src/unstructured/partition/pdf.py b/src/bisheng_unstructured/partition/pdf.py similarity index 96% rename from src/unstructured/partition/pdf.py rename to src/bisheng_unstructured/partition/pdf.py index 12724d3..3622c0b 100644 --- a/src/unstructured/partition/pdf.py +++ b/src/bisheng_unstructured/partition/pdf.py @@ -10,9 +10,9 @@ from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox from pdfminer.utils import open_filename -from unstructured.cleaners.core import clean_extra_whitespace -from unstructured.documents.coordinates import PixelSpace -from unstructured.documents.elements import ( +from bisheng_unstructured.cleaners.core import clean_extra_whitespace +from bisheng_unstructured.documents.coordinates import PixelSpace +from bisheng_unstructured.documents.elements import ( CoordinatesMetadata, Element, ElementMetadata, @@ -21,22 +21,22 @@ Text, process_metadata, ) -from unstructured.file_utils.filetype import ( +from bisheng_unstructured.file_utils.filetype import ( FileType, add_metadata_with_filetype, document_to_element_list, ) -from unstructured.nlp.patterns import PARAGRAPH_PATTERN -from unstructured.partition.common import ( +from bisheng_unstructured.nlp.patterns import PARAGRAPH_PATTERN +from bisheng_unstructured.partition.common import ( convert_to_bytes, exactly_one, get_last_modified_date, get_last_modified_date_from_file, spooled_to_bytes_io_if_needed, ) -from unstructured.partition.strategies import determine_pdf_or_image_strategy -from unstructured.partition.text import element_from_text, partition_text -from unstructured.utils import requires_dependencies +from bisheng_unstructured.partition.strategies import determine_pdf_or_image_strategy +from bisheng_unstructured.partition.text import element_from_text, partition_text +from bisheng_unstructured.utils import requires_dependencies RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL) @@ -227,7 +227,7 @@ def _partition_pdf_or_image_local( **kwargs, ) -> List[Element]: """Partition using package installed locally.""" - from unstructured_inference.inference.layout import ( + from bisheng_unstructured.inference.inference.layout import ( 
process_data_with_model, process_file_with_model, ) diff --git a/src/unstructured/partition/ppt.py b/src/bisheng_unstructured/partition/ppt.py similarity index 89% rename from src/unstructured/partition/ppt.py rename to src/bisheng_unstructured/partition/ppt.py index 9d9598e..fd0dbad 100644 --- a/src/unstructured/partition/ppt.py +++ b/src/bisheng_unstructured/partition/ppt.py @@ -2,15 +2,15 @@ import tempfile from typing import IO, List, Optional -from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import ( +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( convert_office_doc, exactly_one, get_last_modified_date, get_last_modified_date_from_file, ) -from unstructured.partition.pptx import partition_pptx +from bisheng_unstructured.partition.pptx import partition_pptx @process_metadata() diff --git a/src/unstructured/partition/pptx.py b/src/bisheng_unstructured/partition/pptx.py similarity index 95% rename from src/unstructured/partition/pptx.py rename to src/bisheng_unstructured/partition/pptx.py index 0c2bd69..d075b0d 100644 --- a/src/unstructured/partition/pptx.py +++ b/src/bisheng_unstructured/partition/pptx.py @@ -3,7 +3,7 @@ import pptx -from unstructured.documents.elements import ( +from bisheng_unstructured.documents.elements import ( Element, ElementMetadata, EmailAddress, @@ -15,15 +15,15 @@ Title, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import ( +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( convert_ms_office_table_to_text, exactly_one, get_last_modified_date, get_last_modified_date_from_file, spooled_to_bytes_io_if_needed, ) -from unstructured.partition.text_type import ( +from bisheng_unstructured.partition.text_type import ( is_email_address, is_possible_narrative_text, is_possible_title, diff --git a/src/unstructured/partition/rst.py b/src/bisheng_unstructured/partition/rst.py similarity index 82% rename from src/unstructured/partition/rst.py rename to src/bisheng_unstructured/partition/rst.py index c452426..19cdad1 100644 --- a/src/unstructured/partition/rst.py +++ b/src/bisheng_unstructured/partition/rst.py @@ -1,8 +1,8 @@ from typing import IO, List, Optional -from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.html import convert_and_partition_html +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.html import convert_and_partition_html @process_metadata() diff --git a/src/unstructured/partition/rtf.py b/src/bisheng_unstructured/partition/rtf.py similarity index 82% rename from src/unstructured/partition/rtf.py rename to src/bisheng_unstructured/partition/rtf.py index 1dbe9b5..e68e017 100644 --- a/src/unstructured/partition/rtf.py +++ b/src/bisheng_unstructured/partition/rtf.py @@ -1,8 +1,8 @@ from typing import IO, List, Optional -from unstructured.documents.elements import Element, 
process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.html import convert_and_partition_html +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.html import convert_and_partition_html @process_metadata() diff --git a/src/unstructured/partition/strategies.py b/src/bisheng_unstructured/partition/strategies.py similarity index 97% rename from src/unstructured/partition/strategies.py rename to src/bisheng_unstructured/partition/strategies.py index e66040e..1615e2b 100644 --- a/src/unstructured/partition/strategies.py +++ b/src/bisheng_unstructured/partition/strategies.py @@ -1,8 +1,8 @@ from tempfile import SpooledTemporaryFile from typing import BinaryIO, Dict, List, Optional, Union -from unstructured.logger import logger -from unstructured.utils import dependency_exists +from bisheng_unstructured.logger import logger +from bisheng_unstructured.utils import dependency_exists VALID_STRATEGIES: Dict[str, List[str]] = { "auto": [ diff --git a/src/unstructured/partition/text.py b/src/bisheng_unstructured/partition/text.py similarity index 94% rename from src/unstructured/partition/text.py rename to src/bisheng_unstructured/partition/text.py index b3126d2..83905fd 100644 --- a/src/unstructured/partition/text.py +++ b/src/bisheng_unstructured/partition/text.py @@ -2,12 +2,12 @@ import textwrap from typing import IO, Callable, List, Optional, Tuple -from unstructured.cleaners.core import ( +from bisheng_unstructured.cleaners.core import ( auto_paragraph_grouper, clean_bullets, ) -from unstructured.documents.coordinates import CoordinateSystem -from unstructured.documents.elements import ( +from bisheng_unstructured.documents.coordinates import CoordinateSystem +from bisheng_unstructured.documents.elements import ( Address, Element, ElementMetadata, @@ -18,16 +18,16 @@ Title, process_metadata, ) -from unstructured.file_utils.encoding import read_txt_file -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.nlp.patterns import PARAGRAPH_PATTERN -from unstructured.nlp.tokenize import sent_tokenize -from unstructured.partition.common import ( +from bisheng_unstructured.file_utils.encoding import read_txt_file +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.nlp.patterns import PARAGRAPH_PATTERN +from bisheng_unstructured.nlp.tokenize import sent_tokenize +from bisheng_unstructured.partition.common import ( exactly_one, get_last_modified_date, get_last_modified_date_from_file, ) -from unstructured.partition.text_type import ( +from bisheng_unstructured.partition.text_type import ( is_bulleted_text, is_email_address, is_possible_narrative_text, diff --git a/src/unstructured/partition/text_type.py b/src/bisheng_unstructured/partition/text_type.py similarity index 97% rename from src/unstructured/partition/text_type.py rename to src/bisheng_unstructured/partition/text_type.py index d515011..7507af8 100644 --- a/src/unstructured/partition/text_type.py +++ b/src/bisheng_unstructured/partition/text_type.py @@ -9,10 +9,10 @@ else: from typing import Final -from unstructured.cleaners.core import remove_punctuation -from unstructured.logger import trace_logger -from unstructured.nlp.english_words import ENGLISH_WORDS -from unstructured.nlp.patterns import ( +from 
bisheng_unstructured.cleaners.core import remove_punctuation +from bisheng_unstructured.logger import trace_logger +from bisheng_unstructured.nlp.english_words import ENGLISH_WORDS +from bisheng_unstructured.nlp.patterns import ( EMAIL_ADDRESS_PATTERN_RE, ENDS_IN_PUNCT_RE, UNICODE_BULLETS_RE, @@ -20,7 +20,7 @@ US_PHONE_NUMBERS_RE, ZH_PUNC_NOT_IN_TITLE_RE, ) -from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize +from bisheng_unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"] ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+") diff --git a/src/unstructured/partition/tsv.py b/src/bisheng_unstructured/partition/tsv.py similarity index 91% rename from src/unstructured/partition/tsv.py rename to src/bisheng_unstructured/partition/tsv.py index 0fd2a89..b592ec5 100644 --- a/src/unstructured/partition/tsv.py +++ b/src/bisheng_unstructured/partition/tsv.py @@ -4,14 +4,14 @@ import pandas as pd from lxml.html.soupparser import fromstring as soupparser_fromstring -from unstructured.documents.elements import ( +from bisheng_unstructured.documents.elements import ( Element, ElementMetadata, Table, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import ( +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( exactly_one, get_last_modified_date, get_last_modified_date_from_file, diff --git a/src/unstructured/partition/xlsx.py b/src/bisheng_unstructured/partition/xlsx.py similarity index 92% rename from src/unstructured/partition/xlsx.py rename to src/bisheng_unstructured/partition/xlsx.py index 2f45382..4b4b8cb 100644 --- a/src/unstructured/partition/xlsx.py +++ b/src/bisheng_unstructured/partition/xlsx.py @@ -4,14 +4,14 @@ import pandas as pd from lxml.html.soupparser import fromstring as soupparser_fromstring -from unstructured.documents.elements import ( +from bisheng_unstructured.documents.elements import ( Element, ElementMetadata, Table, process_metadata, ) -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import ( +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( exactly_one, get_last_modified_date, get_last_modified_date_from_file, diff --git a/src/unstructured/partition/xml.py b/src/bisheng_unstructured/partition/xml.py similarity index 91% rename from src/unstructured/partition/xml.py rename to src/bisheng_unstructured/partition/xml.py index dc4a153..1adbe62 100644 --- a/src/unstructured/partition/xml.py +++ b/src/bisheng_unstructured/partition/xml.py @@ -2,16 +2,16 @@ from tempfile import SpooledTemporaryFile from typing import IO, BinaryIO, List, Optional, Union, cast -from unstructured.documents.elements import Element, process_metadata -from unstructured.file_utils.encoding import read_txt_file -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.partition.common import ( +from bisheng_unstructured.documents.elements import Element, process_metadata +from bisheng_unstructured.file_utils.encoding import read_txt_file +from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from bisheng_unstructured.partition.common import ( exactly_one, 
get_last_modified_date, get_last_modified_date_from_file, spooled_to_bytes_io_if_needed, ) -from unstructured.partition.text import partition_text +from bisheng_unstructured.partition.text import partition_text def is_leaf(elem): diff --git a/src/unstructured/ingest/connector/__init__.py b/src/bisheng_unstructured/staging/__init__.py similarity index 100% rename from src/unstructured/ingest/connector/__init__.py rename to src/bisheng_unstructured/staging/__init__.py diff --git a/src/unstructured/staging/argilla.py b/src/bisheng_unstructured/staging/argilla.py similarity index 94% rename from src/unstructured/staging/argilla.py rename to src/bisheng_unstructured/staging/argilla.py index 7e4216a..b25a6c9 100644 --- a/src/unstructured/staging/argilla.py +++ b/src/bisheng_unstructured/staging/argilla.py @@ -7,8 +7,8 @@ TokenClassificationRecord, ) -from unstructured.documents.elements import Text -from unstructured.nlp.tokenize import word_tokenize +from bisheng_unstructured.documents.elements import Text +from bisheng_unstructured.nlp.tokenize import word_tokenize def stage_for_argilla( diff --git a/src/unstructured/staging/base.py b/src/bisheng_unstructured/staging/base.py similarity index 97% rename from src/unstructured/staging/base.py rename to src/bisheng_unstructured/staging/base.py index 0d2d012..f127d68 100644 --- a/src/unstructured/staging/base.py +++ b/src/bisheng_unstructured/staging/base.py @@ -3,15 +3,15 @@ import json from typing import Any, Dict, List, Optional -from unstructured.documents.elements import ( +from bisheng_unstructured.documents.elements import ( TYPE_TO_TEXT_ELEMENT_MAP, CheckBox, Element, ElementMetadata, NoID, ) -from unstructured.partition.common import exactly_one -from unstructured.utils import dependency_exists, requires_dependencies +from bisheng_unstructured.partition.common import exactly_one +from bisheng_unstructured.utils import dependency_exists, requires_dependencies if dependency_exists("pandas"): import pandas as pd diff --git a/src/unstructured/staging/baseplate.py b/src/bisheng_unstructured/staging/baseplate.py similarity index 93% rename from src/unstructured/staging/baseplate.py rename to src/bisheng_unstructured/staging/baseplate.py index e1e0122..1993907 100644 --- a/src/unstructured/staging/baseplate.py +++ b/src/bisheng_unstructured/staging/baseplate.py @@ -1,7 +1,7 @@ from typing import Dict, List, TypedDict -from unstructured.documents.elements import Text -from unstructured.staging.base import flatten_dict +from bisheng_unstructured.documents.elements import Text +from bisheng_unstructured.staging.base import flatten_dict class BaseplateRow(TypedDict): diff --git a/src/unstructured/staging/datasaur.py b/src/bisheng_unstructured/staging/datasaur.py similarity index 96% rename from src/unstructured/staging/datasaur.py rename to src/bisheng_unstructured/staging/datasaur.py index a7f96ae..607fe0e 100644 --- a/src/unstructured/staging/datasaur.py +++ b/src/bisheng_unstructured/staging/datasaur.py @@ -1,6 +1,6 @@ from typing import Any, Dict, List, Optional -from unstructured.documents.elements import Text +from bisheng_unstructured.documents.elements import Text def stage_for_datasaur( diff --git a/src/unstructured/staging/huggingface.py b/src/bisheng_unstructured/staging/huggingface.py similarity index 97% rename from src/unstructured/staging/huggingface.py rename to src/bisheng_unstructured/staging/huggingface.py index 3f67bb9..038b74f 100644 --- a/src/unstructured/staging/huggingface.py +++ 
b/src/bisheng_unstructured/staging/huggingface.py @@ -3,7 +3,7 @@ from transformers import PreTrainedTokenizer -from unstructured.documents.elements import Element, NarrativeText, Text +from bisheng_unstructured.documents.elements import Element, NarrativeText, Text def stage_for_transformers( diff --git a/src/unstructured/staging/label_box.py b/src/bisheng_unstructured/staging/label_box.py similarity index 98% rename from src/unstructured/staging/label_box.py rename to src/bisheng_unstructured/staging/label_box.py index 7e09491..453a0cd 100644 --- a/src/unstructured/staging/label_box.py +++ b/src/bisheng_unstructured/staging/label_box.py @@ -2,7 +2,7 @@ import uuid from typing import Any, Dict, List, Optional, Sequence, Union -from unstructured.documents.elements import UUID, NoID, Text +from bisheng_unstructured.documents.elements import UUID, NoID, Text VALID_ATTACHMENT_TYPES: List[str] = ["IMAGE", "VIDEO", "RAW_TEXT", "TEXT_URL", "HTML"] diff --git a/src/unstructured/staging/label_studio.py b/src/bisheng_unstructured/staging/label_studio.py similarity index 98% rename from src/unstructured/staging/label_studio.py rename to src/bisheng_unstructured/staging/label_studio.py index c94fe43..4dc0dd3 100644 --- a/src/unstructured/staging/label_studio.py +++ b/src/bisheng_unstructured/staging/label_studio.py @@ -2,7 +2,7 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union -from unstructured.documents.elements import Text +from bisheng_unstructured.documents.elements import Text LABEL_STUDIO_TYPE = List[Dict[str, Dict[str, str]]] diff --git a/src/unstructured/staging/prodigy.py b/src/bisheng_unstructured/staging/prodigy.py similarity index 98% rename from src/unstructured/staging/prodigy.py rename to src/bisheng_unstructured/staging/prodigy.py index e4d5a99..55de695 100644 --- a/src/unstructured/staging/prodigy.py +++ b/src/bisheng_unstructured/staging/prodigy.py @@ -2,7 +2,7 @@ import io from typing import Dict, Generator, Iterable, List, Optional, Union -from unstructured.documents.elements import Text +from bisheng_unstructured.documents.elements import Text PRODIGY_TYPE = List[Dict[str, Union[str, Dict[str, str]]]] diff --git a/src/unstructured/staging/weaviate.py b/src/bisheng_unstructured/staging/weaviate.py similarity index 97% rename from src/unstructured/staging/weaviate.py rename to src/bisheng_unstructured/staging/weaviate.py index 6c11a16..0b2030d 100644 --- a/src/unstructured/staging/weaviate.py +++ b/src/bisheng_unstructured/staging/weaviate.py @@ -1,6 +1,6 @@ from typing import Any, Dict, List, TypedDict -from unstructured.documents.elements import ElementMetadata, Text +from bisheng_unstructured.documents.elements import ElementMetadata, Text class Properties(TypedDict): diff --git a/src/unstructured/utils.py b/src/bisheng_unstructured/utils.py similarity index 100% rename from src/unstructured/utils.py rename to src/bisheng_unstructured/utils.py diff --git a/src/unstructured/ingest/README.md b/src/unstructured/ingest/README.md deleted file mode 100644 index cc2d77e..0000000 --- a/src/unstructured/ingest/README.md +++ /dev/null @@ -1,81 +0,0 @@ -# Batch Processing Documents - -## The unstructured-ingest CLI - -The unstructured library includes a CLI to batch ingest documents from (soon to be -various) sources, storing structured outputs locally on the filesystem. - -For example, the following command processes all the documents in S3 in the -`utic-dev-tech-fixtures` bucket with a prefix of `small-pdf-set/`. 
- - unstructured-ingest \ - s3 \ - --remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \ - --anonymous \ - --structured-output-dir s3-small-batch-output \ - --num-processes 2 - -Naturally, --num-processes may be adjusted for better instance utilization with multiprocessing. - -Installation note: make sure to install the following extras when installing unstructured, needed for the above command: - - pip install "unstructured[s3,local-inference]" - -See the [Quick Start](https://github.com/Unstructured-IO/unstructured#eight_pointed_black_star-quick-start) which documents how to pip install `detectron2` and other OS dependencies, necessary for the parsing of .PDF files. - -# Developers' Guide - -## Local testing - -When testing from a local checkout rather than a pip-installed version of `unstructured`, -just execute `unstructured/ingest/main.py`, e.g.: - - PYTHONPATH=. ./unstructured/ingest/main.py \ - s3 \ - --remote-url s3://utic-dev-tech-fixtures/small-pdf-set/ \ - --anonymous \ - --structured-output-dir s3-small-batch-output \ - --num-processes 2 - -## Adding Data Connectors - -To add a connector, refer to [unstructured/ingest/connector/github.py](unstructured/ingest/connector/github.py) as an example that implements the three relevant abstract base classes. - -If the connector has an available `fsspec` implementation, then refer to [unstructured/ingest/connector/s3.py](unstructured/ingest/connector/s3.py). - -Then, update [unstructured/ingest/cli](unstructured/ingest/cli) to add a subcommand associated with the connector, and hook it up to the parent group. - -Create at least one folder under [examples/ingest](examples/ingest) with an easily reproducible -script that shows the new connector in action. - -Finally, to ensure the connector remains stable, add a new script test_unstructured_ingest/test-ingest-\.sh similar to [test_unstructured_ingest/test-ingest-s3.sh](test_unstructured_ingest/test-ingest-s3.sh), and append a line invoking the new script in [test_unstructured_ingest/test-ingest.sh](test_unstructured_ingest/test-ingest.sh). - -You'll notice that the unstructured outputs for the new documents are expected -to be checked into CI under test_unstructured_ingest/expected-structured-output/\. So, you'll need to `git add` those json outputs so that `test-ingest.sh` passes in CI. - -The `main.py` flags of --re-download/--no-re-download, --download-dir, --preserve-downloads, --structured-output-dir, and --reprocess are honored by the connector. - -### The checklist: - -In checklist form, the above steps are summarized as (a minimal connector sketch follows this checklist): - -- [ ] Create a new module under [unstructured/ingest/connector/](unstructured/ingest/connector/) implementing the 3 abstract base classes, similar to [unstructured/ingest/connector/github.py](unstructured/ingest/connector/github.py). - - [ ] The subclass of `BaseIngestDoc` overrides `process_file()` if extra processing logic is needed other than what is provided by [auto.partition()](unstructured/partition/auto.py). -- [ ] Update [unstructured/ingest/cli](unstructured/ingest/cli) with support for the new connector. -- [ ] Create a folder under [examples/ingest](examples/ingest) that includes at least one well documented script. -- [ ] Add a script test_unstructured_ingest/test-ingest-\.sh. Its JSON output files should total no more than 100K. -- [ ] Git add the expected outputs under test_unstructured_ingest/expected-structured-output/\ so the above test passes in CI.
-- [ ] Add a line to [test_unstructured_ingest/test-ingest.sh](test_unstructured_ingest/test-ingest.sh) invoking the new test script. -- [ ] If additional python dependencies are needed for the new connector: - - [ ] Add them as an extra to [setup.py](unstructured/setup.py). - - [ ] Update the Makefile, adding a target for `install-ingest-` and adding another `pip-compile` line to the `pip-compile` make target. See [this commit](https://github.com/Unstructured-IO/unstructured/commit/ab542ca3c6274f96b431142262d47d727f309e37) for a reference. - - [ ] The added dependencies should be imported at runtime when the new connector is invoked, rather than as top-level imports. - - [ ] Add the decorator `unstructured.utils.requires_dependencies` on top of each class instance or function that uses those connector-specific dependencies, e.g. for `GitHubConnector` this should look like `@requires_dependencies(dependencies=["github"], extras="github")`. - - [ ] Run `make tidy` and `make check` to ensure linting checks pass. -- [ ] Honors the conventions of `BaseConnectorConfig` defined in [unstructured/ingest/interfaces.py](unstructured/ingest/interfaces.py) which is passed through [the CLI](unstructured/ingest/main.py): - - [ ] If running with an `.output_dir` where structured outputs already exist for a given file, the file content is not re-downloaded from the data source nor is it reprocessed. This is made possible by implementing the call to `MyIngestDoc.has_output()` which is invoked in [MainProcess._filter_docs_with_outputs](ingest-prep-for-many/unstructured/ingest/main.py). - - [ ] Unless `.reprocess` is `True`, in which case documents are always reprocessed. - - [ ] If `.preserve_download` is `True`, documents downloaded to `.download_dir` are not removed after processing. - - [ ] Else if `.preserve_download` is `False`, documents downloaded to `.download_dir` are removed after they are **successfully** processed during the invocation of `MyIngestDoc.cleanup_file()` in [process_document](unstructured/ingest/doc_processor/generalized.py). - - [ ] Does not re-download documents to `.download_dir` if `.re_download` is `False`, enforced in `MyIngestDoc.get_file()`. - - [ ] Prints more details if `--verbose` is set in the ingest CLI, similar to the [unstructured/ingest/connector/github.py](unstructured/ingest/connector/github.py) logging messages.
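A minimal sketch of the three classes the checklist above refers to, modeled on the airtable connector deleted further down in this diff. The `MySource` names and the `initialize()`/`get_ingest_docs()` methods are illustrative assumptions based on that pattern, not a verbatim copy of the interface:

    from dataclasses import dataclass
    from pathlib import Path
    from typing import List

    from unstructured.ingest.interfaces import (
        BaseConnector,
        BaseConnectorConfig,
        BaseIngestDoc,
        ConnectorCleanupMixin,
        IngestDocCleanupMixin,
    )


    @dataclass
    class SimpleMySourceConfig(BaseConnectorConfig):
        # connector-specific settings, e.g. an access token
        access_token: str


    @dataclass
    class MySourceIngestDoc(IngestDocCleanupMixin, BaseIngestDoc):
        config: SimpleMySourceConfig
        remote_path: str

        @property
        def filename(self) -> Path:
            # where the raw document is downloaded to
            return (Path(self.standard_config.download_dir) / self.remote_path).resolve()

        @property
        def _output_filename(self) -> Path:
            # where the structured .json output is written
            return Path(self.standard_config.output_dir) / f"{self.remote_path}.json"

        @BaseIngestDoc.skip_if_file_exists
        def get_file(self):
            # fetch the raw document into self.filename; import any
            # connector-specific dependencies here, at runtime
            self.filename.parent.mkdir(parents=True, exist_ok=True)
            self.filename.write_text("...")


    @dataclass
    class MySourceConnector(ConnectorCleanupMixin, BaseConnector):
        config: SimpleMySourceConfig

        def initialize(self):
            # e.g. verify credentials and that the download directory is writable
            pass

        def get_ingest_docs(self) -> List[MySourceIngestDoc]:
            # enumerate the documents to process
            return []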
diff --git a/src/unstructured/ingest/cli/cli.py b/src/unstructured/ingest/cli/cli.py deleted file mode 100644 index 4fd8da6..0000000 --- a/src/unstructured/ingest/cli/cli.py +++ /dev/null @@ -1,47 +0,0 @@ -import click - -import unstructured.ingest.cli.cmds as cli_cmds - - -@click.group() -def ingest(): - pass - - -# Dynamically update shared options for supported subcommands -subcommands = [ - cli_cmds.box, - cli_cmds.s3, - cli_cmds.gcs, - cli_cmds.dropbox, - cli_cmds.azure, - cli_cmds.fsspec, - cli_cmds.github, - cli_cmds.gitlab, - cli_cmds.reddit, - cli_cmds.slack, - cli_cmds.discord, - cli_cmds.wikipedia, - cli_cmds.gdrive, - cli_cmds.biomed, - cli_cmds.notion, - cli_cmds.onedrive, - cli_cmds.outlook, - cli_cmds.local, - cli_cmds.elasticsearch, - cli_cmds.confluence, - cli_cmds.sharepoint, - cli_cmds.airtable, -] - -for subcommand in subcommands: - ingest.add_command(subcommand()) - - -def get_cmd() -> click.Command: - cmd = ingest - # Add all subcommands - for subcommand in subcommands: - # add_shared_options(cmd) - cmd.add_command(subcommand()) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/__init__.py b/src/unstructured/ingest/cli/cmds/__init__.py deleted file mode 100644 index d8cb14f..0000000 --- a/src/unstructured/ingest/cli/cmds/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -from .airtable import get_cmd as airtable -from .azure import get_cmd as azure -from .biomed import get_cmd as biomed -from .box import get_cmd as box -from .confluence import get_cmd as confluence -from .discord import get_cmd as discord -from .dropbox import get_cmd as dropbox -from .elasticsearch import get_cmd as elasticsearch -from .fsspec import get_cmd as fsspec -from .gcs import get_cmd as gcs -from .github import get_cmd as github -from .gitlab import get_cmd as gitlab -from .google_drive import get_cmd as gdrive -from .local import get_cmd as local -from .notion import get_cmd as notion -from .onedrive import get_cmd as onedrive -from .outlook import get_cmd as outlook -from .reddit import get_cmd as reddit -from .s3 import get_cmd as s3 -from .sharepoint import get_cmd as sharepoint -from .slack import get_cmd as slack -from .wikipedia import get_cmd as wikipedia - -__all__ = [ - "airtable", - "azure", - "biomed", - "box", - "confluence", - "discord", - "dropbox", - "elasticsearch", - "fsspec", - "gcs", - "gdrive", - "github", - "gitlab", - "local", - "notion", - "onedrive", - "outlook", - "reddit", - "s3", - "sharepoint", - "slack", - "wikipedia", -] diff --git a/src/unstructured/ingest/cli/cmds/airtable.py b/src/unstructured/ingest/cli/cmds/airtable.py deleted file mode 100644 index 0e15a62..0000000 --- a/src/unstructured/ingest/cli/cmds/airtable.py +++ /dev/null @@ -1,76 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import airtable as airtable_fn - - -@click.command() -@click.option( - "--personal-access-token", - default=None, - help="Personal access token to authenticate into Airtable. Check: \ - https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens for more info", -) -@click.option( - "--list-of-paths", - default=None, - help="""A list of paths that specify the locations to ingest data from within Airtable. 
- - If this argument is not set, the connector ingests all tables within each and every base. - --list-of-paths: path1 path2 path3 …. - path: base_id/table_id(optional)/view_id(optional)/ - - To obtain (base, table, view) ids in bulk, check: - https://airtable.com/developers/web/api/list-bases (base ids) - https://airtable.com/developers/web/api/get-base-schema (table and view ids) - https://pyairtable.readthedocs.io/en/latest/metadata.html (base, table and view ids) - - To obtain specific ids from Airtable UI, go to your workspace, and copy any - relevant id from the URL structure: - https://airtable.com/appAbcDeF1ghijKlm/tblABcdEfG1HIJkLm/viwABCDEfg6hijKLM - appAbcDeF1ghijKlm -> base_id - tblABcdEfG1HIJkLm -> table_id - viwABCDEfg6hijKLM -> view_id - - You can also check: https://support.airtable.com/docs/finding-airtable-ids - - Here is an example of one --list-of-paths: - base1/ → gets all tables inside base1 - base1/table1 → gets all rows and columns within table1 in base1 - base1/table1/view1 → gets the rows and columns that are - visible in view1 for the table1 in base1 - - Examples of invalid airtable_paths: - table1 → has to mention base to be valid - base1/view1 → has to mention table to be valid - """, -) -def airtable(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - airtable_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = airtable - add_shared_options(cmd) - add_recursive_option(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/azure.py b/src/unstructured/ingest/cli/cmds/azure.py deleted file mode 100644 index e9cf4f6..0000000 --- a/src/unstructured/ingest/cli/cmds/azure.py +++ /dev/null @@ -1,54 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_remote_url_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import azure as azure_fn - - -@click.command() -@click.option( - "--account-key", - default=None, - help="Azure Blob Storage or DataLake account key (not required if " - "`azure_account_name` is public).", -) -@click.option( - "--account-name", - default=None, - help="Azure Blob Storage or DataLake account name.", -) -@click.option( - "--connection-string", - default=None, - help="Azure Blob Storage or DataLake connection string.", -) -def azure(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - azure_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = azure - add_recursive_option(cmd) - add_shared_options(cmd) - add_remote_url_option(cmd) - return cmd
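The airtable and azure subcommands above, and every other cmds/*.py module deleted below, repeat essentially one skeleton. A condensed sketch of it, where `my_source` and the runner import are placeholders standing in for the per-connector names:

    import logging

    import click

    from unstructured.ingest.cli.common import (
        add_shared_options,
        log_options,
        map_to_processor_config,
        map_to_standard_config,
        run_init_checks,
    )
    from unstructured.ingest.logger import ingest_log_streaming_init, logger
    from unstructured.ingest.runner import my_source as my_source_fn  # hypothetical runner name


    @click.command()
    def my_source(**options):
        # every subcommand: init logging, log the (masked) options, run the
        # shared sanity checks, map options onto the two shared config
        # objects, then hand off to the per-connector runner
        verbose = options.get("verbose", False)
        ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO)
        log_options(options)
        try:
            run_init_checks(**options)
            connector_config = map_to_standard_config(options)
            processor_config = map_to_processor_config(options)
            my_source_fn(connector_config=connector_config, processor_config=processor_config, **options)
        except Exception as e:
            logger.error(e, exc_info=True)
            raise click.ClickException(str(e)) from e


    def get_cmd() -> click.Command:
        cmd = my_source
        add_shared_options(cmd)
        return cmd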
diff --git a/src/unstructured/ingest/cli/cmds/biomed.py b/src/unstructured/ingest/cli/cmds/biomed.py deleted file mode 100644 index aef05ab..0000000 --- a/src/unstructured/ingest/cli/cmds/biomed.py +++ /dev/null @@ -1,71 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import biomed as biomed_fn - - -@click.command() -@click.option( - "--api-id", - default=None, - help="ID parameter for OA Web Service API.", -) -@click.option( - "--api-from", - default=None, - help="From parameter for OA Web Service API.", -) -@click.option( - "--api-until", - default=None, - help="Until parameter for OA Web Service API.", -) -@click.option( - "--decay", - default=0.3, - help="Float factor by which to multiply the delay between retries.", -) -@click.option( - "--path", - default=None, - help="PMC Open Access FTP Directory Path.", -) -@click.option( - "--max-request-time", - default=45, - help="(In seconds) Max request time to OA Web Service API.", -) -@click.option( - "--max-retries", - default=1, - help="Max retries for requests to OA Web Service API.", -) -def biomed( - **options, -): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - biomed_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = biomed - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/box.py b/src/unstructured/ingest/cli/cmds/box.py deleted file mode 100644 index 189ee8f..0000000 --- a/src/unstructured/ingest/cli/cmds/box.py +++ /dev/null @@ -1,43 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_remote_url_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import box as box_fn - - -@click.command() -@click.option( - "--box-app-config", - default=None, - help="Path to Box app credentials as json file.", -) -def box(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - box_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = box - add_recursive_option(cmd) - add_shared_options(cmd) - add_remote_url_option(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/confluence.py b/src/unstructured/ingest/cli/cmds/confluence.py deleted file mode 100644 index 95c9f37..0000000 --- a/src/unstructured/ingest/cli/cmds/confluence.py +++ /dev/null @@ -1,78 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common
import ( - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import confluence as confluence_fn - - -@click.command() -@click.option( - "--api-token", - required=True, - help="API Token to authenticate into Confluence Cloud. \ - Check https://developer.atlassian.com/cloud/confluence/basic-auth-for-rest-apis/ \ - for more info.", -) -@click.option( - "--list-of-spaces", - default=None, - help="A list of confluence space ids to be fetched. From each fetched space, \ - --max-num-of-docs-from-each-space number of docs will be ingested. \ - --list-of-spaces and --max-num-of-spaces cannot be used at the same time", -) -@click.option( - "--max-num-of-docs-from-each-space", - default=100, - help="Target number of documents to ingest from each fetched confluence space. \ - If any space has fewer documents, all the documents from that space will be ingested. \ - Documents are not necessarily ingested in order of creation date.", -) -@click.option( - "--max-num-of-spaces", - default=500, - help="Number of confluence space ids to be fetched. From each fetched space, \ - --max-num-of-docs-from-each-space number of docs will be ingested. \ - --list-of-spaces and --max-num-of-spaces cannot be used at the same time", -) -@click.option( - "--url", - required=True, - help='URL to Confluence Cloud, e.g. "unstructured-ingest-test.atlassian.net"', -) -@click.option( - "--user-email", - required=True, - help="Email to authenticate into Confluence Cloud", -) -def confluence( - **options, -): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - confluence_fn( - connector_config=connector_config, - processor_config=processor_config, - **options, - ) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = confluence - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/discord.py b/src/unstructured/ingest/cli/cmds/discord.py deleted file mode 100644 index efc8c4f..0000000 --- a/src/unstructured/ingest/cli/cmds/discord.py +++ /dev/null @@ -1,50 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import discord as discord_fn - - -@click.command() -@click.option( - "--channels", - required=True, - help="A comma separated list of discord channel ids to ingest from.", -) -@click.option( - "--period", - default=None, - help="Number of days to go back in the history of discord channels, must be a number", -) -@click.option( - "--token", - required=True, - help="Bot token used to access Discord API, must have " - "READ_MESSAGE_HISTORY scope for the bot user", -) -def discord(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config =
map_to_standard_config(options) - processor_config = map_to_processor_config(options) - discord_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = discord - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/dropbox.py b/src/unstructured/ingest/cli/cmds/dropbox.py deleted file mode 100644 index 83895c5..0000000 --- a/src/unstructured/ingest/cli/cmds/dropbox.py +++ /dev/null @@ -1,43 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_remote_url_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import dropbox as dropbox_fn - - -@click.command() -@click.option( - "--token", - required=True, - help="Dropbox access token.", -) -def dropbox(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - dropbox_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = dropbox - add_shared_options(cmd) - add_remote_url_option(cmd) - add_recursive_option(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/elasticsearch.py b/src/unstructured/ingest/cli/cmds/elasticsearch.py deleted file mode 100644 index b1a4773..0000000 --- a/src/unstructured/ingest/cli/cmds/elasticsearch.py +++ /dev/null @@ -1,56 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import elasticsearch as elasticsearch_fn - - -@click.command() -@click.option( - "--index-name", - required=True, - help="Name for the Elasticsearch index to pull data from", -) -@click.option( - "--jq-query", - default=None, - help="JQ query to get and concatenate a subset of the fields from a JSON document. " - "For a group of JSON documents, it assumes that all of the documents have the same schema. " - "Currently only supported for the Elasticsearch connector. " - "Example: --jq-query '{meta, body}'", -) -@click.option( - "--url", - required=True, - help='URL to the Elasticsearch cluster, e.g. 
"http://localhost:9200"', -) -def elasticsearch(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - elasticsearch_fn( - connector_config=connector_config, - processor_config=processor_config, - **options, - ) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = elasticsearch - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/fsspec.py b/src/unstructured/ingest/cli/cmds/fsspec.py deleted file mode 100644 index 8ab9d7a..0000000 --- a/src/unstructured/ingest/cli/cmds/fsspec.py +++ /dev/null @@ -1,38 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_remote_url_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import fsspec as fsspec_fn - - -@click.command() -def fsspec(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - fsspec_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = fsspec - add_recursive_option(cmd) - add_shared_options(cmd) - add_remote_url_option(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/gcs.py b/src/unstructured/ingest/cli/cmds/gcs.py deleted file mode 100644 index a8b5c1f..0000000 --- a/src/unstructured/ingest/cli/cmds/gcs.py +++ /dev/null @@ -1,44 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_remote_url_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import gcs as gcs_fn - - -@click.command() -@click.option( - "--token", - default=None, - help="Token used to access Google Cloud. 
GCSFS will attempt to use your default gcloud creds" - " or get creds from the google metadata service or fall back to anonymous access.", -) -def gcs(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - gcs_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = gcs - add_recursive_option(cmd) - add_shared_options(cmd) - add_remote_url_option(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/github.py b/src/unstructured/ingest/cli/cmds/github.py deleted file mode 100644 index d99445a..0000000 --- a/src/unstructured/ingest/cli/cmds/github.py +++ /dev/null @@ -1,58 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import github as github_fn - - -@click.command() -@click.option( - "--git-access-token", - default=None, - help="A GitHub or GitLab access token, see https://docs.github.com/en/authentication" - " or https://docs.gitlab.com/ee/api/rest/index.html#personalprojectgroup-access-tokens", -) -@click.option( - "--git-branch", - default=None, - help="The branch to fetch files from. If not given," - " the default repository branch is used.", -) -@click.option( - "--git-file-glob", - default=None, - help="A comma-separated list of file globs to limit which types of files are accepted," - " e.g. '*.html,*.txt'", -) -@click.option( - "--url", - required=True, - help='URL to GitHub repository, e.g. "https://github.com/Unstructured-IO/unstructured",' - ' or a repository owner/name pair, e.g.
"Unstructured-IO/unstructured"', -) -def github(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - github_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = github - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/gitlab.py b/src/unstructured/ingest/cli/cmds/gitlab.py deleted file mode 100644 index 001a36e..0000000 --- a/src/unstructured/ingest/cli/cmds/gitlab.py +++ /dev/null @@ -1,58 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import gitlab as gitlab_fn - - -@click.command() -@click.option( - "--git-access-token", - default=None, - help="A GitHub or GitLab access token, see https://docs.github.com/en/authentication " - " or https://docs.gitlab.com/ee/api/rest/index.html#personalprojectgroup-access-tokens", -) -@click.option( - "--git-branch", - default=None, - help="The branch for which to fetch files from. If not given," - " the default repository branch is used.", -) -@click.option( - "--git-file-glob", - default=None, - help="A comma-separated list of file globs to limit which types of files are accepted," - " e.g. '*.html,*.txt'", -) -@click.option( - "--url", - required=True, - help='URL to GitLab repository, e.g. "https://gitlab.com/gitlab-com/content-sites/docsy-gitlab"' - ', or a repository path, e.g. "gitlab-com/content-sites/docsy-gitlab"', -) -def gitlab(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - gitlab_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = gitlab - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/google_drive.py b/src/unstructured/ingest/cli/cmds/google_drive.py deleted file mode 100644 index 4cf239d..0000000 --- a/src/unstructured/ingest/cli/cmds/google_drive.py +++ /dev/null @@ -1,51 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import gdrive as gdrive_fn - - -@click.command() -@click.option( - "--drive-id", - required=True, - help="Google Drive File or Folder ID.", -) -@click.option( - "--extension", - default=None, - help="Filters the files to be processed based on extension e.g. 
.jpg, .docx, etc.", -) -@click.option( - "--service-account-key", - required=True, - help="Path to the Google Drive service account json file.", -) -def gdrive(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - gdrive_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = gdrive - add_recursive_option(cmd) - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/local.py b/src/unstructured/ingest/cli/cmds/local.py deleted file mode 100644 index 37317d4..0000000 --- a/src/unstructured/ingest/cli/cmds/local.py +++ /dev/null @@ -1,47 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import local as local_fn - - -@click.command() -@click.option( - "--file-glob", - default=None, - help="A comma-separated list of file globs to limit which types of local files are accepted," - " e.g. '*.html,*.txt'", -) -@click.option( - "--input-path", - required=True, - help="Path to the location in the local file system that will be processed.", -) -def local(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - local_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = local - add_shared_options(cmd) - add_recursive_option(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/notion.py b/src/unstructured/ingest/cli/cmds/notion.py deleted file mode 100644 index 32a9843..0000000 --- a/src/unstructured/ingest/cli/cmds/notion.py +++ /dev/null @@ -1,51 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import notion as notion_fn - - -@click.command() -@click.option( - "--page-ids", - default=None, - help="Comma separated list of Notion page IDs to pull text from", -) -@click.option( - "--database-ids", - default=None, - help="Comma separated list of Notion database IDs to pull text from", -) -@click.option( - "--api-key", - required=True, - help="API key for Notion api", -) -def notion(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - 
notion_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = notion - add_shared_options(cmd) - add_recursive_option(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/onedrive.py b/src/unstructured/ingest/cli/cmds/onedrive.py deleted file mode 100644 index d0b0f6c..0000000 --- a/src/unstructured/ingest/cli/cmds/onedrive.py +++ /dev/null @@ -1,67 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import onedrive as onedrive_fn - - -@click.command() -@click.option( - "--authority-url", - default="https://login.microsoftonline.com", - help="Authentication token provider for Microsoft apps, default is " - "https://login.microsoftonline.com", -) -@click.option( - "--client-id", - required=True, - help="Microsoft app client ID", -) -@click.option( - "--client-cred", - required=True, - help="Microsoft App client secret", -) -@click.option( - "--path", - default=None, - help="Folder to start parsing files from.", -) -@click.option( - "--tenant", - default="common", - help="ID or domain name associated with your Azure AD instance", -) -@click.option( - "--user-pname", - required=True, - help="User principal name, usually your Azure AD email.", -) -def onedrive(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - onedrive_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = onedrive - add_recursive_option(cmd) - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/outlook.py b/src/unstructured/ingest/cli/cmds/outlook.py deleted file mode 100644 index d478a8a..0000000 --- a/src/unstructured/ingest/cli/cmds/outlook.py +++ /dev/null @@ -1,68 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import outlook as outlook_fn - - -@click.command() -@click.option( - "--authority-url", - default="https://login.microsoftonline.com", - help="Authentication token provider for Microsoft apps, default is " - "https://login.microsoftonline.com", -) -@click.option( - "--client-id", - required=True, - help="Microsoft app client ID", -) -@click.option( - "--client-cred", - default=None, - help="Microsoft App client secret", -) -@click.option( - "--outlook-folders", - default=None, - help="Comma separated list of folders to download email messages from. " - "Do not specify subfolders.
Use quotes if there are spaces in folder names.", -) -@click.option( - "--tenant", - default="common", - help="ID or domain name associated with your Azure AD instance", -) -@click.option( - "--user-email", - required=True, - help="Outlook email to download messages from.", -) -def outlook(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - outlook_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = outlook - add_recursive_option(cmd) - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/reddit.py b/src/unstructured/ingest/cli/cmds/reddit.py deleted file mode 100644 index 76e2028..0000000 --- a/src/unstructured/ingest/cli/cmds/reddit.py +++ /dev/null @@ -1,67 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import reddit as reddit_fn - - -@click.command() -@click.option( - "--client-id", - required=True, - help="The client ID, see " - "https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#prerequisites" - " for more information.", -) -@click.option( - "--client-secret", - required=True, - help="The client secret, see " - "https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#prerequisites" - " for more information.", -) -@click.option("--num-posts", default=10, help="The number of posts to fetch.") -@click.option( - "--search-query", - default=None, - help="If set, return posts using this query. Otherwise, use hot posts.", -) -@click.option( - "--subreddit-name", - required=True, - help='The name of a subreddit, without the "r/", e.g.
"machinelearning"', -) -@click.option( - "--user-agent", - required=True, - default="Unstructured Ingest Subreddit fetcher", - help="The user agent to use on the Reddit API, see " - "https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#prerequisites" - " for more information.", -) -def reddit(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - reddit_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = reddit - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/s3.py b/src/unstructured/ingest/cli/cmds/s3.py deleted file mode 100644 index aad1bf5..0000000 --- a/src/unstructured/ingest/cli/cmds/s3.py +++ /dev/null @@ -1,44 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_remote_url_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import s3 as s3_fn - - -@click.command() -@click.option( - "--anonymous", - is_flag=True, - default=False, - help="Connect to s3 without local AWS credentials.", -) -def s3(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - s3_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = s3 - add_recursive_option(cmd) - add_shared_options(cmd) - add_remote_url_option(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/sharepoint.py b/src/unstructured/ingest/cli/cmds/sharepoint.py deleted file mode 100644 index e7fd3c2..0000000 --- a/src/unstructured/ingest/cli/cmds/sharepoint.py +++ /dev/null @@ -1,72 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_recursive_option, - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import sharepoint as sharepoint_fn - - -@click.command() -@click.option( - "--client-id", - default=None, - help="Sharepoint app client ID", -) -@click.option( - "--client-cred", - default=None, - help="Sharepoint app secret", -) -@click.option( - "--site", - default=None, - help="Sharepoint site url. Process either base url e.g https://[tenant].sharepoint.com \ - or relative sites https://[tenant].sharepoint.com/sites/.\ - To process all sites within the tenant pass a site url as\ - https://[tenant]-admin.sharepoint.com.\ - This requires the app to be registered at a tenant level", -) -@click.option( - "--path", - default="Shared Documents", - help="Path from which to start parsing files. 
If the connector is to process all sites \ - within the tenant this filter will be applied to all sites' document libraries. \ - Default: 'Shared Documents'", -) -@click.option( - "--files-only", - is_flag=True, - default=False, - help="Process only files.", -) -def sharepoint(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - sharepoint_fn( - connector_config=connector_config, - processor_config=processor_config, - **options, - ) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = sharepoint - add_recursive_option(cmd) - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/slack.py b/src/unstructured/ingest/cli/cmds/slack.py deleted file mode 100644 index aa8e191..0000000 --- a/src/unstructured/ingest/cli/cmds/slack.py +++ /dev/null @@ -1,57 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import slack as slack_fn - - -@click.command() -@click.option( - "--channels", - required=True, - help="Comma separated list of Slack channel IDs to pull messages from, " - "can be a public or private channel", -) -@click.option( - "--start-date", - default=None, - help="Start date/time in formats YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or " - "YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SStz", -) -@click.option( - "--end-date", - default=None, - help="End date/time in formats YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS or " - "YYYY-MM-DD+HH:MM:SS or YYYY-MM-DDTHH:MM:SStz", -) -@click.option( - "--token", - required=True, - help="Bot token used to access Slack API, must have channels:history " "scope for the bot user", -) -def slack(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - slack_fn(connector_config=connector_config, processor_config=processor_config, **options) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = slack - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/cmds/wikipedia.py b/src/unstructured/ingest/cli/cmds/wikipedia.py deleted file mode 100644 index 61a1cf0..0000000 --- a/src/unstructured/ingest/cli/cmds/wikipedia.py +++ /dev/null @@ -1,49 +0,0 @@ -import logging - -import click - -from unstructured.ingest.cli.common import ( - add_shared_options, - log_options, - map_to_processor_config, - map_to_standard_config, - run_init_checks, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.runner import wikipedia as wikipedia_fn - - -@click.command() -@click.option( - "--auto-suggest", - default=True, - help="Whether to automatically suggest a page if the exact page was not found."
- " Set to False if the wrong Wikipedia page is fetched.", -) -@click.option( - "--page-title", - required=True, - help='Title of a Wikipedia page, e.g. "Open source software".', -) -def wikipedia(**options): - verbose = options.get("verbose", False) - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - log_options(options) - try: - run_init_checks(**options) - connector_config = map_to_standard_config(options) - processor_config = map_to_processor_config(options) - wikipedia_fn( - connector_config=connector_config, - processor_config=processor_config, - **options, - ) - except Exception as e: - logger.error(e, exc_info=True) - raise click.ClickException(str(e)) from e - - -def get_cmd() -> click.Command: - cmd = wikipedia - add_shared_options(cmd) - return cmd diff --git a/src/unstructured/ingest/cli/common.py b/src/unstructured/ingest/cli/common.py deleted file mode 100644 index 046ad44..0000000 --- a/src/unstructured/ingest/cli/common.py +++ /dev/null @@ -1,278 +0,0 @@ -import logging -from typing import Optional - -from click import ClickException, Command, Option - -from unstructured.ingest.interfaces import ( - ProcessorConfigs, - StandardConnectorConfig, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger - - -def run_init_checks( - verbose: bool, - local_input_path: Optional[str], - download_dir: Optional[str], - metadata_exclude: Optional[str], - metadata_include: Optional[str], - flatten_metadata: bool, - fields_include: str, - partition_by_api: bool, - partition_endpoint: Optional[str], - preserve_downloads: bool, - download_only: bool, - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - # Initial breaking checks - if local_input_path is not None and download_dir: - raise ClickException( - "Files should already be in local file system: there is nothing to download, " - "but --download-dir is specified.", - ) - if metadata_exclude is not None and metadata_include is not None: - raise ClickException( - "Arguments `--metadata-include` and `--metadata-exclude` are " - "mutually exclusive with each other.", - ) - - # Warnings - if flatten_metadata and "metadata" not in fields_include: - logger.warning( - "`--flatten-metadata` is specified, but there is no metadata to flatten, " - "since `--metadata` is not specified in `--fields-include`.", - ) - if "metadata" not in fields_include and (metadata_include or metadata_exclude): - logger.warning( - "Either '--metadata-include` or `--metadata-exclude` is specified" - " while metadata is not specified in fields-include.", - ) - - if ( - not partition_by_api - and partition_endpoint != "https://api.unstructured.io/general/v0/general" - ): - logger.warning( - "Ignoring --partition-endpoint because --partition-by-api was not set", - ) - if (not preserve_downloads and not download_only) and download_dir: - logger.warning( - "Not preserving downloaded files but download_dir is specified", - ) - - -def log_options(options: dict): - ingest_log_streaming_init(logging.DEBUG if options["verbose"] else logging.INFO) - sensitive_fields = [ - "account_name", - "account_key", - "api_key", - "token", - "client_id", - "client_cred", - ] - options_to_log = options.copy() - options_to_log.update( - { - k: "*******" - for k, v in options_to_log.items() - if k in sensitive_fields and v is not None - }, - ) - logger.debug(f"options: {options_to_log}") - - -def map_to_standard_config(options: dict) -> StandardConnectorConfig: - return StandardConnectorConfig( - 
download_dir=options["download_dir"], - output_dir=options["structured_output_dir"], - download_only=options["download_only"], - fields_include=options["fields_include"], - flatten_metadata=options["flatten_metadata"], - metadata_exclude=options["metadata_exclude"], - metadata_include=options["metadata_include"], - partition_by_api=options["partition_by_api"], - partition_endpoint=options["partition_endpoint"], - preserve_downloads=options["preserve_downloads"], - re_download=options["re_download"], - api_key=options["api_key"], - ) - - -def map_to_processor_config(options: dict) -> ProcessorConfigs: - return ProcessorConfigs( - partition_strategy=options["partition_strategy"], - partition_ocr_languages=options["partition_ocr_languages"], - partition_pdf_infer_table_structure=options["partition_pdf_infer_table_structure"], - partition_encoding=options["partition_encoding"], - num_processes=options["num_processes"], - reprocess=options["reprocess"], - max_docs=options["max_docs"], - ) - - -def add_remote_url_option(cmd: Command): - cmd.params.append( - Option( - ["--remote-url"], - required=True, - help="Remote fsspec URL formatted as `protocol://dir/path`, it can be either " - "a directory or a single file.", - ), - ) - - -def add_recursive_option(cmd: Command): - cmd.params.append( - Option( - ["--recursive"], - is_flag=True, - default=False, - help="Recursively download files in their respective folders, " - "otherwise stop at the files in the provided folder level.", - ), - ) - - -def add_shared_options(cmd: Command): - options = [ - Option( - ["--max-docs"], - default=None, - type=int, - help="If specified, process at most specified number of documents.", - ), - Option( - ["--flatten-metadata"], - is_flag=True, - default=False, - help="Results in flattened json elements. " - "Specifically, the metadata key values are brought to the top-level of the element, " - "and the `metadata` key itself is removed.", - ), - Option( - ["--fields-include"], - default="element_id,text,type,metadata", - help="If set, include the specified top-level fields in an element. " - "Default is `element_id,text,type,metadata`.", - ), - Option( - ["--metadata-include"], - default=None, - help="If set, include the specified metadata fields if they exist " - "and drop all other fields. " - "Usage: provide a single string with comma separated values. " - "Example: --metadata-include filename,page_number ", - ), - Option( - ["--metadata-exclude"], - default=None, - help="If set, drop the specified metadata fields if they exist. " - "Usage: provide a single string with comma separated values. " - "Example: --metadata-exclude filename,page_number ", - ), - Option( - ["--partition-by-api"], - is_flag=True, - default=False, - help="Use a remote API to partition the files." - " Otherwise, use the function from partition.auto", - ), - Option( - ["--partition-endpoint"], - default="https://api.unstructured.io/general/v0/general", - help="If partitioning via api, use the following host. " - "Default: https://api.unstructured.io/general/v0/general", - ), - Option( - ["--partition-strategy"], - default="auto", - help="The method that will be used to process the documents. " - "Default: auto. Other strategies include `fast` and `hi_res`.", - ), - Option( - ["--partition-ocr-languages"], - default="eng", - help="A list of language packs to specify which languages to use for OCR, " - "separated by '+' e.g. 'eng+deu' to use the English and German language packs.
" - "The appropriate Tesseract " - "language pack needs to be installed." - "Default: eng", - ), - Option( - ["--partition-pdf-infer-table-structure"], - default=False, - help="If set to True, partition will includ the table's text content in the response." - "Default: False", - ), - Option( - ["--partition-encoding"], - default=None, - help="Text encoding to use when reading documents. By default the encoding is " - "detected automatically.", - ), - Option( - ["--api-key"], - default="", - help="API Key for partition endpoint.", - ), - Option( - ["--local-input-path"], - default=None, - help="Path to the location in the local file system that will be processed.", - ), - Option( - ["--local-file-glob"], - default=None, - help="A comma-separated list of file globs to limit which " - "types of local files are accepted," - " e.g. '*.html,*.txt'", - ), - Option( - ["--download-dir"], - help="Where files are downloaded to, defaults to " - "`$HOME/.cache/unstructured/ingest/`.", - ), - Option( - ["--preserve-downloads"], - is_flag=True, - default=False, - help="Preserve downloaded files. Otherwise each file is removed after being processed " - "successfully.", - ), - Option( - ["--download-only"], - is_flag=True, - default=False, - help="Download any files that are not already present in either --download-dir or " - "the default download ~/.cache/... location in case --download-dir " - "is not specified and " - "skip processing them through unstructured.", - ), - Option( - ["--re-download/--no-re-download"], - default=False, - help="Re-download files even if they are already present in --download-dir.", - ), - Option( - ["--structured-output-dir"], - default="structured-output", - help="Where to place structured output .json files.", - ), - Option( - ["--reprocess"], - is_flag=True, - default=False, - help="Reprocess a downloaded file even if the relevant structured output .json file " - "in --structured-output-dir already exists.", - ), - Option( - ["--num-processes"], - default=2, - show_default=True, - help="Number of parallel processes to process docs in.", - ), - Option(["-v", "--verbose"], is_flag=True, default=False), - ] - cmd.params.extend(options) diff --git a/src/unstructured/ingest/connector/airtable.py b/src/unstructured/ingest/connector/airtable.py deleted file mode 100644 index 92b09c8..0000000 --- a/src/unstructured/ingest/connector/airtable.py +++ /dev/null @@ -1,222 +0,0 @@ -import os -from dataclasses import dataclass -from pathlib import Path -from typing import Optional - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - - -@dataclass -class SimpleAirtableConfig(BaseConnectorConfig): - """Connector config where: - auth_token is the authentication token to authenticate into Airtable. - - Check https://support.airtable.com/docs/airtable-api-key-deprecation-notice - for more info on authentication. 
- """ - - personal_access_token: str - list_of_paths: Optional[str] - - -@dataclass -class AirtableFileMeta: - """Metadata specifying a table id, a base id which the table is stored in, - and an optional view id in case particular rows and fields are to be ingested""" - - base_id: str - table_id: str - view_id: Optional[str] = None - - -@dataclass -class AirtableIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing). - - Current implementation creates an Airtable connection object - to fetch each document, rather than creating a it for each thread. - """ - - config: SimpleAirtableConfig - file_meta: AirtableFileMeta - - @property - def filename(self): - return ( - Path(self.standard_config.download_dir) - / self.file_meta.base_id - / f"{self.file_meta.table_id}.csv" - ).resolve() - - @property - def _output_filename(self): - """Create output file path based on output directory, base id, and table id""" - output_file = f"{self.file_meta.table_id}.json" - return Path(self.standard_config.output_dir) / self.file_meta.base_id / output_file - - @requires_dependencies(["pyairtable", "pandas"]) - @BaseIngestDoc.skip_if_file_exists - def get_file(self): - logger.debug(f"Fetching {self} - PID: {os.getpid()}") - - # TODO: instead of having a separate connection object for each doc, - # have a separate connection object for each process - import pandas as pd - from pyairtable import Api - - self.api = Api(self.config.personal_access_token) - table = self.api.table(self.file_meta.base_id, self.file_meta.table_id) - - df = pd.DataFrame.from_dict( - [row["fields"] for row in table.all(view=self.file_meta.view_id)], - ).sort_index(axis=1) - - self.document = df.to_csv() - self.filename.parent.mkdir(parents=True, exist_ok=True) - - with open(self.filename, "w", encoding="utf8") as f: - f.write(self.document) - - -airtable_id_prefixes = ["app", "tbl", "viw"] - - -def raise_airtable_path_error(piece): - if any(piece[:3] == prefix for prefix in airtable_id_prefixes): - raise ( - ValueError( - "Path components are not correctly ordered.\ - Valid path structures: \ - - base_id/table_id/view_id , \ - - base_id/table_id, \ - - base_id .\ - It is also possible to leave --airtable-list-of-paths \ - argument empty (this will ingest everything).", - ) - ) - else: - raise ( - ValueError( - """Path components are not valid Airtable ids. - base_id should look like: appAbcDeF1ghijKlm, - table_id should look like: tblAbcDeF1ghijKlm, - view_id should look like: viwAbcDeF1ghijKlm""", - ) - ) - - -def check_path_validity(path): - pieces = path.split("/") - assert ( - 1 <= len(pieces) <= 3 - ), "Path should be composed of between 1-3 \ - components (base_id, table_id, view_id)." 
- - for i, piece in enumerate(pieces): - try: - assert piece[:3] == airtable_id_prefixes[i] - except AssertionError: - raise_airtable_path_error(piece) - - -@dataclass -class AirtableConnector(ConnectorCleanupMixin, BaseConnector): - """Fetches tables or views from an Airtable org.""" - - config: SimpleAirtableConfig - - def __init__( - self, - standard_config: StandardConnectorConfig, - config: SimpleAirtableConfig, - ): - super().__init__(standard_config, config) - - @requires_dependencies(["pyairtable"]) - def initialize(self): - from pyairtable import Api - - self.base_ids_to_fetch_tables_from = [] - if self.config.list_of_paths: - self.list_of_paths = self.config.list_of_paths.split() - - self.api = Api(self.config.personal_access_token) - - @requires_dependencies(["pyairtable"]) - def use_all_bases(self): - from pyairtable.metadata import get_api_bases - - self.base_ids_to_fetch_tables_from = [ - base["id"] for base in get_api_bases(self.api)["bases"] - ] - - @requires_dependencies(["pyairtable"]) - def fetch_table_ids(self): - from pyairtable.metadata import get_base_schema - - bases = [ - (base_id, self.api.base(base_id)) for base_id in self.base_ids_to_fetch_tables_from - ] - - metadata_for_each_base = [ - (base_id, get_base_schema(base)["tables"]) for base_id, base in bases - ] - - baseid_tableid_viewid_tuples = [ - (base_id, table["id"], None) - for base_id, base_metadata in metadata_for_each_base - for table in base_metadata - ] - - return baseid_tableid_viewid_tuples - - def get_ingest_docs(self): - """Fetches documents in an Airtable org.""" - - # When no list of paths provided, the connector ingests everything. - if not self.config.list_of_paths: - self.use_all_bases() - baseid_tableid_viewid_tuples = self.fetch_table_ids() - - # When there is a list of paths, the connector checks the validity - # of the paths, and fetches table_ids to be ingested, based on the paths. 
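- # e.g. "appA/tblB/viwC" -> (appA, tblB, viwC), "appA/tblB" -> (appA, tblB, None), - # and a bare "appA" defers to fetch_table_ids() for every table in that base.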
- else: - self.paths = self.config.list_of_paths.split() - self.paths = [path.strip("/") for path in self.paths] - - [check_path_validity(path) for path in self.paths] - - self.base_ids_to_fetch_tables_from = [] - baseid_tableid_viewid_tuples = [] - - for path in self.paths: - components = path.split("/") - if len(components) == 1: # only a base_id is provided - self.base_ids_to_fetch_tables_from.append(components[0]) - elif len(components) == 2: # a base_id and a table_id are provided - baseid_tableid_viewid_tuples.append((components[0], components[1], None)) - elif len(components) == 3: # a base_id, table_id, and a view_id are provided - baseid_tableid_viewid_tuples.append( - (components[0], components[1], components[2]), - ) - - baseid_tableid_viewid_tuples += self.fetch_table_ids() - - return [ - AirtableIngestDoc( - self.standard_config, - self.config, - AirtableFileMeta(base_id, table_id, view_id), - ) - for base_id, table_id, view_id in baseid_tableid_viewid_tuples - ] diff --git a/src/unstructured/ingest/connector/azure.py b/src/unstructured/ingest/connector/azure.py deleted file mode 100644 index df059c9..0000000 --- a/src/unstructured/ingest/connector/azure.py +++ /dev/null @@ -1,33 +0,0 @@ -from dataclasses import dataclass -from typing import Type - -from unstructured.ingest.connector.fsspec import ( - FsspecConnector, - FsspecIngestDoc, - SimpleFsspecConfig, -) -from unstructured.ingest.interfaces import StandardConnectorConfig -from unstructured.utils import requires_dependencies - - -@dataclass -class SimpleAzureBlobStorageConfig(SimpleFsspecConfig): - pass - - -class AzureBlobStorageIngestDoc(FsspecIngestDoc): - @requires_dependencies(["adlfs", "fsspec"], extras="azure") - def get_file(self): - super().get_file() - - -@requires_dependencies(["adlfs", "fsspec"], extras="azure") -class AzureBlobStorageConnector(FsspecConnector): - ingest_doc_cls: Type[AzureBlobStorageIngestDoc] = AzureBlobStorageIngestDoc - - def __init__( - self, - standard_config: StandardConnectorConfig, - config: SimpleAzureBlobStorageConfig, - ) -> None: - super().__init__(standard_config=standard_config, config=config) diff --git a/src/unstructured/ingest/connector/biomed.py b/src/unstructured/ingest/connector/biomed.py deleted file mode 100644 index c30212a..0000000 --- a/src/unstructured/ingest/connector/biomed.py +++ /dev/null @@ -1,292 +0,0 @@ -import os -import urllib.request -from dataclasses import dataclass -from ftplib import FTP, error_perm -from pathlib import Path -from typing import List, Optional, Union - -import requests -from bs4 import BeautifulSoup -from requests.adapters import HTTPAdapter -from urllib3.util import Retry - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import ( - validate_date_args, -) - -DOMAIN = "ftp.ncbi.nlm.nih.gov" -FTP_DOMAIN = f"ftp://{DOMAIN}" -PMC_DIR = "pub/pmc" -PDF_DIR = "oa_pdf" - - -@dataclass -class BiomedFileMeta: - ftp_path: str - download_filepath: Union[str, os.PathLike] - output_filepath: Union[str, os.PathLike] - - -@dataclass -class SimpleBiomedConfig(BaseConnectorConfig): - """Connector config where path is the FTP directory path and - id_, from_, until, format are API parameters.""" - - path: Optional[str] - # OA Web Service API Options - id_: Optional[str] - from_: Optional[str] - until: Optional[str] - max_retries: int = 5 - 
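# request_timeout is in seconds; decay is the backoff factor handed to urllib3's Retry. -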
request_timeout: int = 45 - decay: float = 0.3 - - def validate_api_inputs(self): - valid = False - - if self.from_: - valid = validate_date_args(self.from_) - - if self.until: - valid = validate_date_args(self.until) - - return valid - - def __post_init__(self): - self.is_file = False - self.is_dir = False - self.is_api = False - - if not self.path: - is_valid = self.validate_api_inputs() - if not is_valid: - raise ValueError( - "Path argument or at least one of the " - "OA Web Service arguments MUST be provided.", - ) - - self.is_api = True - else: - self.path = self.path.strip("/") - is_valid = self.path.lower().startswith(PDF_DIR) - - if not is_valid: - raise ValueError(f"Path MUST start with {PDF_DIR}") - - ftp = FTP(DOMAIN) - ftp.login() - - path = Path(PMC_DIR) / self.path - response = "" - try: - if path.suffix == ".pdf": - response = ftp.cwd(str(path.parent)) - self.is_file = True - else: - response = ftp.cwd(str(path)) - except error_perm as exc: - if "no such file or directory" in exc.args[0].lower(): - raise ValueError(f"The path: {path} is not valid.") - elif "not a directory" in exc.args[0].lower(): - self.is_file = True - elif "command successful" in response: - self.is_dir = True - else: - raise ValueError( - f"Something went wrong when validating the path: {path}.", - ) - - -@dataclass -class BiomedIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - config: SimpleBiomedConfig - file_meta: BiomedFileMeta - - @property - def filename(self): - return Path(self.file_meta.download_filepath).resolve() # type: ignore - - @property - def _output_filename(self): - return Path(f"{self.file_meta.output_filepath}.json").resolve() - - def cleanup_file(self): - if ( - not self.standard_config.preserve_downloads - and self.filename.is_file() - and not self.standard_config.download_only - ): - logger.debug(f"Cleaning up {self}") - Path.unlink(self.filename) - - @BaseIngestDoc.skip_if_file_exists - def get_file(self): - download_path = self.file_meta.download_filepath # type: ignore - dir_ = Path(os.path.dirname(download_path)) # type: ignore - if not dir_.is_dir(): - logger.debug(f"Creating directory: {dir_}") - - if dir_: - dir_.mkdir(parents=True, exist_ok=True) - urllib.request.urlretrieve( - self.file_meta.ftp_path, # type: ignore - self.file_meta.download_filepath, - ) - logger.debug(f"File downloaded: {self.file_meta.download_filepath}") - - -class BiomedConnector(ConnectorCleanupMixin, BaseConnector): - """Objects of this class support fetching documents from the Biomedical literature FTP directory""" - - config: SimpleBiomedConfig - - def __init__( - self, - standard_config: StandardConnectorConfig, - config: SimpleBiomedConfig, - ): - super().__init__(standard_config, config) - - def _list_objects_api(self): - def urls_to_metadata(urls): - files = [] - for url in urls: - parts = url.split(PDF_DIR) - if len(parts) > 1: - local_path = parts[1].strip("/") - files.append( - BiomedFileMeta( - ftp_path=url, - download_filepath=( - Path(self.standard_config.download_dir) / local_path - ).resolve(), - output_filepath=( - Path(self.standard_config.output_dir) / local_path - ).resolve(), - ), - ) - - return files - - files: List[BiomedFileMeta] = [] - - endpoint_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?format=pdf" - - if self.config.id_: - endpoint_url += f"&id={self.config.id_}" - - if self.config.from_: - endpoint_url += f"&from={self.config.from_}" - - if self.config.until: - endpoint_url += f"&until={self.config.until}" - - while endpoint_url: - session = requests.Session()
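- # Retry transient failures with exponential backoff before giving up. -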
retries = Retry( - total=self.config.max_retries, - backoff_factor=self.config.decay, - ) - adapter = HTTPAdapter(max_retries=retries) - session.mount("http://", adapter) - session.mount("https://", adapter) - response = session.get(endpoint_url, timeout=self.config.request_timeout) - soup = BeautifulSoup(response.content, features="lxml") - urls = [link["href"] for link in soup.find_all("link")] - - if not urls: - return files - - endpoint_url = urls[-1] if "resumptiontoken" in urls[-1].lower() else None - if endpoint_url: - urls = urls[:-1] - - files.extend(urls_to_metadata(urls)) - - return files - - def _list_objects(self): - files = [] - - # Conform to mypy, null check performed elsewhere. - # Wouldn't be in this method unless self.config.path exists - path: str = self.config.path if self.config.path else "" - - def traverse(path, download_dir, output_dir): - full_path = Path(PMC_DIR) / path - logger.debug(f"Traversing directory: {full_path}") - - ftp = FTP(DOMAIN) - ftp.login() - - try: - response = ftp.cwd(str(full_path)) - except error_perm: - raise ValueError(f"{full_path} is not a valid directory.") - - if "command successful" in response.lower(): - sub_paths = [path / p for p in ftp.nlst()] - - if not sub_paths: - return - - ext = Path(sub_paths[0]).suffix - if ext: - for sub_path in sub_paths: - ftp_path = f"{FTP_DOMAIN}/{PMC_DIR}/{sub_path}" - local_path = "/".join(str(sub_path).split("/")[1:]) - files.append( - BiomedFileMeta( - ftp_path=ftp_path, - download_filepath=( - Path(self.standard_config.download_dir) / local_path - ).resolve(), - output_filepath=( - Path(self.standard_config.output_dir) / local_path - ).resolve(), - ), - ) - - else: - for sub_path in sub_paths: - traverse(sub_path, download_dir, output_dir) - - else: - raise ValueError(f"{full_path} is not a valid directory.") - - ftp_path = f"{FTP_DOMAIN}/{PMC_DIR}/{self.config.path}" - if self.config.is_file: - local_path = "/".join(path.split("/")[1:]) - return [ - BiomedFileMeta( - ftp_path=ftp_path, - download_filepath=( - Path(self.standard_config.download_dir) / local_path - ).resolve(), - output_filepath=(Path(self.standard_config.output_dir) / local_path).resolve(), - ), - ] - else: - traverse( - Path(path), - Path(self.standard_config.download_dir), - Path(self.standard_config.output_dir), - ) - - return files - - def initialize(self): - pass - - def get_ingest_docs(self): - files = self._list_objects_api() if self.config.is_api else self._list_objects() - return [BiomedIngestDoc(self.standard_config, self.config, file) for file in files] diff --git a/src/unstructured/ingest/connector/box.py b/src/unstructured/ingest/connector/box.py deleted file mode 100644 index 3eae7b8..0000000 --- a/src/unstructured/ingest/connector/box.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Box Connector -Box does not make it simple to download files with an App. -First of all, this does not work with a free Box account. 
-Make sure the App service email is a collaborator for your folder (co-owner or editor) -Make sure you have the 'write all files' application scope -Maybe check 'Make API calls using the as-user header' -REAUTHORIZE app after making any of the above changes -""" - -from dataclasses import dataclass -from typing import Type - -from unstructured.ingest.connector.fsspec import ( - FsspecConnector, - FsspecIngestDoc, - SimpleFsspecConfig, -) -from unstructured.ingest.interfaces import StandardConnectorConfig -from unstructured.utils import requires_dependencies - - -class AccessTokenError(Exception): - """There is a problem with the Access Token.""" - - -@dataclass -class SimpleBoxConfig(SimpleFsspecConfig): - @requires_dependencies(["boxfs"], extras="box") - def __post_init__(self): - from boxsdk import JWTAuth - - super().__post_init__() - # We are passing in a JSON file path via the environment variable. - # Need to convert that to an OAuth2 object. - try: - self.access_kwargs["oauth"] = JWTAuth.from_settings_file( - self.access_kwargs["box_app_config"], - ) - except (TypeError, ValueError, KeyError) as e: - raise AccessTokenError(f"Problem with box_app_config: {e}") - - def __getstate__(self): - """ - NOTE: This should not be a permanent solution. - Multiprocessing fails when it tries to pickle some Locks in the SimpleBoxConfig. - __getstate__ is called right before an object gets pickled. - We are setting those attributes to None to allow pickling. - """ - state = self.__dict__.copy() - state["access_kwargs"]["oauth"]._refresh_lock = None - state["access_kwargs"]["oauth"]._rsa_private_key._blinding_lock = None - state["access_kwargs"]["oauth"]._rsa_private_key._backend = None - state["access_kwargs"]["oauth"]._rsa_private_key._rsa_cdata = None - state["access_kwargs"]["oauth"]._rsa_private_key._evp_pkey = None - return state - - -class BoxIngestDoc(FsspecIngestDoc): - @requires_dependencies(["boxfs", "fsspec"], extras="box") - def get_file(self): - super().get_file() - - -@requires_dependencies(["boxfs", "fsspec"], extras="box") -class BoxConnector(FsspecConnector): - ingest_doc_cls: Type[BoxIngestDoc] = BoxIngestDoc - - def __init__( - self, - config: SimpleBoxConfig, - standard_config: StandardConnectorConfig, - ) -> None: - super().__init__(standard_config, config) diff --git a/src/unstructured/ingest/connector/confluence.py b/src/unstructured/ingest/connector/confluence.py deleted file mode 100644 index 5771aef..0000000 --- a/src/unstructured/ingest/connector/confluence.py +++ /dev/null @@ -1,208 +0,0 @@ -import math -import os -from dataclasses import dataclass -from pathlib import Path -from typing import Optional - -from atlassian import Confluence - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - - -@dataclass -class SimpleConfluenceConfig(BaseConnectorConfig): - """Connector config where: - user_email is the email to authenticate into Confluence Cloud, - api_token is the api token to authenticate into Confluence Cloud, - and url is the URL pointing to the Confluence Cloud instance. - - Check https://developer.atlassian.com/cloud/confluence/basic-auth-for-rest-apis/ - for more info on the api_token.
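- - An illustrative instantiation (hypothetical values): - SimpleConfluenceConfig( - user_email="user@example.com", - api_token="<api-token>", - url="https://example.atlassian.net", - list_of_spaces=None, - max_number_of_spaces=10, - max_number_of_docs_from_each_space=100, - )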
- """ - - user_email: str - api_token: str - url: str - list_of_spaces: Optional[str] - max_number_of_spaces: int - max_number_of_docs_from_each_space: int - - -@dataclass -class ConfluenceFileMeta: - """Metadata specifying: - id for the confluence space that the document locates in, - and the id of document that is being reached to. - """ - - space_id: str - document_id: str - - -def scroll_wrapper(func): - def wrapper(*args, **kwargs): - """Wraps a function to obtain scroll functionality.""" - number_of_items_to_fetch = kwargs["number_of_items_to_fetch"] - del kwargs["number_of_items_to_fetch"] - - kwargs["limit"] = min(100, number_of_items_to_fetch) - kwargs["start"] = 0 if "start" not in kwargs else kwargs["start"] - - all_results = [] - num_iterations = math.ceil(number_of_items_to_fetch / kwargs["limit"]) - - for _ in range(num_iterations): - response = func(*args, **kwargs) - if type(response) is list: - all_results += func(*args, **kwargs) - elif type(response) is dict: - all_results += func(*args, **kwargs)["results"] - - kwargs["start"] += kwargs["limit"] - - return all_results[:number_of_items_to_fetch] - - return wrapper - - -@dataclass -class ConfluenceIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing). - - Current implementation creates a Confluence connection object - to fetch each doc, rather than creating a it for each thread. - """ - - config: SimpleConfluenceConfig - file_meta: ConfluenceFileMeta - - # TODO: remove one of filename or _tmp_download_file, using a wrapper - @property - def filename(self): - return ( - Path(self.standard_config.download_dir) - / self.file_meta.space_id - / f"{self.file_meta.document_id}.html" - ).resolve() - - @property - def _output_filename(self): - """Create output file path based on output directory, space id and document id.""" - output_file = f"{self.file_meta.document_id}.json" - return Path(self.standard_config.output_dir) / self.file_meta.space_id / output_file - - @requires_dependencies(["atlassian"]) - @BaseIngestDoc.skip_if_file_exists - def get_file(self): - logger.debug(f"Fetching {self} - PID: {os.getpid()}") - - # TODO: instead of having a separate connection object for each doc, - # have a separate connection object for each process - confluence = Confluence( - self.config.url, - username=self.config.user_email, - password=self.config.api_token, - ) - - result = confluence.get_page_by_id(page_id=self.file_meta.document_id, expand="body.view") - self.document = result["body"]["view"]["value"] - self.filename.parent.mkdir(parents=True, exist_ok=True) - with open(self.filename, "w", encoding="utf8") as f: - f.write(self.document) - - -@requires_dependencies(["atlassian"]) -@dataclass -class ConfluenceConnector(ConnectorCleanupMixin, BaseConnector): - """Fetches body fields from all documents within all spaces in a Confluence Cloud instance.""" - - config: SimpleConfluenceConfig - - def __init__( - self, - standard_config: StandardConnectorConfig, - config: SimpleConfluenceConfig, - ): - super().__init__(standard_config, config) - - @requires_dependencies(["atlassian"]) - def initialize(self): - self.confluence = Confluence( - url=self.config.url, - username=self.config.user_email, - password=self.config.api_token, - ) - - self.list_of_spaces = None - if self.config.list_of_spaces: - self.list_of_spaces = self.config.list_of_spaces.split(",") - if self.config.max_number_of_spaces: - logger.warning( - 
"""--confluence-list-of-spaces and --confluence-num-of-spaces cannot - be used at the same time. Connector will only fetch the - --confluence-list-of-spaces that you've provided.""", - ) - - @requires_dependencies(["atlassian"]) - def _get_space_ids(self): - """Fetches spaces in a confluence domain.""" - - get_spaces_with_scroll = scroll_wrapper(self.confluence.get_all_spaces) - - all_results = get_spaces_with_scroll( - number_of_items_to_fetch=self.config.max_number_of_spaces, - ) - - space_ids = [space["key"] for space in all_results] - return space_ids - - @requires_dependencies(["atlassian"]) - def _get_docs_ids_within_one_space( - self, - space_id: str, - content_type: str = "page", - ): - get_pages_with_scroll = scroll_wrapper(self.confluence.get_all_pages_from_space) - results = get_pages_with_scroll( - space=space_id, - number_of_items_to_fetch=self.config.max_number_of_docs_from_each_space, - content_type=content_type, - ) - - doc_ids = [(space_id, doc["id"]) for doc in results] - return doc_ids - - @requires_dependencies(["atlassian"]) - def _get_doc_ids_within_spaces(self): - space_ids = self._get_space_ids() if not self.list_of_spaces else self.list_of_spaces - - doc_ids_all = [self._get_docs_ids_within_one_space(space_id=id) for id in space_ids] - - doc_ids_flattened = [ - (space_id, doc_id) - for doc_ids_space in doc_ids_all - for space_id, doc_id in doc_ids_space - ] - return doc_ids_flattened - - def get_ingest_docs(self): - """Fetches all documents in a confluence space.""" - doc_ids = self._get_doc_ids_within_spaces() - return [ - ConfluenceIngestDoc( - self.standard_config, - self.config, - ConfluenceFileMeta(space_id, doc_id), - ) - for space_id, doc_id in doc_ids - ] diff --git a/src/unstructured/ingest/connector/discord.py b/src/unstructured/ingest/connector/discord.py deleted file mode 100644 index 636fdf4..0000000 --- a/src/unstructured/ingest/connector/discord.py +++ /dev/null @@ -1,146 +0,0 @@ -import datetime as dt -import os -from dataclasses import dataclass -from pathlib import Path -from typing import List, Optional - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import ( - requires_dependencies, -) - - -@dataclass -class SimpleDiscordConfig(BaseConnectorConfig): - """Connector config where channels is a comma separated list of - Discord channels to pull messages from. - """ - - # Discord Specific Options - channels: List[str] - token: str - days: Optional[int] - verbose: bool = False - - def __post_init__(self): - if self.days: - try: - self.days = int(self.days) - except ValueError: - raise ValueError("--discord-period must be an integer") - - pass - - @staticmethod - def parse_channels(channel_str: str) -> List[str]: - """Parses a comma separated list of channels into a list.""" - return [x.strip() for x in channel_str.split(",")] - - -@dataclass -class DiscordIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - Also includes a cleanup method. When things go wrong and the cleanup - method is not called, the file is left behind on the filesystem to assist debugging. 
- """ - - config: SimpleDiscordConfig - channel: str - days: Optional[int] - token: str - - # NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file - # __post_init__ for multiprocessing simplicity (no Path objects in initially - # instantiated object) - def _tmp_download_file(self): - channel_file = self.channel + ".txt" - return Path(self.standard_config.download_dir) / channel_file - - @property - def _output_filename(self): - output_file = self.channel + ".json" - return Path(self.standard_config.output_dir) / output_file - - def _create_full_tmp_dir_path(self): - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @BaseIngestDoc.skip_if_file_exists - @requires_dependencies(dependencies=["discord"], extras="discord") - def get_file(self): - """Actually fetches the data from discord and stores it locally.""" - - import discord - from discord.ext import commands - - self._create_full_tmp_dir_path() - if self.config.verbose: - logger.debug(f"fetching {self} - PID: {os.getpid()}") - messages: List[discord.Message] = [] - intents = discord.Intents.default() - intents.message_content = True - bot = commands.Bot(command_prefix=">", intents=intents) - - @bot.event - async def on_ready(): - try: - after_date = None - if self.days: - after_date = dt.datetime.utcnow() - dt.timedelta(days=self.days) - - channel = bot.get_channel(int(self.channel)) - async for msg in channel.history(after=after_date): # type: ignore - messages.append(msg) - - await bot.close() - except Exception as e: - logger.error(f"Error fetching messages: {e}") - await bot.close() - - bot.run(self.token) - - with open(self._tmp_download_file(), "w") as f: - for m in messages: - f.write(m.content + "\n") - - @property - def filename(self): - """The filename of the file created from a discord channel""" - return self._tmp_download_file() - - -class DiscordConnector(ConnectorCleanupMixin, BaseConnector): - """Objects of this class support fetching document(s) from""" - - config: SimpleDiscordConfig - - def __init__( - self, - standard_config: StandardConnectorConfig, - config: SimpleDiscordConfig, - ): - super().__init__(standard_config, config) - - def initialize(self): - """Verify that can get metadata for an object, validates connections info.""" - os.mkdir(self.standard_config.download_dir) - - def get_ingest_docs(self): - return [ - DiscordIngestDoc( - self.standard_config, - self.config, - channel, - self.config.days, - self.config.token, - ) - for channel in self.config.channels - ] diff --git a/src/unstructured/ingest/connector/dropbox.py b/src/unstructured/ingest/connector/dropbox.py deleted file mode 100644 index 385e05b..0000000 --- a/src/unstructured/ingest/connector/dropbox.py +++ /dev/null @@ -1,124 +0,0 @@ -""" -Dropbox Connector -The Dropbox Connector presents a couple abnormal situations. -1) They don't have an unexpiring token -2) They require a forward slash `/` in front of the remote_file_path. This presents -some real problems creating paths. When appending a path that begins with a -forward slash to any path, whether using the / shorthand or joinpath, causes the -starting path to disappear. So the `/` needs to be stripped off. 
-3) To list and get files from the Dropbox root directory you need "", " ", or " /". -""" -import re -from dataclasses import dataclass -from pathlib import Path -from typing import Type - -from unstructured.ingest.connector.fsspec import ( - FsspecConnector, - FsspecIngestDoc, - SimpleFsspecConfig, -) -from unstructured.ingest.interfaces import StandardConnectorConfig -from unstructured.utils import requires_dependencies - - -class MissingFolderError(Exception): - """There is no folder by that name. For root try `dropbox:// /`""" - - -@dataclass -class SimpleDropboxConfig(SimpleFsspecConfig): - pass - - -class DropboxIngestDoc(FsspecIngestDoc): - @requires_dependencies(["dropboxdrivefs", "fsspec"]) - def get_file(self): - super().get_file() - - @property - def _output_filename(self): - # Dropbox requires a forward slash at the front of the folder path. This - # creates some complications in path joining so a custom path is created here. - # Dropbox uses an empty string `""`, a space `" "`, or `" /"` to list the root - if self.config.dir_path == " ": - return Path(self.standard_config.output_dir) / re.sub( - "^/", - "", - f"{self.remote_file_path}.json", - ) - else: - return ( - Path(self.standard_config.output_dir) - / f"{self.remote_file_path.replace(f'/{self.config.dir_path}/', '')}.json" - ) - - def _tmp_download_file(self): - # Dropbox requires a forward slash at the front of the folder path. This - # creates some complications in path joining so a custom path is created here. - # Dropbox uses an empty string `""`, a space `" "`, or `" /"` to list the root - if self.config.dir_path == " ": - return Path(self.standard_config.download_dir) / re.sub( - "^/", - "", - self.remote_file_path, - ) - else: - return Path(self.standard_config.download_dir) / self.remote_file_path.replace( - f"/{self.config.dir_path}/", - "", - ) - - -@requires_dependencies(["dropboxdrivefs", "fsspec"]) -class DropboxConnector(FsspecConnector): - ingest_doc_cls: Type[DropboxIngestDoc] = DropboxIngestDoc - - def __init__( - self, - config: SimpleDropboxConfig, - standard_config: StandardConnectorConfig, - ) -> None: - super().__init__(standard_config, config) - - def initialize(self): - # Dropbox requires a forward slash at the front of the folder path. This - # creates some complications in path joining so a custom path is created here. - ls_output = self.fs.ls(f"/{self.config.path_without_protocol}") - if ls_output and len(ls_output) >= 1: - return - elif ls_output is not None: - raise ValueError( - f"No objects found in {self.config.path}.", - ) - else: - raise MissingFolderError( - "There is no folder by that name. For root try `dropbox:// /`", - ) - - def _list_files(self): - # Dropbox requires a forward slash at the front of the folder path. This - # creates some complications in path joining so a custom path is created here.
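- # Non-recursive: fs.ls the folder and keep entries with a non-zero size - # (directories show up as 0-byte entries); recursive: fs.find walks the tree.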
- if not self.config.recursive: - # fs.ls does not walk directories - # directories that are listed in cloud storage can cause problems because they are seen - # as 0-byte files - return [ - x.get("name") - for x in self.fs.ls( - f"/{self.config.path_without_protocol}", - detail=True, - ) - if x.get("size") - ] - else: - # fs.find will recursively walk directories - # "size" is a common key for all the cloud protocols with fs - return [ - k - for k, v in self.fs.find( - f"/{self.config.path_without_protocol}", - detail=True, - ).items() - if v.get("size") - ] diff --git a/src/unstructured/ingest/connector/elasticsearch.py b/src/unstructured/ingest/connector/elasticsearch.py deleted file mode 100644 index a48c7d4..0000000 --- a/src/unstructured/ingest/connector/elasticsearch.py +++ /dev/null @@ -1,169 +0,0 @@ -import hashlib -import json -import os -from dataclasses import dataclass -from pathlib import Path -from typing import Optional - -import jq -from elasticsearch import Elasticsearch -from elasticsearch.helpers import scan - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - - -@dataclass -class SimpleElasticsearchConfig(BaseConnectorConfig): - """Connector config where: - url is the URL of the Elasticsearch server, - index_name is the name of the index to fetch documents from, - - and jq_query is an optional query to extract specific fields from each document, - rather than getting and processing all fields in a document. - """ - - url: str - index_name: str - jq_query: Optional[str] - - -@dataclass -class ElasticsearchFileMeta: - """Metadata specifying: - the name of the Elasticsearch index being read from, - and the id of the document being fetched. - """ - - index_name: str - document_id: str - - -@dataclass -class ElasticsearchIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Current implementation creates a python Elasticsearch client to fetch each doc, - rather than creating a client for each thread. - """ - - config: SimpleElasticsearchConfig - file_meta: ElasticsearchFileMeta - - # TODO: remove one of filename or _tmp_download_file, using a wrapper - @property - def filename(self): - return ( - Path(self.standard_config.download_dir) - / self.file_meta.index_name - / f"{self.file_meta.document_id}.txt" - ).resolve() - - @property - def _output_filename(self): - """Create the output filename from the document id combined with a hash of the query, - to uniquely identify the output file.""" - # Generate SHA256 hash and take the first 8 characters - query_hash = hashlib.sha256((self.config.jq_query or "").encode()).hexdigest()[:8] - output_file = f"{self.file_meta.document_id}-{query_hash}.json" - return Path(self.standard_config.output_dir) / self.config.index_name / output_file - - # TODO: change test fixtures such that examples with - # nested dictionaries are included in test documents - def _flatten_values(self, value, seperator="\n", no_value_str=""): - """Flattens list or dict objects. Joins each value or item with - the separator character. Keys are not included in the joined string.
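- - For example, _flatten_values(["x", {"y": 1}]) returns "x\n1" with the default separator.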
- When a dict value or a list item is None, no_value_str is used to - represent that value / item.""" - if value is None: - return no_value_str - - if isinstance(value, list): - flattened_values = [self._flatten_values(item, seperator) for item in value] - return seperator.join(flattened_values) - - elif isinstance(value, dict): - flattened_values = [self._flatten_values(item, seperator) for item in value.values()] - return seperator.join(flattened_values) - - else: - return str(value) - - def _concatenate_dict_fields(self, dictionary, seperator="\n"): - """Concatenates all values for each key in a dictionary in a nested manner. - Used to parse a python dictionary to an aggregated string""" - values = [self._flatten_values(value, seperator) for value in dictionary.values()] - concatenated_values = seperator.join(values) - return concatenated_values - - @requires_dependencies(["elasticsearch"]) - @BaseIngestDoc.skip_if_file_exists - def get_file(self): - logger.debug(f"Fetching {self} - PID: {os.getpid()}") - # TODO: instead of having a separate client for each doc, - # have a separate client for each process - es = Elasticsearch(self.config.url) - document_dict = es.get( - index=self.config.index_name, - id=self.file_meta.document_id, - ).body["_source"] - if self.config.jq_query: - document_dict = json.loads(jq.compile(self.config.jq_query).input(document_dict).text()) - self.document = self._concatenate_dict_fields(document_dict) - self.filename.parent.mkdir(parents=True, exist_ok=True) - with open(self.filename, "w", encoding="utf8") as f: - f.write(self.document) - - -@requires_dependencies(["elasticsearch"]) -@dataclass -class ElasticsearchConnector(ConnectorCleanupMixin, BaseConnector): - """Fetches particular fields from all documents in a given elasticsearch cluster and index""" - - config: SimpleElasticsearchConfig - - def __init__( - self, - standard_config: StandardConnectorConfig, - config: SimpleElasticsearchConfig, - ): - super().__init__(standard_config, config) - - def initialize(self): - self.es = Elasticsearch(self.config.url) - self.scan_query: dict = {"query": {"match_all": {}}} - self.search_query: dict = {"match_all": {}} - self.es.search(index=self.config.index_name, query=self.search_query, size=1) - - @requires_dependencies(["elasticsearch"]) - def _get_doc_ids(self): - """Fetches all document ids in an index""" - hits = scan( - self.es, - query=self.scan_query, - scroll="1m", - index=self.config.index_name, - ) - - return [hit["_id"] for hit in hits] - - def get_ingest_docs(self): - """Fetches all documents in an index, using ids that are fetched with _get_doc_ids""" - ids = self._get_doc_ids() - return [ - ElasticsearchIngestDoc( - self.standard_config, - self.config, - ElasticsearchFileMeta(self.config.index_name, id), - ) - for id in ids - ] diff --git a/src/unstructured/ingest/connector/fsspec.py b/src/unstructured/ingest/connector/fsspec.py deleted file mode 100644 index 63df58b..0000000 --- a/src/unstructured/ingest/connector/fsspec.py +++ /dev/null @@ -1,175 +0,0 @@ -import os -import re -from dataclasses import dataclass, field -from pathlib import Path -from typing import Type - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger - -SUPPORTED_REMOTE_FSSPEC_PROTOCOLS = [ - "s3", - "s3a", - "abfs", - "az", - "gs", - "gcs", - "box", - "dropbox", -] - - -@dataclass -class 
SimpleFsspecConfig(BaseConnectorConfig): - # fsspec specific options - path: str - recursive: bool - access_kwargs: dict = field(default_factory=dict) - protocol: str = field(init=False) - path_without_protocol: str = field(init=False) - dir_path: str = field(init=False) - file_path: str = field(init=False) - - def __post_init__(self): - self.protocol, self.path_without_protocol = self.path.split("://") - if self.protocol not in SUPPORTED_REMOTE_FSSPEC_PROTOCOLS: - raise ValueError( - f"Protocol {self.protocol} not supported yet, only " - f"{SUPPORTED_REMOTE_FSSPEC_PROTOCOLS} are supported.", - ) - - # dropbox root is an empty string - match = re.match(rf"{self.protocol}://([\s])/", self.path) - if match and self.protocol == "dropbox": - self.dir_path = " " - self.file_path = "" - return - - # just a path with no trailing prefix - match = re.match(rf"{self.protocol}://([^/\s]+?)(/*)$", self.path) - if match: - self.dir_path = match.group(1) - self.file_path = "" - return - - # valid path with a dir and/or file - match = re.match(rf"{self.protocol}://([^/\s]+?)/([^\s]*)", self.path) - if not match: - raise ValueError( - f"Invalid path {self.path}. Expected <protocol>://<dir-path>/<file-path>.", - ) - self.dir_path = match.group(1) - self.file_path = match.group(2) or "" - - -@dataclass -class FsspecIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Also includes a cleanup method. When things go wrong and the cleanup - method is not called, the file is left behind on the filesystem to assist debugging. - """ - - config: SimpleFsspecConfig - remote_file_path: str - - def _tmp_download_file(self): - return Path(self.standard_config.download_dir) / self.remote_file_path.replace( - f"{self.config.dir_path}/", - "", - ) - - @property - def _output_filename(self): - return ( - Path(self.standard_config.output_dir) - / f"{self.remote_file_path.replace(f'{self.config.dir_path}/', '')}.json" - ) - - def _create_full_tmp_dir_path(self): - """Includes "directories" in the object path""" - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @BaseIngestDoc.skip_if_file_exists - def get_file(self): - """Fetches the file from the current filesystem and stores it locally.""" - from fsspec import AbstractFileSystem, get_filesystem_class - - self._create_full_tmp_dir_path() - fs: AbstractFileSystem = get_filesystem_class(self.config.protocol)( - **self.config.access_kwargs, - ) - logger.debug(f"Fetching {self} - PID: {os.getpid()}") - fs.get(rpath=self.remote_file_path, lpath=self._tmp_download_file().as_posix()) - - @property - def filename(self): - """The filename of the file after downloading from cloud""" - return self._tmp_download_file() - - -class FsspecConnector(ConnectorCleanupMixin, BaseConnector): - """Objects of this class support fetching document(s) from fsspec-backed remote storage.""" - - config: SimpleFsspecConfig - ingest_doc_cls: Type[FsspecIngestDoc] = FsspecIngestDoc - - def __init__( - self, - standard_config: StandardConnectorConfig, - config: SimpleFsspecConfig, - ): - from fsspec import AbstractFileSystem, get_filesystem_class - - super().__init__(standard_config, config) - self.fs: AbstractFileSystem = get_filesystem_class(self.config.protocol)( - **self.config.access_kwargs, - ) - - def initialize(self): - """Verify that we can get metadata for an object; validates connection info.""" - ls_output = self.fs.ls(self.config.path_without_protocol) - if len(ls_output) < 1: - raise ValueError( - f"No objects found in
{self.config.path}.", - ) - - def _list_files(self): - if not self.config.recursive: - # fs.ls does not walk directories - # directories that are listed in cloud storage can cause problems - # because they are seen as 0 byte files - return [ - x.get("name") - for x in self.fs.ls(self.config.path_without_protocol, detail=True) - if x.get("size") > 0 - ] - else: - # fs.find will recursively walk directories - # "size" is a common key for all the cloud protocols with fs - return [ - k - for k, v in self.fs.find( - self.config.path_without_protocol, - detail=True, - ).items() - if v.get("size") > 0 - ] - - def get_ingest_docs(self): - return [ - self.ingest_doc_cls( - standard_config=self.standard_config, - config=self.config, - remote_file_path=file, - ) - for file in self._list_files() - ] diff --git a/src/unstructured/ingest/connector/gcs.py b/src/unstructured/ingest/connector/gcs.py deleted file mode 100644 index 256934b..0000000 --- a/src/unstructured/ingest/connector/gcs.py +++ /dev/null @@ -1,33 +0,0 @@ -from dataclasses import dataclass -from typing import Type - -from unstructured.ingest.connector.fsspec import ( - FsspecConnector, - FsspecIngestDoc, - SimpleFsspecConfig, -) -from unstructured.ingest.interfaces import StandardConnectorConfig -from unstructured.utils import requires_dependencies - - -@dataclass -class SimpleGcsConfig(SimpleFsspecConfig): - pass - - -class GcsIngestDoc(FsspecIngestDoc): - @requires_dependencies(["gcsfs", "fsspec"], extras="gcs") - def get_file(self): - super().get_file() - - -@requires_dependencies(["gcsfs", "fsspec"], extras="gcs") -class GcsConnector(FsspecConnector): - ingest_doc_cls: Type[GcsIngestDoc] = GcsIngestDoc - - def __init__( - self, - config: SimpleGcsConfig, - standard_config: StandardConnectorConfig, - ) -> None: - super().__init__(standard_config, config) diff --git a/src/unstructured/ingest/connector/git.py b/src/unstructured/ingest/connector/git.py deleted file mode 100644 index 85268a9..0000000 --- a/src/unstructured/ingest/connector/git.py +++ /dev/null @@ -1,94 +0,0 @@ -import fnmatch -import os -from dataclasses import dataclass, field -from pathlib import Path -from typing import Optional - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, -) -from unstructured.ingest.logger import logger - - -@dataclass -class SimpleGitConfig(BaseConnectorConfig): - url: str - access_token: Optional[str] - branch: Optional[str] - file_glob: Optional[str] - repo_path: str = field(init=False, repr=False) - - -@dataclass -class GitIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - config: SimpleGitConfig = field(repr=False) - path: str - - @property - def filename(self): - return (Path(self.standard_config.download_dir) / self.path).resolve() - - @property - def _output_filename(self): - return Path(self.standard_config.output_dir) / f"{self.path}.json" - - def _create_full_tmp_dir_path(self): - """includes directories in in the gitlab repository""" - self.filename.parent.mkdir(parents=True, exist_ok=True) - - @BaseIngestDoc.skip_if_file_exists - def get_file(self): - """Fetches the "remote" doc and stores it locally on the filesystem.""" - self._create_full_tmp_dir_path() - logger.debug(f"Fetching {self} - PID: {os.getpid()}") - self._fetch_and_write() - - def _fetch_and_write(self) -> None: - raise NotImplementedError() - - -@dataclass -class GitConnector(ConnectorCleanupMixin, BaseConnector): - config: SimpleGitConfig - - def initialize(self): 
- pass - - def is_file_type_supported(self, path: str) -> bool: - # Workaround to ensure that auto.partition isn't fed with .yaml, .py, etc. files - # TODO: What to do with no filenames? e.g. LICENSE, Makefile, etc. - supported = path.endswith( - ( - ".md", - ".txt", - ".pdf", - ".doc", - ".docx", - ".eml", - ".html", - ".png", - ".jpg", - ".ppt", - ".pptx", - ".xml", - ), - ) - if not supported: - logger.debug( - f"The file {path!r} is discarded as it does not contain a supported filetype.", - ) - return supported - - def does_path_match_glob(self, path: str) -> bool: - if not self.config.file_glob: - return True - patterns = self.config.file_glob.split(",") - for pattern in patterns: - if fnmatch.filter([path], pattern): - return True - logger.debug(f"The file {path!r} is discarded as it does not match any given glob.") - return False diff --git a/src/unstructured/ingest/connector/github.py b/src/unstructured/ingest/connector/github.py deleted file mode 100644 index bf9754d..0000000 --- a/src/unstructured/ingest/connector/github.py +++ /dev/null @@ -1,87 +0,0 @@ -from dataclasses import dataclass -from typing import TYPE_CHECKING -from urllib.parse import urlparse - -import requests - -from unstructured.ingest.connector.git import ( - GitConnector, - GitIngestDoc, - SimpleGitConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from github.Repository import Repository - - -@dataclass -class SimpleGitHubConfig(SimpleGitConfig): - def __post_init__(self): - parsed_gh_url = urlparse(self.url) - path_fragments = [fragment for fragment in parsed_gh_url.path.split("/") if fragment] - - # If a scheme and netloc are provided, ensure they are correct - # Additionally, ensure that the path contains two fragments - if ( - (parsed_gh_url.scheme and parsed_gh_url.scheme != "https") - or (parsed_gh_url.netloc and parsed_gh_url.netloc != "github.com") - or len(path_fragments) != 2 - ): - raise ValueError( - 'Please provide a valid URL, e.g. "https://github.com/Unstructured-IO/unstructured"' - ' or a repository owner/name pair, e.g. "Unstructured-IO/unstructured".', - ) - - # If there's no issues, store the core repository info - self.repo_path = parsed_gh_url.path - - -@dataclass -class GitHubIngestDoc(GitIngestDoc): - repo: "Repository" - - def _fetch_and_write(self) -> None: - content_file = self.repo.get_contents(self.path) - contents = b"" - if ( - not content_file.content # type: ignore - and content_file.encoding == "none" # type: ignore - and content_file.size # type: ignore - ): - logger.info("File too large for the GitHub API, using direct download link instead.") - response = requests.get(content_file.download_url) # type: ignore - if response.status_code != 200: - logger.info("Direct download link has failed... Skipping this file.") - else: - contents = response.content - else: - contents = content_file.decoded_content # type: ignore - - with open(self.filename, "wb") as f: - f.write(contents) - - -@requires_dependencies(["github"], extras="github") -@dataclass -class GitHubConnector(GitConnector): - def __post_init__(self) -> None: - from github import Github - - self.github = Github(self.config.access_token) - - def get_ingest_docs(self): - repo = self.github.get_repo(self.config.repo_path) - - # Load the Git tree with all files, and then create Ingest docs - # for all blobs, i.e. 
all files, ignoring directories - sha = self.config.branch or repo.default_branch - git_tree = repo.get_git_tree(sha, recursive=True) - return [ - GitHubIngestDoc(self.standard_config, self.config, element.path, repo) - for element in git_tree.tree - if element.type == "blob" - and self.is_file_type_supported(element.path) - and (not self.config.file_glob or self.does_path_match_glob(element.path)) - ] diff --git a/src/unstructured/ingest/connector/gitlab.py b/src/unstructured/ingest/connector/gitlab.py deleted file mode 100644 index 40bb361..0000000 --- a/src/unstructured/ingest/connector/gitlab.py +++ /dev/null @@ -1,71 +0,0 @@ -from dataclasses import dataclass -from typing import TYPE_CHECKING -from urllib.parse import urlparse - -from unstructured.ingest.connector.git import ( - GitConnector, - GitIngestDoc, - SimpleGitConfig, -) -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from gitlab.v4.objects.projects import Project - - -@dataclass -class SimpleGitLabConfig(SimpleGitConfig): - def __post_init__(self): - parsed_gh_url = urlparse(self.url) - - # If no scheme or netloc are provided, use the default gitlab.com - if not parsed_gh_url.scheme and not parsed_gh_url.netloc: - self.url = "https://gitlab.com" - else: - self.url = f"{parsed_gh_url.scheme}://{parsed_gh_url.netloc}" - self.repo_path = parsed_gh_url.path - while self.repo_path.startswith("/"): - self.repo_path = self.repo_path[1:] - - -@dataclass -class GitLabIngestDoc(GitIngestDoc): - project: "Project" - - def _fetch_and_write(self) -> None: - content_file = self.project.files.get( - self.path, - ref=self.config.branch or self.project.default_branch, - ) - contents = content_file.decode() - - with open(self.filename, "wb") as f: - f.write(contents) - - -@requires_dependencies(["gitlab"], extras="gitlab") -@dataclass -class GitLabConnector(GitConnector): - def __post_init__(self) -> None: - from gitlab import Gitlab - - self.gitlab = Gitlab(self.config.url, private_token=self.config.access_token) - - def get_ingest_docs(self): - # Load the Git tree with all files, and then create Ingest docs - # for all blobs, i.e. 
all files, ignoring directories - project = self.gitlab.projects.get(self.config.repo_path) - ref = self.config.branch or project.default_branch - git_tree = project.repository_tree( - ref=ref, - recursive=True, - iterator=True, - all=True, - ) - return [ - GitLabIngestDoc(self.standard_config, self.config, element["path"], project) - for element in git_tree - if element["type"] == "blob" - and self.is_file_type_supported(element["path"]) - and (not self.config.file_glob or self.does_path_match_glob(element["path"])) - ] diff --git a/src/unstructured/ingest/connector/google_drive.py b/src/unstructured/ingest/connector/google_drive.py deleted file mode 100644 index 7053a03..0000000 --- a/src/unstructured/ingest/connector/google_drive.py +++ /dev/null @@ -1,262 +0,0 @@ -import io -import json -import os -from dataclasses import dataclass -from mimetypes import guess_extension -from pathlib import Path -from typing import TYPE_CHECKING, Dict, Optional - -from unstructured.file_utils.filetype import EXT_TO_FILETYPE -from unstructured.file_utils.google_filetype import GOOGLE_DRIVE_EXPORT_TYPES -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - BaseSessionHandle, - ConfigSessionHandleMixin, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - IngestDocSessionHandleMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from googleapiclient.discovery import Resource as GoogleAPIResource - -FILE_FORMAT = "{id}-{name}{ext}" -DIRECTORY_FORMAT = "{id}-{name}" - - -@dataclass -class GoogleDriveSessionHandle(BaseSessionHandle): - service: "GoogleAPIResource" - - -@requires_dependencies(["googleapiclient"], extras="google-drive") -def create_service_account_object(key_path, id=None): - """ - Creates a service object for interacting with Google Drive. - - Providing a drive id enforces a key validation process. - - Args: - key_path: Path to Google Drive service account json file. - id: ID of a file on Google Drive. File has to be either publicly accessible or accessible - to the service account. - - Returns: - Service account object - """ - from google.auth import default, exceptions - from googleapiclient.discovery import build - from googleapiclient.errors import HttpError - - try: - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = key_path - creds, _ = default() - service = build("drive", "v3", credentials=creds) - - if id: - service.files().list( - spaces="drive", - fields="files(id)", - pageToken=None, - corpora="user", - q=f"'{id}' in parents", - ).execute() - - except HttpError as exc: - raise ValueError(f"{exc.reason}") - except exceptions.DefaultCredentialsError: - raise ValueError("The provided API key is invalid.") - - return service - - -@dataclass -class SimpleGoogleDriveConfig(ConfigSessionHandleMixin, BaseConnectorConfig): - """Connector config where drive_id is the id of the document to process or - the folder to process all documents from.""" - - # Google Drive Specific Options - drive_id: str - service_account_key: str - extension: Optional[str] - recursive: bool = False - - def __post_init__(self): - if self.extension and self.extension not in EXT_TO_FILETYPE: - raise ValueError( - f"Extension not supported. 
" - f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.", - ) - - def create_session_handle( - self, - ) -> GoogleDriveSessionHandle: - service = create_service_account_object(self.service_account_key) - return GoogleDriveSessionHandle(service=service) - - -@dataclass -class GoogleDriveIngestDoc(IngestDocSessionHandleMixin, IngestDocCleanupMixin, BaseIngestDoc): - config: SimpleGoogleDriveConfig - file_meta: Dict - - @property - def filename(self): - return Path(self.file_meta.get("download_filepath")).resolve() # type: ignore - - @property - def _output_filename(self): - return Path(f"{self.file_meta.get('output_filepath')}.json").resolve() - - @BaseIngestDoc.skip_if_file_exists - @requires_dependencies(["googleapiclient"], extras="google-drive") - def get_file(self): - from googleapiclient.errors import HttpError - from googleapiclient.http import MediaIoBaseDownload - - if self.file_meta.get("mimeType", "").startswith("application/vnd.google-apps"): - export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get( - self.file_meta.get("mimeType"), # type: ignore - ) - if not export_mime: - logger.info( - f"File not supported. Name: {self.file_meta.get('name')} " - f"ID: {self.file_meta.get('id')} " - f"MimeType: {self.file_meta.get('mimeType')}", - ) - return - - request = self.session_handle.service.files().export_media( - fileId=self.file_meta.get("id"), - mimeType=export_mime, - ) - else: - request = self.session_handle.service.files().get_media(fileId=self.file_meta.get("id")) - file = io.BytesIO() - downloader = MediaIoBaseDownload(file, request) - downloaded = False - try: - while downloaded is False: - status, downloaded = downloader.next_chunk() - except HttpError: - pass - - saved = False - if downloaded and file: - dir_ = self.file_meta.get("download_dir") - if dir_: - if not dir_.is_dir(): - logger.debug(f"Creating directory: {self.file_meta.get('download_dir')}") - - if dir_: - dir_.mkdir(parents=True, exist_ok=True) - - with open(self.filename, "wb") as handler: - handler.write(file.getbuffer()) - saved = True - logger.debug(f"File downloaded: {self.filename}.") - - if not saved: - logger.error(f"Error while downloading and saving file: {self.filename}.") - - def write_result(self): - """Write the structured json result for this doc. 
result must be json serializable.""" - if self.standard_config.download_only: - return - self._output_filename.parent.mkdir(parents=True, exist_ok=True) - with open(self._output_filename, "w") as output_f: - output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2)) - logger.info(f"Wrote {self._output_filename}") - - -class GoogleDriveConnector(ConnectorCleanupMixin, BaseConnector): - """Objects of this class support fetching documents from Google Drive""" - - config: SimpleGoogleDriveConfig - - def __init__(self, standard_config: StandardConnectorConfig, config: SimpleGoogleDriveConfig): - super().__init__(standard_config, config) - - def _list_objects(self, drive_id, recursive=False): - files = [] - service = self.config.create_session_handle().service - - def traverse(drive_id, download_dir, output_dir, recursive=False): - page_token = None - while True: - response = ( - service.files() - .list( - spaces="drive", - fields="nextPageToken, files(id, name, mimeType)", - pageToken=page_token, - corpora="user", - q=f"'{drive_id}' in parents", - ) - .execute() - ) - - for meta in response.get("files", []): - if meta.get("mimeType") == "application/vnd.google-apps.folder": - dir_ = DIRECTORY_FORMAT.format(name=meta.get("name"), id=meta.get("id")) - if recursive: - download_sub_dir = (download_dir / dir_).resolve() - output_sub_dir = (output_dir / dir_).resolve() - traverse(meta.get("id"), download_sub_dir, output_sub_dir, True) - else: - ext = "" - if not Path(meta.get("name")).suffixes: - guess = guess_extension(meta.get("mimeType")) - ext = guess if guess else ext - - if meta.get("mimeType", "").startswith("application/vnd.google-apps"): - export_mime = GOOGLE_DRIVE_EXPORT_TYPES.get(meta.get("mimeType")) - if not export_mime: - logger.info( - f"File {meta.get('name')} has an " - f"unsupported MimeType {meta.get('mimeType')}", - ) - continue - - if not ext: - guess = guess_extension(export_mime) - ext = guess if guess else ext - - # TODO (Habeeb): Consider filtering at the query level. 
- if self.config.extension and self.config.extension != ext: # noqa: SIM102 - logger.debug( - f"File {meta.get('name')} does not match " - f"the file type {self.config.extension}", - ) - continue - - name = FILE_FORMAT.format(name=meta.get("name"), id=meta.get("id"), ext=ext) - meta["download_dir"] = download_dir - meta["download_filepath"] = (download_dir / name).resolve() - meta["output_dir"] = output_dir - meta["output_filepath"] = (output_dir / name).resolve() - files.append(meta) - - page_token = response.get("nextPageToken", None) - if page_token is None: - break - - traverse( - drive_id, - Path(self.standard_config.download_dir), - Path(self.standard_config.output_dir), - recursive, - ) - return files - - def initialize(self): - pass - - def get_ingest_docs(self): - files = self._list_objects(self.config.drive_id, self.config.recursive) - return [GoogleDriveIngestDoc(self.standard_config, self.config, file) for file in files] diff --git a/src/unstructured/ingest/connector/local.py b/src/unstructured/ingest/connector/local.py deleted file mode 100644 index 6d4901e..0000000 --- a/src/unstructured/ingest/connector/local.py +++ /dev/null @@ -1,116 +0,0 @@ -import fnmatch -import glob -import os -from dataclasses import dataclass -from pathlib import Path -from typing import Optional, Type - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger - - -@dataclass -class SimpleLocalConfig(BaseConnectorConfig): - # Local specific options - input_path: str - recursive: bool = False - file_glob: Optional[str] = None - - def __post_init__(self): - if os.path.isfile(self.input_path): - self.input_path_is_file = True - else: - self.input_path_is_file = False - - -@dataclass -class LocalIngestDoc(BaseIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - """ - - config: SimpleLocalConfig - path: str - - @property - def filename(self): - """The filename of the local file to be processed""" - return Path(self.path) - - def cleanup_file(self): - """Not applicable to local file system""" - pass - - def get_file(self): - """Not applicable to local file system""" - pass - - @property - def _output_filename(self) -> Path: - """Returns output filename for the doc - If input path argument is a file itself, it returns the filename of the doc. - If input path argument is a folder, it returns the relative path of the doc. 
- """ - input_path = Path(self.config.input_path) - basename = ( - f"{Path(self.path).name}.json" - if input_path.is_file() - else f"{Path(self.path).relative_to(input_path)}.json" - ) - return Path(self.standard_config.output_dir) / basename - - -class LocalConnector(BaseConnector): - """Objects of this class support fetching document(s) from local file system""" - - config: SimpleLocalConfig - ingest_doc_cls: Type[LocalIngestDoc] = LocalIngestDoc - - def __init__( - self, - standard_config: StandardConnectorConfig, - config: SimpleLocalConfig, - ): - super().__init__(standard_config, config) - - def cleanup(self, cur_dir=None): - """Not applicable to local file system""" - pass - - def initialize(self): - """Not applicable to local file system""" - pass - - def _list_files(self): - if self.config.input_path_is_file: - return glob.glob(f"{self.config.input_path}") - elif self.config.recursive: - return glob.glob(f"{self.config.input_path}/**", recursive=self.config.recursive) - else: - return glob.glob(f"{self.config.input_path}/*") - - def does_path_match_glob(self, path: str) -> bool: - if self.config.file_glob is None: - return True - patterns = self.config.file_glob.split(",") - for pattern in patterns: - if fnmatch.filter([path], pattern): - return True - logger.debug(f"The file {path!r} is discarded as it does not match any given glob.") - return False - - def get_ingest_docs(self): - return [ - self.ingest_doc_cls( - self.standard_config, - self.config, - file, - ) - for file in self._list_files() - if os.path.isfile(file) and self.does_path_match_glob(file) - ] diff --git a/src/unstructured/ingest/connector/notion/__init__.py b/src/unstructured/ingest/connector/notion/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/unstructured/ingest/connector/notion/client.py b/src/unstructured/ingest/connector/notion/client.py deleted file mode 100644 index 1b8fcba..0000000 --- a/src/unstructured/ingest/connector/notion/client.py +++ /dev/null @@ -1,90 +0,0 @@ -from typing import Any, Generator, List, Tuple - -from notion_client import Client as NotionClient -from notion_client.api_endpoints import ( - BlocksChildrenEndpoint as NotionBlocksChildrenEndpoint, -) -from notion_client.api_endpoints import BlocksEndpoint as NotionBlocksEndpoint -from notion_client.api_endpoints import DatabasesEndpoint as NotionDatabasesEndpoint -from notion_client.api_endpoints import PagesEndpoint as NotionPagesEndpoint - -from unstructured.ingest.connector.notion.types.block import Block -from unstructured.ingest.connector.notion.types.database import Database -from unstructured.ingest.connector.notion.types.database_properties import ( - map_cells, -) -from unstructured.ingest.connector.notion.types.page import Page - - -class BlocksChildrenEndpoint(NotionBlocksChildrenEndpoint): - def list(self, block_id: str, **kwargs: Any) -> Tuple[List[Block], dict]: - resp: dict = super().list(block_id=block_id, **kwargs) # type: ignore - child_blocks = [Block.from_dict(data=b) for b in resp.pop("results", [])] - return child_blocks, resp - - def iterate_list( - self, - block_id: str, - **kwargs: Any, - ) -> Generator[List[Block], None, None]: - while True: - response: dict = super().list(block_id=block_id, **kwargs) # type: ignore - child_blocks = [Block.from_dict(data=b) for b in response.pop("results", [])] - yield child_blocks - - next_cursor = response.get("next_cursor") - if not response.get("has_more") or not next_cursor: - return - - -class DatabasesEndpoint(NotionDatabasesEndpoint): - 
def retrieve(self, database_id: str, **kwargs: Any) -> Database: - resp: dict = super().retrieve(database_id=database_id, **kwargs) # type: ignore - return Database.from_dict(data=resp) - - def query(self, database_id: str, **kwargs: Any) -> Tuple[List[Page], dict]: - """Get a list of [Pages](https://developers.notion.com/reference/page) contained in the database. - - *[🔗 Endpoint documentation](https://developers.notion.com/reference/post-database-query)* - """ # noqa: E501 - resp: dict = super().query(database_id=database_id, **kwargs) # type: ignore - pages = [Page.from_dict(data=p) for p in resp.pop("results")] - for p in pages: - p.properties = map_cells(p.properties) - return pages, resp - - def iterate_query(self, database_id: str, **kwargs: Any) -> Generator[List[Page], None, None]: - while True: - response: dict = super().query(database_id=database_id, **kwargs) # type: ignore - pages = [Page.from_dict(data=p) for p in response.pop("results", [])] - for p in pages: - p.properties = map_cells(p.properties) - yield pages - - next_cursor = response.get("next_cursor") - if not response.get("has_more") or not next_cursor: - return - # Advance the cursor; otherwise the same first page would be fetched forever. - kwargs["start_cursor"] = next_cursor - - -class BlocksEndpoint(NotionBlocksEndpoint): - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - self.children = BlocksChildrenEndpoint(*args, **kwargs) - - def retrieve(self, block_id: str, **kwargs: Any) -> Block: - resp: dict = super().retrieve(block_id=block_id, **kwargs) # type: ignore - return Block.from_dict(data=resp) - - -class PagesEndpoint(NotionPagesEndpoint): - def retrieve(self, page_id: str, **kwargs: Any) -> Page: - resp: dict = super().retrieve(page_id=page_id, **kwargs) # type: ignore - return Page.from_dict(data=resp) - - -class Client(NotionClient): - def __init__(self, *args: Any, **kwargs: Any) -> None: - super().__init__(*args, **kwargs) - self.blocks = BlocksEndpoint(self) - self.pages = PagesEndpoint(self) - self.databases = DatabasesEndpoint(self) diff --git a/src/unstructured/ingest/connector/notion/connector.py b/src/unstructured/ingest/connector/notion/connector.py deleted file mode 100644 index dcc571c..0000000 --- a/src/unstructured/ingest/connector/notion/connector.py +++ /dev/null @@ -1,415 +0,0 @@ -import logging -import os -from dataclasses import dataclass -from pathlib import Path -from typing import List, Optional -from uuid import UUID - -from unstructured.ingest.connector.notion.types.database import Database -from unstructured.ingest.connector.notion.types.page import Page -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import make_default_logger -from unstructured.utils import ( - requires_dependencies, -) - - -@dataclass -class SimpleNotionConfig(BaseConnectorConfig): - """Connector config to process Notion pages and databases by their ids.""" - - page_ids: List[str] - database_ids: List[str] - recursive: bool - api_key: str - verbose: bool - logger: Optional[logging.Logger] = None - - @staticmethod - def parse_ids(ids_str: str) -> List[str]: - """Parses a comma separated list of ids into a list of UUID strings.""" - return [str(UUID(x.strip())) for x in ids_str.split(",")] - - def get_logger(self) -> logging.Logger: - if self.logger: - return self.logger - return make_default_logger(logging.DEBUG if self.verbose else logging.INFO) - - -@dataclass -class NotionPageIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): -
"""Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Also includes a cleanup method. When things go wrong and the cleanup - method is not called, the file is left behind on the filesystem to assist debugging. - """ - - page_id: str - api_key: str - config: SimpleNotionConfig - file_metadata: Optional[Page] = None - file_exists: bool = False - check_exists: bool = False - - def _tmp_download_file(self): - page_file = self.page_id + ".html" - return Path(self.standard_config.download_dir) / page_file - - @property - def _output_filename(self): - page_file = self.page_id + ".json" - return Path(self.standard_config.output_dir) / page_file - - def _create_full_tmp_dir_path(self): - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @BaseIngestDoc.skip_if_file_exists - @requires_dependencies(dependencies=["notion_client"]) - def get_file(self): - from notion_client import APIErrorCode, APIResponseError - - from unstructured.ingest.connector.notion.client import Client as NotionClient - from unstructured.ingest.connector.notion.helpers import extract_page_html - - self._create_full_tmp_dir_path() - - self.config.get_logger().debug(f"fetching page {self.page_id} - PID: {os.getpid()}") - - client = NotionClient(auth=self.api_key, logger=self.config.get_logger()) - - try: - text_extraction = extract_page_html( - client=client, - page_id=self.page_id, - logger=self.config.get_logger(), - ) - self.check_exists = True - self.file_exists = True - if html := text_extraction.html: - with open(self._tmp_download_file(), "w") as page_file: - page_file.write(html.render(pretty=True)) - - except APIResponseError as error: - if error.code == APIErrorCode.ObjectNotFound: - self.check_exists = True - self.file_exists = False - else: - self.config.get_logger().error(f"Error: {error}") - - @requires_dependencies(dependencies=["notion_client"]) - def get_file_metadata(self): - from notion_client import APIErrorCode, APIResponseError - - from unstructured.ingest.connector.notion.client import Client as NotionClient - - client = NotionClient(auth=self.api_key, logger=self.config.get_logger()) - - # The Notion block endpoint gives more hierarchical information (parent,child relationships) - # than the pages endpoint so choosing to use that one to get metadata about the page - try: - self.file_metadata = client.pages.retrieve(page_id=self.page_id) # type: ignore - self.check_exists = True - self.file_exists = True - except APIResponseError as error: - if error.code == APIErrorCode.ObjectNotFound: - self.check_exists = True - self.file_exists = False - else: - self.config.get_logger().error(f"Error: {error}") - - @property - def date_created(self) -> Optional[str]: - """The date the document was created on the source system.""" - if not self.file_metadata: - self.get_file_metadata() - - return self.file_metadata.created_time if self.file_metadata else None - - @property - def date_modified(self) -> Optional[str]: - """The date the document was last modified on the source system.""" - if not self.file_metadata: - self.get_file_metadata() - - return self.file_metadata.last_edited_time if self.file_metadata else None - - @property - def exists(self) -> Optional[bool]: - """Whether the document exists on the remote source.""" - if self.check_exists: - return self.file_exists - - self.get_file_metadata() - - return self.file_exists - - @property - def filename(self): - """The filename of the file created from a notion page""" - return 
self._tmp_download_file() - - -@dataclass -class NotionDatabaseIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Also includes a cleanup method. When things go wrong and the cleanup - method is not called, the file is left behind on the filesystem to assist debugging. - """ - - database_id: str - api_key: str - config: SimpleNotionConfig - file_metadata: Optional[Database] = None - file_exists: bool = False - check_exists: bool = False - - def _tmp_download_file(self): - page_file = self.database_id + ".html" - return Path(self.standard_config.download_dir) / page_file - - @property - def _output_filename(self): - page_file = self.database_id + ".json" - return Path(self.standard_config.output_dir) / page_file - - def _create_full_tmp_dir_path(self): - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @BaseIngestDoc.skip_if_file_exists - @requires_dependencies(dependencies=["notion_client"]) - def get_file(self): - from notion_client import APIErrorCode, APIResponseError - - from unstructured.ingest.connector.notion.client import Client as NotionClient - from unstructured.ingest.connector.notion.helpers import extract_database_html - - self._create_full_tmp_dir_path() - - self.config.get_logger().debug(f"fetching database {self.database_id} - PID: {os.getpid()}") - - client = NotionClient(auth=self.api_key, logger=self.config.get_logger()) - - try: - text_extraction = extract_database_html( - client=client, - database_id=self.database_id, - logger=self.config.get_logger(), - ) - self.check_exists = True - self.file_exists = True - if html := text_extraction.html: - with open(self._tmp_download_file(), "w") as page_file: - page_file.write(html.render(pretty=True)) - - except APIResponseError as error: - if error.code == APIErrorCode.ObjectNotFound: - self.check_exists = True - self.file_exists = False - else: - self.config.get_logger().error(f"Error: {error}") - - @requires_dependencies(dependencies=["notion_client"]) - def get_file_metadata(self): - from notion_client import APIErrorCode, APIResponseError - - from unstructured.ingest.connector.notion.client import Client as NotionClient - - client = NotionClient(auth=self.api_key, logger=self.config.get_logger()) - - # The databases endpoint is used to retrieve metadata about the database, - # including the created and last-edited timestamps used below. - try: - self.file_metadata = client.databases.retrieve( - database_id=self.database_id, - ) # type: ignore - self.check_exists = True - self.file_exists = True - except APIResponseError as error: - if error.code == APIErrorCode.ObjectNotFound: - self.check_exists = True - self.file_exists = False - else: - self.config.get_logger().error(f"Error: {error}") - - @property - def date_created(self) -> Optional[str]: - """The date the document was created on the source system.""" - if not self.file_metadata: - self.get_file_metadata() - - return self.file_metadata.created_time if self.file_metadata else None - - @property - def date_modified(self) -> Optional[str]: - """The date the document was last modified on the source system.""" - if not self.file_metadata: - self.get_file_metadata() - - return self.file_metadata.last_edited_time if self.file_metadata else None - - @property - def exists(self) -> Optional[bool]: - """Whether the document exists on the remote source.""" - if self.check_exists: - return self.file_exists - -
self.get_file_metadata() - - return self.file_exists - - @property - def filename(self): - """The filename of the file created from a notion database""" - return self._tmp_download_file() - - -@requires_dependencies(dependencies=["notion_client"]) -class NotionConnector(ConnectorCleanupMixin, BaseConnector): - """Objects of this class support fetching document(s) from Notion.""" - - config: SimpleNotionConfig - - def __init__( - self, - standard_config: StandardConnectorConfig, - config: SimpleNotionConfig, - ): - super().__init__( - standard_config=standard_config, - config=config, - ) - - def initialize(self): - """Verify that we can get metadata for an object; validates connection info.""" - pass - - @requires_dependencies(dependencies=["notion_client"]) - def get_child_page_content(self, page_id: str): - from unstructured.ingest.connector.notion.client import Client as NotionClient - from unstructured.ingest.connector.notion.helpers import ( - get_recursive_content_from_page, - ) - - client = NotionClient(auth=self.config.api_key, logger=self.config.get_logger()) - - child_content = get_recursive_content_from_page( - client=client, - page_id=page_id, - logger=self.config.get_logger(), - ) - return child_content - - def get_child_content(self, page_id: str): - from unstructured.ingest.connector.notion.client import Client as NotionClient - from unstructured.ingest.connector.notion.helpers import ( - get_recursive_content_from_page, - ) - - client = NotionClient(auth=self.config.api_key, logger=self.config.get_logger()) - - child_content = get_recursive_content_from_page( - client=client, - page_id=page_id, - logger=self.config.get_logger(), - ) - return child_content - - @requires_dependencies(dependencies=["notion_client"]) - def get_child_database_content(self, database_id: str): - from unstructured.ingest.connector.notion.client import Client as NotionClient - from unstructured.ingest.connector.notion.helpers import ( - get_recursive_content_from_database, - ) - - client = NotionClient(auth=self.config.api_key, logger=self.config.get_logger()) - - child_content = get_recursive_content_from_database( - client=client, - database_id=database_id, - logger=self.config.get_logger(), - ) - return child_content - - def get_ingest_docs(self): - docs: List[BaseIngestDoc] = [] - if self.config.page_ids: - docs += [ - NotionPageIngestDoc( - standard_config=self.standard_config, - config=self.config, - page_id=page_id, - api_key=self.config.api_key, - ) - for page_id in self.config.page_ids - ] - if self.config.database_ids: - docs += [ - NotionDatabaseIngestDoc( - standard_config=self.standard_config, - config=self.config, - database_id=database_id, - api_key=self.config.api_key, - ) - for database_id in self.config.database_ids - ] - if self.config.recursive: - child_pages = [] - child_databases = [] - for page_id in self.config.page_ids: - child_content = self.get_child_page_content(page_id=page_id) - child_pages.extend(child_content.child_pages) - child_databases.extend(child_content.child_databases) - - for database_id in self.config.database_ids: - child_content = self.get_child_database_content(database_id=database_id) - child_pages.extend(child_content.child_pages) - child_databases.extend(child_content.child_databases) - - # Remove duplicates - child_pages = list(set(child_pages)) - child_pages = [c for c in child_pages if c not in self.config.page_ids] - - child_databases = list(set(child_databases)) - child_databases = [db for db in child_databases if db not in self.config.database_ids] - - if
child_pages: - self.config.get_logger().info( - "Adding the following child page ids: {}".format(", ".join(child_pages)), - ) - docs += [ - NotionPageIngestDoc( - standard_config=self.standard_config, - config=self.config, - page_id=page_id, - api_key=self.config.api_key, - ) - for page_id in child_pages - ] - - if child_databases: - self.config.get_logger().info( - "Adding the following child database ids: {}".format( - ", ".join(child_databases), - ), - ) - docs += [ - NotionDatabaseIngestDoc( - standard_config=self.standard_config, - config=self.config, - database_id=database_id, - api_key=self.config.api_key, - ) - for database_id in child_databases - ] - - return docs diff --git a/src/unstructured/ingest/connector/notion/helpers.py b/src/unstructured/ingest/connector/notion/helpers.py deleted file mode 100644 index 5bc1c84..0000000 --- a/src/unstructured/ingest/connector/notion/helpers.py +++ /dev/null @@ -1,525 +0,0 @@ -import enum -import logging -from dataclasses import dataclass, field -from typing import List, Optional, Tuple -from urllib.parse import urlparse -from uuid import UUID - -from htmlBuilder.attributes import Style, Type -from htmlBuilder.tags import ( - Body, - Div, - Head, - Html, - HtmlTag, - Ol, - Table, - Td, - Th, - Title, - Tr, - Ul, -) - -import unstructured.ingest.connector.notion.types.blocks as notion_blocks -from unstructured.ingest.connector.notion.client import Client -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.block import Block -from unstructured.ingest.connector.notion.types.database import Database - - -@dataclass -class TextExtractionResponse: - text: Optional[str] = None - child_pages: List[str] = field(default_factory=list) - child_databases: List[str] = field(default_factory=list) - - -@dataclass -class HtmlExtractionResponse: - html: Optional[HtmlTag] = None - child_pages: List[str] = field(default_factory=list) - child_databases: List[str] = field(default_factory=list) - - -def extract_page_html( - client: Client, - page_id: str, - logger: logging.Logger, -) -> HtmlExtractionResponse: - page_id_uuid = UUID(page_id) - html_elements: List[Tuple[BlockBase, HtmlTag]] = [] - parent_block: Block = client.blocks.retrieve(block_id=page_id) # type: ignore - head = None - if isinstance(parent_block.block, notion_blocks.ChildPage): - head = Head([], Title([], parent_block.block.title)) - child_pages: List[str] = [] - child_databases: List[str] = [] - parents: List[Tuple[int, Block]] = [(0, parent_block)] - processed_block_ids = [] - while len(parents) > 0: - level, parent = parents.pop(0) - parent_html = parent.get_html() - if parent_html: - html_elements.append((parent.block, parent_html)) - logger.debug(f"processing block: {parent}") - if isinstance(parent.block, notion_blocks.ChildPage) and parent.id != str(page_id_uuid): - child_pages.append(parent.id) - continue - if isinstance(parent.block, notion_blocks.ChildDatabase): - child_databases.append(parent.id) - continue - if isinstance(parent.block, notion_blocks.Table): - table_response = build_table(client=client, table=parent) - html_elements.append((parent.block, table_response.table_html)) - child_pages.extend(table_response.child_pages) - child_databases.extend(table_response.child_databases) - continue - if isinstance(parent.block, notion_blocks.ColumnList): - column_html = build_columned_list(client=client, column_parent=parent) - html_elements.append((parent.block, column_html)) - continue - if isinstance(parent.block, 
notion_blocks.BulletedListItem): - bullet_list_resp = build_bulleted_list_children( - client=client, - bulleted_list_item_parent=parent, - ) - if bullet_list_children := bullet_list_resp.child_list: - html_elements.append((parent.block, bullet_list_children)) - continue - if isinstance(parent.block, notion_blocks.NumberedListItem): - numbered_list_resp = build_numbered_list_children( - client=client, - numbered_list_item_parent=parent, - ) - if numbered_list_children := numbered_list_resp.child_list: - html_elements.append((parent.block, numbered_list_children)) - continue - if parent.block.can_have_children() and parent.has_children: - children = [] - for children_block in client.blocks.children.iterate_list( # type: ignore - block_id=parent.id, - ): - children.extend(children_block) - if children: - logger.debug(f"Adding {len(children)} children from parent: {parent}") - for child in children: - if child.id not in processed_block_ids: - parents.append((level + 1, child)) - # Record the id (not the Block object) so the membership check above matches. - processed_block_ids.append(parent.id) - - # Join list items - joined_html_elements = [] - numbered_list_items = [] - bullet_list_items = [] - for block, html in html_elements: - if isinstance(block, notion_blocks.BulletedListItem): - bullet_list_items.append(html) - continue - if isinstance(block, notion_blocks.NumberedListItem): - numbered_list_items.append(html) - continue - if len(numbered_list_items) > 0: - joined_html_elements.append(Ol([], numbered_list_items)) - numbered_list_items = [] - if len(bullet_list_items) > 0: - joined_html_elements.append(Ul([], bullet_list_items)) - bullet_list_items = [] - joined_html_elements.append(html) - - body = Body([], joined_html_elements) - all_elements = [body] - if head: - all_elements = [head] + all_elements - full_html = Html([], all_elements) - return HtmlExtractionResponse( - full_html, - child_pages=child_pages, - child_databases=child_databases, - ) - - -def extract_database_html( - client: Client, - database_id: str, - logger: logging.Logger, -) -> HtmlExtractionResponse: - logger.debug(f"processing database id: {database_id}") - database: Database = client.databases.retrieve(database_id=database_id) # type: ignore - property_keys = list(database.properties.keys()) - property_keys = sorted(property_keys) - table_html_rows = [] - child_pages: List[str] = [] - child_databases: List[str] = [] - # Create header row - table_html_rows.append(Tr([], [Th([], k) for k in property_keys])) - - all_pages = [] - for page_chunk in client.databases.iterate_query(database_id=database_id): # type: ignore - all_pages.extend(page_chunk) - - logger.debug(f"Creating {len(all_pages)} rows") - for page in all_pages: - if is_database_url(page.url): - child_databases.append(page.id) - if is_page_url(page.url): - child_pages.append(page.id) - properties = page.properties - inner_html = [properties.get(k).get_html() for k in property_keys] # type: ignore - table_html_rows.append( - Tr( - [], - [Td([], cell) for cell in [html if html else Div([], []) for html in inner_html]], - ), - ) - - table_html = Table([], table_html_rows) - - return HtmlExtractionResponse( - html=table_html, - child_pages=child_pages, - child_databases=child_databases, - ) - - -@dataclass -class ChildExtractionResponse: - child_pages: List[str] = field(default_factory=list) - child_databases: List[str] = field(default_factory=list) - - -class QueueEntryType(enum.Enum): - DATABASE = "database" - PAGE = "page" - - -@dataclass -class QueueEntry: - type: QueueEntryType - id: UUID - - -def get_recursive_content_from_page( -
client: Client, - page_id: str, - logger: logging.Logger, -) -> ChildExtractionResponse: - return get_recursive_content( - client=client, - init_entry=QueueEntry(type=QueueEntryType.PAGE, id=UUID(page_id)), - logger=logger, - ) - - -def get_recursive_content_from_database( - client: Client, - database_id: str, - logger: logging.Logger, -) -> ChildExtractionResponse: - return get_recursive_content( - client=client, - init_entry=QueueEntry(type=QueueEntryType.DATABASE, id=UUID(database_id)), - logger=logger, - ) - - -def get_recursive_content( - client: Client, - init_entry: QueueEntry, - logger: logging.Logger, -) -> ChildExtractionResponse: - parents: List[QueueEntry] = [init_entry] - child_pages = [] - child_dbs = [] - processed = [] - while len(parents) > 0: - parent: QueueEntry = parents.pop() - # Store the id as a string; the child ids compared against this list are strings. - processed.append(str(parent.id)) - if parent.type == QueueEntryType.PAGE: - logger.debug(f"Getting child data from page: {parent.id}") - for children in client.blocks.children.iterate_list( # type: ignore - block_id=str(parent.id), - ): - child_pages_from_page = [ - c for c in children if isinstance(c.block, notion_blocks.ChildPage) - ] - if child_pages_from_page: - child_page_blocks: List[notion_blocks.ChildPage] = [ - p.block - for p in child_pages_from_page - if isinstance(p.block, notion_blocks.ChildPage) - ] - logger.debug( - "found child pages from parent page {}: {}".format( - parent.id, - ", ".join([block.title for block in child_page_blocks]), - ), - ) - new_pages = [p.id for p in child_pages_from_page if p.id not in processed] - child_pages.extend(new_pages) - parents.extend( - [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], - ) - - child_dbs_from_page = [ - c for c in children if isinstance(c.block, notion_blocks.ChildDatabase) - ] - if child_dbs_from_page: - child_db_blocks: List[notion_blocks.ChildDatabase] = [ - c.block - for c in children - if isinstance(c.block, notion_blocks.ChildDatabase) - ] - logger.debug( - "found child database from parent page {}: {}".format( - parent.id, - ", ".join([block.title for block in child_db_blocks]), - ), - ) - new_dbs = [db.id for db in child_dbs_from_page if db.id not in processed] - child_dbs.extend(new_dbs) - parents.extend( - [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs], - ) - elif parent.type == QueueEntryType.DATABASE: - logger.debug(f"Getting child data from database: {parent.id}") - for page_entries in client.databases.iterate_query( # type: ignore - database_id=str(parent.id), - ): - child_pages_from_db = [p for p in page_entries if is_page_url(p.url)] - if child_pages_from_db: - logger.debug( - "found child pages from parent database {}: {}".format( - parent.id, - ", ".join([p.url for p in child_pages_from_db]), - ), - ) - new_pages = [p.id for p in child_pages_from_db if p.id not in processed] - child_pages.extend(new_pages) - parents.extend( - [QueueEntry(type=QueueEntryType.PAGE, id=UUID(i)) for i in new_pages], - ) - - child_dbs_from_db = [p for p in page_entries if is_database_url(p.url)] - if child_dbs_from_db: - logger.debug( - "found child database from parent database {}: {}".format( - parent.id, - ", ".join([db.url for db in child_dbs_from_db]), - ), - ) - new_dbs = [db.id for db in child_dbs_from_db if db.id not in processed] - child_dbs.extend(new_dbs) - parents.extend( - [QueueEntry(type=QueueEntryType.DATABASE, id=UUID(i)) for i in new_dbs], - ) - - return ChildExtractionResponse( - child_pages=child_pages, - child_databases=child_dbs, - ) - - -def is_valid_uuid(uuid_str: str)
-> bool: - try: - UUID(uuid_str) - return True - except Exception: - return False - - -def is_page_url(url: str): - parsed_url = urlparse(url) - path = parsed_url.path.split("/")[-1] - if parsed_url.netloc != "www.notion.so": - return False - if is_valid_uuid(path): - return False - strings = path.split("-") - if len(strings) > 0 and is_valid_uuid(strings[-1]): - return True - return False - - -def is_database_url(url: str): - parsed_url = urlparse(url) - path = parsed_url.path.split("/")[-1] - if parsed_url.netloc != "www.notion.so": - return False - return is_valid_uuid(path) - - -@dataclass -class BuildTableResponse: - table_html: HtmlTag - child_pages: List[str] = field(default_factory=list) - child_databases: List[str] = field(default_factory=list) - - -def build_table(client: Client, table: Block) -> BuildTableResponse: - if not isinstance(table.block, notion_blocks.Table): - raise ValueError(f"block type not table: {type(table.block)}") - rows: List[notion_blocks.TableRow] = [] - child_pages: List[str] = [] - child_databases: List[str] = [] - for row_chunk in client.blocks.children.iterate_list( # type: ignore - block_id=table.id, - ): - rows.extend( - [row.block for row in row_chunk if isinstance(row.block, notion_blocks.TableRow)], - ) - - # Extract child databases and pages - for row in rows: - for c in row.cells: - for rt in c.rich_texts: - if mention := rt.mention: - if mention.type == "page" and (page := mention.page): - child_pages.append(page.id) - if mention.type == "database" and (database := mention.database): - child_databases.append(database.id) - - header: Optional[notion_blocks.TableRow] = None - if table.block.has_column_header: - header = rows.pop(0) - table_html_rows = [] - if header: - header.is_header = True - table_html_rows.append(header.get_html()) - table_html_rows.extend([row.get_html() for row in rows]) - html_table = Table([], table_html_rows) - - return BuildTableResponse( - table_html=html_table, - child_pages=child_pages, - child_databases=child_databases, - ) - - -def build_columned_list(client: Client, column_parent: Block) -> HtmlTag: - if not isinstance(column_parent.block, notion_blocks.ColumnList): - raise ValueError(f"block type not column list: {type(column_parent.block)}") - columns: List[Block] = [] - for column_chunk in client.blocks.children.iterate_list( # type: ignore - block_id=column_parent.id, - ): - columns.extend(column_chunk) - num_columns = len(columns) - columns_content = [] - for column in columns: - for column_content_chunk in client.blocks.children.iterate_list( # type: ignore - block_id=column.id, - ): - columns_content.append( - Div( - [Style(f"width:{100/num_columns}%; float: left")], - [content.block.get_html() for content in column_content_chunk], - ), - ) - - return Div([], columns_content) - - -@dataclass -class BulletedListResponse: - html: HtmlTag - child_list: Optional[HtmlTag] = None - - -bulleted_list_styles = ["circle", "square", "disc"] - - -def build_bulleted_list_children( - client: Client, - bulleted_list_item_parent: Block, - list_style_ind: int = 0, -) -> BulletedListResponse: - if not isinstance(bulleted_list_item_parent.block, notion_blocks.BulletedListItem): - raise ValueError( - f"block type not bulleted list item: {type(bulleted_list_item_parent.block)}", - ) - html = bulleted_list_item_parent.get_html() - if html: - html.attributes = [Style("margin-left: 10px")] - if not bulleted_list_item_parent.has_children: - return BulletedListResponse( - html=html, - ) - children = [] - for child_block in 
client.blocks.children.iterate_list( # type: ignore - block_id=bulleted_list_item_parent.id, - ): - children.extend(child_block) - if not children: - return BulletedListResponse( - html=bulleted_list_item_parent.get_html(), - ) - child_html = [] - for child in children: - child_resp = build_bulleted_list_children( - client=client, - bulleted_list_item_parent=child, - list_style_ind=(list_style_ind + 1) % len(bulleted_list_styles), - ) - child_html.append(child_resp.html) - if child_children := child_resp.child_list: - child_html.append(child_children) - - return BulletedListResponse( - html=html, - child_list=Ul( - [Style(f"list-style-type: {bulleted_list_styles[list_style_ind]}")], - child_html, - ), - ) - - -@dataclass -class NumberedListResponse: - html: HtmlTag - child_list: Optional[HtmlTag] = None - - -numbered_list_types = ["a", "i", "1"] - - -def build_numbered_list_children( - client: Client, - numbered_list_item_parent: Block, - type_attr_ind=0, -) -> NumberedListResponse: - if not isinstance(numbered_list_item_parent.block, notion_blocks.NumberedListItem): - raise ValueError( - f"block type not numbered list item: {type(numbered_list_item_parent.block)}", - ) - html = numbered_list_item_parent.get_html() - if html: - html.attributes = [Style("margin-left: 10px")] - if not numbered_list_item_parent.has_children: - return NumberedListResponse( - html=html, - ) - children = [] - for child_block in client.blocks.children.iterate_list( # type: ignore - block_id=numbered_list_item_parent.id, - ): - children.extend(child_block) - if not children: - return NumberedListResponse( - html=numbered_list_item_parent.get_html(), - ) - child_html = [] - for child in children: - child_resp = build_numbered_list_children( - client=client, - numbered_list_item_parent=child, - type_attr_ind=(type_attr_ind + 1) % len(numbered_list_types), - ) - child_html.append(child_resp.html) - if child_children := child_resp.child_list: - child_html.append(child_children) - - return NumberedListResponse( - html=html, - child_list=Ol([Type(numbered_list_types[type_attr_ind])], child_html), - ) diff --git a/src/unstructured/ingest/connector/notion/interfaces.py b/src/unstructured/ingest/connector/notion/interfaces.py deleted file mode 100644 index bcfa788..0000000 --- a/src/unstructured/ingest/connector/notion/interfaces.py +++ /dev/null @@ -1,32 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Optional - -from htmlBuilder.tags import HtmlTag - - -class FromJSONMixin(ABC): - @classmethod - @abstractmethod - def from_dict(cls, data: dict): - pass - - -class GetHTMLMixin(ABC): - @abstractmethod - def get_html(self) -> Optional[HtmlTag]: - pass - - -class BlockBase(FromJSONMixin, GetHTMLMixin): - @staticmethod - @abstractmethod - def can_have_children() -> bool: - pass - - -class DBPropertyBase(FromJSONMixin): - pass - - -class DBCellBase(FromJSONMixin, GetHTMLMixin): - pass diff --git a/src/unstructured/ingest/connector/notion/types/__init__.py b/src/unstructured/ingest/connector/notion/types/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/unstructured/ingest/connector/notion/types/block.py b/src/unstructured/ingest/connector/notion/types/block.py deleted file mode 100644 index b5d36e3..0000000 --- a/src/unstructured/ingest/connector/notion/types/block.py +++ /dev/null @@ -1,93 +0,0 @@ -# https://developers.notion.com/reference/page -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import HtmlTag - -from 
unstructured.ingest.connector.notion.interfaces import ( - BlockBase, - FromJSONMixin, - GetHTMLMixin, -) -from unstructured.ingest.connector.notion.types import blocks -from unstructured.ingest.connector.notion.types.parent import Parent -from unstructured.ingest.connector.notion.types.user import PartialUser - -block_type_mapping = { - "bookmark": blocks.Bookmark, - "breadcrumb": blocks.Breadcrumb, - "bulleted_list_item": blocks.BulletedListItem, - "callout": blocks.Callout, - "child_database": blocks.ChildDatabase, - "child_page": blocks.ChildPage, - "code": blocks.Code, - "column": blocks.Column, - "column_list": blocks.ColumnList, - "divider": blocks.Divider, - "heading_1": blocks.Heading, - "heading_2": blocks.Heading, - "heading_3": blocks.Heading, - "embed": blocks.Embed, - "equation": blocks.Equation, - "file": blocks.File, - "image": blocks.Image, - "link_preview": blocks.LinkPreview, - "numbered_list_item": blocks.NumberedListItem, - "paragraph": blocks.Paragraph, - "pdf": blocks.PDF, - "quote": blocks.Quote, - "synced_block": blocks.SyncBlock, - "table": blocks.Table, - "table_of_contents": blocks.TableOfContents, - "table_row": blocks.TableRow, - "template": blocks.Template, - "to_do": blocks.ToDo, - "toggle": blocks.Toggle, - "unsupported": blocks.Unsupported, - "video": blocks.Video, -} - - -@dataclass -class Block(FromJSONMixin, GetHTMLMixin): - id: str - type: str - created_time: str - created_by: PartialUser - last_edited_time: str - last_edited_by: PartialUser - archived: bool - has_children: bool - parent: Parent - block: BlockBase - object: str = "block" - - def __repr__(self): - return f"{self.__class__.__name__}(id={self.id}, type={self.type})" - - @classmethod - def from_dict(cls, data: dict): - t = data["type"] - block_data = data.pop(t) - created_by = data.pop("created_by") - last_edited_by = data.pop("last_edited_by") - parent = data.pop("parent") - try: - block = cls( - created_by=PartialUser.from_dict(created_by), - last_edited_by=PartialUser.from_dict(last_edited_by), - parent=Parent.from_dict(parent), - block=block_type_mapping[t].from_dict(block_data), # type: ignore - **data, - ) - except KeyError as ke: - raise KeyError(f"failed to map to associated block type -> {t}: {block_data}") from ke - except TypeError as te: - raise TypeError(f"failed to map to associated block type -> {t}: {block_data}") from te - - return block - - def get_html(self) -> Optional[HtmlTag]: - if self.block: - return self.block.get_html() - return None diff --git a/src/unstructured/ingest/connector/notion/types/blocks/__init__.py b/src/unstructured/ingest/connector/notion/types/blocks/__init__.py deleted file mode 100644 index 14e0467..0000000 --- a/src/unstructured/ingest/connector/notion/types/blocks/__init__.py +++ /dev/null @@ -1,61 +0,0 @@ -from .bookmark import Bookmark -from .breadcrumb import Breadcrumb -from .bulleted_list_item import BulletedListItem -from .callout import Callout -from .child_database import ChildDatabase -from .child_page import ChildPage -from .code import Code -from .column_list import Column, ColumnList -from .divider import Divider -from .embed import Embed -from .equation import Equation -from .file import File -from .heading import Heading -from .image import Image -from .link_preview import LinkPreview -from .numbered_list import NumberedListItem -from .paragraph import Paragraph -from .pdf import PDF -from .quote import Quote -from .synced_block import DuplicateSyncedBlock, OriginalSyncedBlock, SyncBlock -from .table import Table, TableRow -from 
.table_of_contents import TableOfContents -from .template import Template -from .todo import ToDo -from .toggle import Toggle -from .unsupported import Unsupported -from .video import Video - -__all__ = [ - "Bookmark", - "Breadcrumb", - "BulletedListItem", - "Callout", - "ChildDatabase", - "ChildPage", - "Code", - "Column", - "ColumnList", - "Divider", - "Embed", - "Equation", - "File", - "Heading", - "Image", - "LinkPreview", - "NumberedListItem", - "Paragraph", - "PDF", - "Quote", - "SyncBlock", - "OriginalSyncedBlock", - "DuplicateSyncedBlock", - "Table", - "TableRow", - "TableOfContents", - "Template", - "ToDo", - "Toggle", - "Unsupported", - "Video", -] diff --git a/src/unstructured/ingest/connector/notion/types/blocks/bookmark.py b/src/unstructured/ingest/connector/notion/types/blocks/bookmark.py deleted file mode 100644 index 4680447..0000000 --- a/src/unstructured/ingest/connector/notion/types/blocks/bookmark.py +++ /dev/null @@ -1,40 +0,0 @@ -# https://developers.notion.com/reference/block#bookmark -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.attributes import Href -from htmlBuilder.tags import A, Br, Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class Bookmark(BlockBase): - url: str - caption: List[RichText] = field(default_factory=list) - - @classmethod - def from_dict(cls, data: dict): - captions = data.pop("caption", []) - return cls( - url=data["url"], - caption=[RichText.from_dict(c) for c in captions], - ) - - def get_html(self) -> Optional[HtmlTag]: - texts = [] - if self.url: - texts.append(A([Href(self.url)], self.url)) - if self.caption: - texts.append(Div([], [rt.get_html() for rt in self.caption])) - if not texts: - return None - joined = [Br()] * (len(texts) * 2 - 1) - joined[0::2] = texts - - return Div([], joined) - - @staticmethod - def can_have_children() -> bool: - return False diff --git a/src/unstructured/ingest/connector/notion/types/blocks/breadcrumb.py b/src/unstructured/ingest/connector/notion/types/blocks/breadcrumb.py deleted file mode 100644 index d6b1626..0000000 --- a/src/unstructured/ingest/connector/notion/types/blocks/breadcrumb.py +++ /dev/null @@ -1,21 +0,0 @@ -# https://developers.notion.com/reference/block#breadcrumb -from dataclasses import dataclass -from typing import Optional - -from htmlBuilder.tags import HtmlTag - -from unstructured.ingest.connector.notion.interfaces import BlockBase - - -@dataclass -class Breadcrumb(BlockBase): - @staticmethod - def can_have_children() -> bool: - return False - - @classmethod - def from_dict(cls, data: dict): - return cls() - - def get_html(self) -> Optional[HtmlTag]: - pass diff --git a/src/unstructured/ingest/connector/notion/types/blocks/bulleted_list_item.py b/src/unstructured/ingest/connector/notion/types/blocks/bulleted_list_item.py deleted file mode 100644 index 5db911d..0000000 --- a/src/unstructured/ingest/connector/notion/types/blocks/bulleted_list_item.py +++ /dev/null @@ -1,31 +0,0 @@ -# https://developers.notion.com/reference/block#bulleted-list-item -from dataclasses import dataclass, field -from typing import List, Optional - -from htmlBuilder.tags import HtmlTag, Li - -from unstructured.ingest.connector.notion.interfaces import BlockBase -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class BulletedListItem(BlockBase): - color: str - children: List[dict] = 
field(default_factory=list) - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - return cls( - color=data["color"], - children=data.get("children", []), - rich_text=[RichText.from_dict(rt) for rt in rich_text], - ) - - def get_html(self) -> Optional[HtmlTag]: - return Li([], [rt.get_html() for rt in self.rich_text]) diff --git a/src/unstructured/ingest/connector/notion/types/blocks/callout.py b/src/unstructured/ingest/connector/notion/types/blocks/callout.py deleted file mode 100644 index 6ea2bb1..0000000 --- a/src/unstructured/ingest/connector/notion/types/blocks/callout.py +++ /dev/null @@ -1,94 +0,0 @@ -# https://developers.notion.com/reference/block#callout -from dataclasses import dataclass, field -from typing import List, Optional, Union - -from htmlBuilder.attributes import Href, Style -from htmlBuilder.tags import A, Div, HtmlTag, P - -from unstructured.ingest.connector.notion.interfaces import ( - BlockBase, - FromJSONMixin, - GetHTMLMixin, -) -from unstructured.ingest.connector.notion.types.rich_text import RichText - - -@dataclass -class EmojiIcon(FromJSONMixin, GetHTMLMixin): - emoji: str - type: str = "emoji" - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_html(self) -> Optional[HtmlTag]: - return P([], self.emoji) - - -@dataclass -class ExternalIconContent(FromJSONMixin): - url: str - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class ExternalIcon(FromJSONMixin, GetHTMLMixin): - external: ExternalIconContent - type: str = "external" - - @classmethod - def from_dict(cls, data: dict): - return cls(external=ExternalIconContent.from_dict(data=data.pop("external")), **data) - - def get_html(self) -> Optional[HtmlTag]: - if self.external: - return A([Href(self.external.url)], [self.external.url]) - else: - return None - - -class Icon(FromJSONMixin): - @classmethod - def from_dict(cls, data: dict) -> Union[EmojiIcon, ExternalIcon]: - t = data.get("type") - if t == "emoji": - return EmojiIcon.from_dict(data) - elif t == "external": - return ExternalIcon.from_dict(data) - else: - raise ValueError(f"Unexpected icon type: {t} ({data})") - - -@dataclass -class Callout(BlockBase): - color: str - icon: Optional[Union[EmojiIcon, ExternalIcon]] = None - rich_text: List[RichText] = field(default_factory=list) - - @staticmethod - def can_have_children() -> bool: - return True - - @classmethod - def from_dict(cls, data: dict): - rich_text = data.pop("rich_text", []) - return cls( - color=data["color"], - icon=Icon.from_dict(data.pop("icon")), - rich_text=[RichText.from_dict(rt) for rt in rich_text], - ) - - def get_html(self) -> Optional[HtmlTag]: - elements = [] - if self.icon and self.icon.get_html(): - elements.append(self.icon.get_html()) - if self.rich_text: - elements.extend([rt.get_html() for rt in self.rich_text]) - attributes = [] - if self.color: - attributes.append(Style(f"color:{self.color}")) - return Div(attributes, elements) diff --git a/src/unstructured/ingest/connector/notion/types/blocks/child_database.py b/src/unstructured/ingest/connector/notion/types/blocks/child_database.py deleted file mode 100644 index 578b400..0000000 --- a/src/unstructured/ingest/connector/notion/types/blocks/child_database.py +++ /dev/null @@ -1,23 +0,0 @@ -# https://developers.notion.com/reference/block#child-database -from dataclasses import dataclass -from 
typing import Optional
-
-from htmlBuilder.tags import HtmlTag, P
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-
-
-@dataclass
-class ChildDatabase(BlockBase):
-    title: str
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return P([], self.title)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/child_page.py b/src/unstructured/ingest/connector/notion/types/blocks/child_page.py
deleted file mode 100644
index 6ee6f90..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/child_page.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# https://developers.notion.com/reference/block#child-page
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.tags import HtmlTag, P
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase, GetHTMLMixin
-
-
-@dataclass
-class ChildPage(BlockBase, GetHTMLMixin):
-    title: str
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return P([], self.title)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/code.py b/src/unstructured/ingest/connector/notion/types/blocks/code.py
deleted file mode 100644
index 3a6d80e..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/code.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# https://developers.notion.com/reference/block#code
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.tags import Br, Div, HtmlTag
-from htmlBuilder.tags import Code as HtmlCode
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class Code(BlockBase):
-    language: str
-    rich_text: List[RichText] = field(default_factory=list)
-    caption: List[RichText] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        rich_text = data.pop("rich_text", [])
-        caption = data.pop("caption", [])
-        return cls(
-            language=data["language"],
-            rich_text=[RichText.from_dict(rt) for rt in rich_text],
-            caption=[RichText.from_dict(c) for c in caption],
-        )
-
-    def get_html(self) -> Optional[HtmlTag]:
-        texts = []
-        if self.rich_text:
-            texts.append(HtmlCode([], [rt.get_html() for rt in self.rich_text]))
-        if self.caption:
-            texts.append(Div([], [rt.get_html() for rt in self.caption]))
-        if not texts:
-            return None
-        joined = [Br()] * (len(texts) * 2 - 1)
-        joined[0::2] = texts
-
-        return Div([], joined)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/column_list.py b/src/unstructured/ingest/connector/notion/types/blocks/column_list.py
deleted file mode 100644
index d2df367..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/column_list.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# https://developers.notion.com/reference/block#column-list-and-column
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.tags import HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-
-
-@dataclass
-class ColumnList(BlockBase):
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls()
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return None
-
-
-@dataclass
-class Column(BlockBase):
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls()
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/divider.py b/src/unstructured/ingest/connector/notion/types/blocks/divider.py
deleted file mode 100644
index 33fc01e..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/divider.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# https://developers.notion.com/reference/block#divider
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.attributes import Style
-from htmlBuilder.tags import Hr, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-
-
-@dataclass
-class Divider(BlockBase):
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls()
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return Hr([Style("border-top: 3px solid #bbb")])
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/embed.py b/src/unstructured/ingest/connector/notion/types/blocks/embed.py
deleted file mode 100644
index 561fe82..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/embed.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# https://developers.notion.com/reference/block#embed
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.attributes import Href
-from htmlBuilder.tags import A, Br, Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class Embed(BlockBase):
-    url: str
-    caption: List[RichText] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(caption=[RichText.from_dict(d) for d in data.pop("caption", [])], **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        texts = []
-        if self.url:
-            texts.append(A([Href(self.url)], self.url))
-        if self.caption:
-            texts.append(Div([], [rt.get_html() for rt in self.caption]))
-        if not texts:
-            return None
-        joined = [Br()] * (len(texts) * 2 - 1)
-        joined[0::2] = texts
-
-        return Div([], joined)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/equation.py b/src/unstructured/ingest/connector/notion/types/blocks/equation.py
deleted file mode 100644
index ccab3d0..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/equation.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# https://developers.notion.com/reference/block#equation
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-
-
-@dataclass
-class Equation(BlockBase):
-    expression: str
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return Div([], self.expression)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/file.py b/src/unstructured/ingest/connector/notion/types/blocks/file.py
deleted file mode 100644
index ad7fe54..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/file.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# https://developers.notion.com/reference/block#file
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.attributes import Href
-from htmlBuilder.tags import A, Br, Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.file import External
-from unstructured.ingest.connector.notion.types.file import File as FileContent
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class File(BlockBase):
-    type: str
-    external: Optional[External] = None
-    file: Optional[FileContent] = None
-    caption: List[RichText] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        caption = [RichText.from_dict(rt) for rt in data.pop("caption", [])]
-        t = data["type"]
-        file = cls(type=t, caption=caption)
-        if t == "external":
-            file.external = External.from_dict(data["external"])
-        elif t == "file":
-            file.file = FileContent.from_dict(data["file"])
-        return file
-
-    def get_html(self) -> Optional[HtmlTag]:
-        texts = []
-        if self.file:
-            texts.append(A([Href(self.file.url)], self.file.url))
-        if self.external:
-            texts.append(A([Href(self.external.url)], self.external.url))
-        if self.caption:
-            texts.append(Div([], [rt.get_html() for rt in self.caption]))
-        if not texts:
-            return None
-        joined = [Br()] * (len(texts) * 2 - 1)
-        joined[0::2] = texts
-
-        return Div([], joined)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/heading.py b/src/unstructured/ingest/connector/notion/types/blocks/heading.py
deleted file mode 100644
index 86983f5..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/heading.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# https://developers.notion.com/reference/block#headings
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.attributes import Style
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class Heading(BlockBase):
-    color: str
-    is_toggleable: bool
-    rich_text: List[RichText] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        rich_text = data.pop("rich_text", [])
-        heading = cls(**data)
-        heading.rich_text = [RichText.from_dict(rt) for rt in rich_text]
-        return heading
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.rich_text:
-            return None
-
-        texts = [rt.get_html() for rt in self.rich_text]
-        attributes = []
-        if self.color and self.color != "default":
-            attributes.append(Style(f"color: {self.color}"))
-        return Div(attributes, texts)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/image.py b/src/unstructured/ingest/connector/notion/types/blocks/image.py
deleted file mode 100644
index d9c5203..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/image.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# https://developers.notion.com/reference/block#image
-from typing import Optional
-
-from htmlBuilder.attributes import Src
-from htmlBuilder.tags import HtmlTag, Img
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.file import FileObject
-
-
-class Image(BlockBase, FileObject):
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if self.external:
-            return Img([Src(self.external.url)], [])
-        if self.file:
-            return Img([Src(self.file.url)], [])
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/link_preview.py b/src/unstructured/ingest/connector/notion/types/blocks/link_preview.py
deleted file mode 100644
index 913df1f..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/link_preview.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# https://developers.notion.com/reference/block#link-preview
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.attributes import Href
-from htmlBuilder.tags import A, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-
-
-@dataclass
-class LinkPreview(BlockBase):
-    url: str
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return A([Href(self.url)], self.url)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/numbered_list.py b/src/unstructured/ingest/connector/notion/types/blocks/numbered_list.py
deleted file mode 100644
index b0051bc..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/numbered_list.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# https://developers.notion.com/reference/block#numbered-list-item
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.tags import HtmlTag, Li
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class NumberedListItem(BlockBase):
-    color: str
-    children: List[dict] = field(default_factory=list)
-    rich_text: List[RichText] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        rich_text = data.pop("rich_text", [])
-        numbered_list = cls(**data)
-        numbered_list.rich_text = [RichText.from_dict(rt) for rt in rich_text]
-        return numbered_list
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return Li([], [rt.get_html() for rt in self.rich_text])
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/paragraph.py b/src/unstructured/ingest/connector/notion/types/blocks/paragraph.py
deleted file mode 100644
index bc31e4c..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/paragraph.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# https://developers.notion.com/reference/block#paragraph
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.tags import Br, Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class Paragraph(BlockBase):
-    color: str
-    children: List[dict] = field(default_factory=list)
-    rich_text: List[RichText] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        rich_text = data.pop("rich_text", [])
-        paragraph = cls(**data)
-        paragraph.rich_text = [RichText.from_dict(rt) for rt in rich_text]
-        return paragraph
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.rich_text:
-            return Br()
-        return Div([], [rt.get_html() for rt in self.rich_text])
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/pdf.py b/src/unstructured/ingest/connector/notion/types/blocks/pdf.py
deleted file mode 100644
index 61ef3a8..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/pdf.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# https://developers.notion.com/reference/block#pdf
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.attributes import Href
-from htmlBuilder.tags import A, Br, Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.file import External, File
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class PDF(BlockBase):
-    type: str
-    caption: List[RichText] = field(default_factory=list)
-    external: Optional[External] = None
-    file: Optional[File] = None
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        caption = data.pop("caption", [])
-        t = data["type"]
-        paragraph = cls(type=t)
-        paragraph.caption = [RichText.from_dict(c) for c in caption]
-        if t == "external":
-            paragraph.external = External.from_dict(data["external"])
-        elif t == "file":
-            paragraph.file = File.from_dict(data["file"])
-        return paragraph
-
-    def get_html(self) -> Optional[HtmlTag]:
-        texts = []
-        if self.external:
-            texts.append(A([Href(self.external.url)], self.external.url))
-        if self.file:
-            texts.append(A([Href(self.file.url)], self.file.url))
-        if self.caption:
-            texts.append(Div([], [rt.get_html() for rt in self.caption]))
-        if not texts:
-            return None
-        joined = [Br()] * (len(texts) * 2 - 1)
-        joined[0::2] = texts
-
-        return Div([], joined)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/quote.py b/src/unstructured/ingest/connector/notion/types/blocks/quote.py
deleted file mode 100644
index 1469f1d..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/quote.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# https://developers.notion.com/reference/block#quote
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.attributes import Style
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class Quote(BlockBase):
-    color: str
-    children: List[dict] = field(default_factory=list)
-    rich_text: List[RichText] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        rich_text = data.pop("rich_text", [])
-        quote = cls(**data)
-        quote.rich_text = [RichText.from_dict(rt) for rt in rich_text]
-        return quote
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.rich_text:
-            return None
-
-        texts = [rt.get_html() for rt in self.rich_text]
-        attributes = []
-        if self.color and self.color != "default":
-            attributes.append(Style(f"color: {self.color}"))
-        return Div(attributes, texts)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/synced_block.py b/src/unstructured/ingest/connector/notion/types/blocks/synced_block.py
deleted file mode 100644
index b4cd2da..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/synced_block.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# https://developers.notion.com/reference/block#synced-block
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.tags import HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-
-
-@dataclass
-class OriginalSyncedBlock(BlockBase):
-    synced_from: Optional[str] = None
-    children: List[dict] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(children=data["children"])
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return None
-
-
-@dataclass
-class DuplicateSyncedBlock(BlockBase):
-    type: str
-    block_id: str
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return None
-
-
-class SyncBlock(BlockBase):
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        if "synced_from" in data:
-            return OriginalSyncedBlock.from_dict(data)
-        else:
-            return DuplicateSyncedBlock.from_dict(data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/table.py b/src/unstructured/ingest/connector/notion/types/blocks/table.py
deleted file mode 100644
index 7858275..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/table.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# https://developers.notion.com/reference/block#table
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.tags import HtmlTag, Td, Th, Tr
-
-from unstructured.ingest.connector.notion.interfaces import (
-    BlockBase,
-    FromJSONMixin,
-)
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class Table(BlockBase):
-    table_width: int
-    has_column_header: bool
-    has_row_header: bool
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return None
-
-
-@dataclass
-class TableCell(FromJSONMixin):
-    rich_texts: List[RichText]
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(rich_texts=[RichText.from_dict(rt) for rt in data.pop("rich_texts", [])])
-
-    def get_html(self, is_header: bool) -> Optional[HtmlTag]:
-        if is_header:
-            return Th([], [rt.get_html() for rt in self.rich_texts])
-        else:
-            return Td([], [rt.get_html() for rt in self.rich_texts])
-
-
-# https://developers.notion.com/reference/block#table-rows
-@dataclass
-class TableRow(BlockBase):
-    is_header: bool = False
-    cells: List[TableCell] = field(default_factory=list)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        cells = data.get("cells", [])
-        return cls(cells=[TableCell.from_dict({"rich_texts": c}) for c in cells])
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return Tr([], [cell.get_html(is_header=self.is_header) for cell in self.cells])
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/table_of_contents.py b/src/unstructured/ingest/connector/notion/types/blocks/table_of_contents.py
deleted file mode 100644
index f753f60..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/table_of_contents.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# https://developers.notion.com/reference/block#table-of-contents
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.tags import HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-
-
-@dataclass
-class TableOfContents(BlockBase):
-    color: str
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/template.py b/src/unstructured/ingest/connector/notion/types/blocks/template.py
deleted file mode 100644
index 4505687..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/template.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# https://developers.notion.com/reference/block#template
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class Template(BlockBase):
-    children: List[dict] = field(default_factory=list)
-    rich_text: List[RichText] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        rich_text = data.pop("rich_text", [])
-        template = cls(**data)
-        template.rich_text = [RichText.from_dict(rt) for rt in rich_text]
-        return template
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.rich_text:
-            return None
-        return Div([], [rt.get_html() for rt in self.rich_text])
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/todo.py b/src/unstructured/ingest/connector/notion/types/blocks/todo.py
deleted file mode 100644
index 3e03b2c..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/todo.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# https://developers.notion.com/reference/block#to-do
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.attributes import Checked, Style, Type
-from htmlBuilder.tags import Div, HtmlTag, Input
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class ToDo(BlockBase):
-    color: str
-    checked: bool = False
-    rich_text: List[RichText] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        rich_text = data.pop("rich_text", [])
-        todo = cls(**data)
-        todo.rich_text = [RichText.from_dict(rt) for rt in rich_text]
-        return todo
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.rich_text:
-            return None
-
-        elements = []
-        check_input_attributes = [Type("checkbox")]
-        if self.checked:
-            check_input_attributes.append(Checked(""))
-        elements.append(Input(check_input_attributes))
-        elements.extend([rt.get_html() for rt in self.rich_text])
-        attributes = []
-        if self.color and self.color != "default":
-            attributes.append(Style(f"color: {self.color}"))
-        return Div(attributes, elements)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/toggle.py b/src/unstructured/ingest/connector/notion/types/blocks/toggle.py
deleted file mode 100644
index 8619eb7..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/toggle.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# https://developers.notion.com/reference/block#toggle-blocks
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.attributes import Style
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class Toggle(BlockBase):
-    color: str
-    children: List[dict] = field(default_factory=list)
-    rich_text: List[RichText] = field(default_factory=list)
-
-    @staticmethod
-    def can_have_children() -> bool:
-        return True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        rich_text = data.pop("rich_text", [])
-        toggle = cls(**data)
-        toggle.rich_text = [RichText.from_dict(rt) for rt in rich_text]
-        return toggle
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.rich_text:
-            return None
-
-        texts = [rt.get_html() for rt in self.rich_text]
-        attributes = []
-        if self.color and self.color != "default":
-            attributes.append(Style(f"color: {self.color}"))
-        return Div(attributes, texts)
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/unsupported.py b/src/unstructured/ingest/connector/notion/types/blocks/unsupported.py
deleted file mode 100644
index 6e28b8c..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/unsupported.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.tags import HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-
-
-@dataclass
-class Unsupported(BlockBase):
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls()
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/blocks/video.py b/src/unstructured/ingest/connector/notion/types/blocks/video.py
deleted file mode 100644
index 2523adf..0000000
--- a/src/unstructured/ingest/connector/notion/types/blocks/video.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# https://developers.notion.com/reference/block#image
-from typing import Optional
-
-from htmlBuilder.attributes import Src
-from htmlBuilder.tags import HtmlTag, Source
-from htmlBuilder.tags import Video as VideoHtml
-
-from unstructured.ingest.connector.notion.interfaces import BlockBase
-from unstructured.ingest.connector.notion.types.file import FileObject
-
-
-class Video(BlockBase, FileObject):
-    @staticmethod
-    def can_have_children() -> bool:
-        return False
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if self.external:
-            return VideoHtml([], [Source([Src(self.external.url)], [self.external.url])])
-        if self.file:
-            return VideoHtml([], [Source([Src(self.file.url)], [self.file.url])])
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/database.py b/src/unstructured/ingest/connector/notion/types/database.py
deleted file mode 100644
index b2372a7..0000000
--- a/src/unstructured/ingest/connector/notion/types/database.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# https://developers.notion.com/reference/database
-from dataclasses import dataclass, field
-from typing import Dict, List, Optional
-
-from htmlBuilder.tags import Div, HtmlTag, Span
-
-from unstructured.ingest.connector.notion.interfaces import (
-    DBPropertyBase,
-    FromJSONMixin,
-    GetHTMLMixin,
-)
-from unstructured.ingest.connector.notion.types.database_properties import (
-    map_properties,
-)
-from unstructured.ingest.connector.notion.types.file import FileObject
-from unstructured.ingest.connector.notion.types.parent import Parent
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-from unstructured.ingest.connector.notion.types.user import PartialUser
-
-
-@dataclass
-class Database(FromJSONMixin, GetHTMLMixin):
-    id: str
-    created_time: str
-    created_by: PartialUser
-    last_edited_time: str
-    last_edited_by: PartialUser
-    archived: bool
-    parent: Parent
-    url: str
-    is_inline: bool
-    public_url: str
-    properties: Dict[str, DBPropertyBase] = field(default_factory=dict)
-    title: List[RichText] = field(default_factory=list)
-    description: List[RichText] = field(default_factory=list)
-    icon: Optional[FileObject] = None
-    cover: Optional[FileObject] = None
-    object: str = "database"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        created_by = data.pop("created_by")
-        last_edited_by = data.pop("last_edited_by")
-        icon = data.pop("icon")
-        cover = data.pop("cover")
-        parent = data.pop("parent")
-        title = data.pop("title")
-        description = data.pop("description")
-        page = cls(
-            properties=map_properties(data.pop("properties", {})),
-            created_by=PartialUser.from_dict(created_by),
-            last_edited_by=PartialUser.from_dict(last_edited_by),
-            icon=FileObject.from_dict(icon) if icon else None,
-            cover=FileObject.from_dict(cover) if cover else None,
-            parent=Parent.from_dict(parent),
-            title=[RichText.from_dict(data=r) for r in title],
-            description=[RichText.from_dict(data=r) for r in description],
-            **data,
-        )
-
-        return page
-
-    def get_html(self) -> Optional[HtmlTag]:
-        spans = []
-        if title := self.title:
-            spans.append(Span([], [rt.get_html() for rt in title]))
-        if description := self.description:
-            spans.append(Span([], [rt.get_html() for rt in description]))
-        if spans:
-            return Div([], spans)
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/__init__.py b/src/unstructured/ingest/connector/notion/types/database_properties/__init__.py
deleted file mode 100644
index 1001113..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/__init__.py
+++ /dev/null
@@ -1,106 +0,0 @@
-from typing import Dict
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-
-from .checkbox import Checkbox, CheckboxCell
-from .created_by import CreatedBy, CreatedByCell
-from .created_time import CreatedTime, CreatedTimeCell
-from .date import Date, DateCell
-from .email import Email, EmailCell
-from .files import Files, FilesCell
-from .formula import Formula, FormulaCell
-from .last_edited_by import LastEditedBy, LastEditedByCell
-from .last_edited_time import LastEditedTime, LastEditedTimeCell
-from .multiselect import MultiSelect, MultiSelectCell
-from .number import Number, NumberCell
-from .people import People, PeopleCell
-from .phone_number import PhoneNumber, PhoneNumberCell
-from .relation import Relation, RelationCell
-from .rich_text import RichText, RichTextCell
-from .rollup import Rollup, RollupCell
-from .select import Select, SelectCell
-from .status import Status, StatusCell
-from .title import Title, TitleCell
-from .unique_id import UniqueID, UniqueIDCell
-from .url import URL, URLCell
-from .verification import Verification, VerificationCell
-
-db_prop_type_mapping = {
-    "checkbox": Checkbox,
-    "created_by": CreatedBy,
-    "created_time": CreatedTime,
-    "date": Date,
-    "email": Email,
-    "files": Files,
-    "formula": Formula,
-    "last_edited_by": LastEditedBy,
-    "last_edited_time": LastEditedTime,
-    "multi_select": MultiSelect,
-    "number": Number,
-    "people": People,
-    "phone_number": PhoneNumber,
-    "relation": Relation,
-    "rich_text": RichText,
-    "rollup": Rollup,
-    "select": Select,
-    "status": Status,
-    "title": Title,
-    "unique_id": UniqueID,
-    "url": URL,
-    "verification": Verification,
-}
-
-
-def map_properties(props: Dict[str, dict]) -> Dict[str, DBPropertyBase]:
-    mapped_dict = {}
-    for k, v in props.items():
-        try:
-            mapped_dict[k] = db_prop_type_mapping[v["type"]].from_dict(v)  # type: ignore
-        except KeyError as ke:
-            raise KeyError(f"failed to map to associated database property -> {k}: {v}") from ke
-
-    return mapped_dict
-
-
-db_cell_type_mapping = {
-    "checkbox": CheckboxCell,
-    "created_by": CreatedByCell,
-    "created_time": CreatedTimeCell,
-    "date": DateCell,
-    "email": EmailCell,
-    "files": FilesCell,
-    "formula": FormulaCell,
-    "last_edited_by": LastEditedByCell,
-    "last_edited_time": LastEditedTimeCell,
-    "multi_select": MultiSelectCell,
-    "number": NumberCell,
-    "people": PeopleCell,
-    "phone_number": PhoneNumberCell,
-    "relation": RelationCell,
-    "rich_text": RichTextCell,
-    "rollup": RollupCell,
-    "select": SelectCell,
-    "status": StatusCell,
-    "title": TitleCell,
-    "unique_id": UniqueIDCell,
-    "url": URLCell,
-    "verification": VerificationCell,
-}
-
-
-def map_cells(props: Dict[str, dict]) -> Dict[str, DBCellBase]:
-    mapped_dict = {}
-    for k, v in props.items():
-        try:
-            t = v["type"]
-            mapped_dict[k] = db_cell_type_mapping[t].from_dict(v)  # type: ignore
-        except KeyError as ke:
-            raise KeyError(f"failed to map to associated database property -> {k}: {v}") from ke
-
-    return mapped_dict
-
-
-__all__ = [
-    "map_properties",
-    "map_cells",
-]
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/checkbox.py b/src/unstructured/ingest/connector/notion/types/database_properties/checkbox.py
deleted file mode 100644
index b60d187..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/checkbox.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# https://developers.notion.com/reference/property-object#checkbox
-from dataclasses import dataclass, field
-from typing import Optional
-
-from htmlBuilder.attributes import Checked, Type
-from htmlBuilder.tags import Div, HtmlTag, Input
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-
-
-@dataclass
-class Checkbox(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "checkbox"
-    checkbox: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class CheckboxCell(DBCellBase):
-    id: str
-    checkbox: bool
-    name: Optional[str] = None
-    type: str = "checkbox"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        check_input_attributes = [Type("checkbox")]
-        if self.checkbox:
-            check_input_attributes.append(Checked(""))
-        return Div([], Input(check_input_attributes))
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/created_by.py b/src/unstructured/ingest/connector/notion/types/database_properties/created_by.py
deleted file mode 100644
index 034b0c1..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/created_by.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# https://developers.notion.com/reference/property-object#created-by
-from dataclasses import dataclass, field
-from typing import Optional
-
-from htmlBuilder.tags import HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-from unstructured.ingest.connector.notion.types.user import People
-
-
-@dataclass
-class CreatedBy(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "created_by"
-    created_by: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class CreatedByCell(DBCellBase):
-    id: str
-    created_by: People
-    type: str = "created_by"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(created_by=People.from_dict(data.pop("created_by")), **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return self.created_by.get_html()
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/created_time.py b/src/unstructured/ingest/connector/notion/types/database_properties/created_time.py
deleted file mode 100644
index 86c1173..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/created_time.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# https://developers.notion.com/reference/property-object#created-time
-from dataclasses import dataclass, field
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-
-
-@dataclass
-class CreatedTime(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "created_time"
-    created_time: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class CreatedTimeCell(DBCellBase):
-    id: str
-    created_time: str
-    type: str = "created_time"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return Div([], self.created_time)
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/date.py b/src/unstructured/ingest/connector/notion/types/database_properties/date.py
deleted file mode 100644
index 779ef60..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/date.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# https://developers.notion.com/reference/property-object#date
-from dataclasses import dataclass, field
-from typing import Optional
-
-from htmlBuilder.tags import HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-from unstructured.ingest.connector.notion.types.date import Date as DateType
-
-
-@dataclass
-class Date(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "date"
-    date: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class DateCell(DBCellBase):
-    id: str
-    date: Optional[DateType] = None
-    name: Optional[str] = None
-    type: str = "date"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        date = None
-        date_data = data.pop("date")
-        if date_data:
-            date = DateType.from_dict(date_data)
-        return cls(date=date, **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if date := self.date:
-            return date.get_html()
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/email.py b/src/unstructured/ingest/connector/notion/types/database_properties/email.py
deleted file mode 100644
index 1303770..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/email.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# https://developers.notion.com/reference/property-object#email
-from dataclasses import dataclass, field
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-
-
-@dataclass
-class Email(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "email"
-    email: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class EmailCell(DBCellBase):
-    id: str
-    email: str
-    name: Optional[str] = None
-    type: str = "email"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if email := self.email:
-            return Div([], email)
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/files.py b/src/unstructured/ingest/connector/notion/types/database_properties/files.py
deleted file mode 100644
index 680ee15..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/files.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# https://developers.notion.com/reference/property-object#files
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-from unstructured.ingest.connector.notion.types.file import FileObject
-
-
-@dataclass
-class Files(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "files"
-    files: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class FilesCell(DBCellBase):
-    id: str
-    files: List[FileObject]
-    type: str = "files"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(files=[FileObject.from_dict(f) for f in data.pop("files", [])], **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.files:
-            return None
-        return Div([], [f.get_html() for f in self.files])
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/formula.py b/src/unstructured/ingest/connector/notion/types/database_properties/formula.py
deleted file mode 100644
index b192136..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/formula.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# https://developers.notion.com/reference/property-object#formula
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import (
-    DBCellBase,
-    DBPropertyBase,
-    FromJSONMixin,
-)
-
-
-@dataclass
-class FormulaProp(FromJSONMixin):
-    expression: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class Formula(DBPropertyBase):
-    id: str
-    name: str
-    formula: FormulaProp
-    type: str = "formula"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(formula=FormulaProp.from_dict(data.pop("formula", {})), **data)
-
-
-@dataclass
-class FormulaCell(DBCellBase):
-    id: str
-    formula: dict
-    type: str = "formula"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        formula = self.formula
-        t = formula.get("type")
-        return Div([], str(formula[t]))
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/last_edited_by.py b/src/unstructured/ingest/connector/notion/types/database_properties/last_edited_by.py
deleted file mode 100644
index a1a2d0a..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/last_edited_by.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# https://developers.notion.com/reference/property-object#last-edited-by
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.tags import HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-from unstructured.ingest.connector.notion.types.user import People
-
-
-@dataclass
-class LastEditedBy(DBPropertyBase):
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls()
-
-    def get_text(self) -> Optional[str]:
-        return None
-
-
-@dataclass
-class LastEditedByCell(DBCellBase):
-    id: str
-    last_edited_by: People
-    type: str = "last_edited_by"
-
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(last_edited_by=People.from_dict(data.pop("last_edited_by", {})), **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return self.last_edited_by.get_html()
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/last_edited_time.py b/src/unstructured/ingest/connector/notion/types/database_properties/last_edited_time.py
deleted file mode 100644
index 4c9e009..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/last_edited_time.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# https://developers.notion.com/reference/property-object#last-edited-time
-from dataclasses import dataclass, field
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-
-
-@dataclass
-class LastEditedTime(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "last_edited_time"
-    last_edited_time: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class LastEditedTimeCell(DBCellBase):
-    id: str
-    last_edited_time: str
-    type: str = "last_edited_time"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return Div([], self.last_edited_time)
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/multiselect.py b/src/unstructured/ingest/connector/notion/types/database_properties/multiselect.py
deleted file mode 100644
index 7534ab8..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/multiselect.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# https://developers.notion.com/reference/property-object#multi-select
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.attributes import Style
-from htmlBuilder.tags import Div, HtmlTag, Span
-
-from unstructured.ingest.connector.notion.interfaces import (
-    DBCellBase,
-    DBPropertyBase,
-    FromJSONMixin,
-)
-
-
-@dataclass
-class MultiSelectOption(FromJSONMixin):
-    color: str
-    id: str
-    name: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class MultiSelectProp(FromJSONMixin):
-    options: List[MultiSelectOption] = field(default_factory=list)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(options=[MultiSelectOption.from_dict(o) for o in data.get("options", [])])
-
-
-@dataclass
-class MultiSelect(DBPropertyBase):
-    id: str
-    name: str
-    multi_select: MultiSelectProp
-    type: str = "multi_select"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(
-            multi_select=data.pop("multi_select", {}),
-            **data,
-        )
-
-
-@dataclass
-class MultiSelectCell(DBCellBase):
-    id: str
-    multi_select: List[MultiSelectOption]
-    type: str = "multi_select"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(
-            multi_select=[MultiSelectOption.from_dict(o) for o in data.pop("multi_select", [])],
-            **data,
-        )
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.multi_select:
-            return None
-        option_spans = []
-        for option in self.multi_select:
-            option_attributes = []
-            if option.color and option.color != "default":
-                option_attributes.append(Style(f"color: {option.color}"))
-            option_spans.append(Span(option_attributes, option.name))
-        return Div([], option_spans)
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/number.py b/src/unstructured/ingest/connector/notion/types/database_properties/number.py
deleted file mode 100644
index 599981f..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/number.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# https://developers.notion.com/reference/property-object#number
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import (
-    DBCellBase,
-    DBPropertyBase,
-    FromJSONMixin,
-)
-
-
-@dataclass
-class NumberProp(FromJSONMixin):
-    format: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class Number(DBPropertyBase):
-    id: str
-    name: str
-    number: NumberProp
-    type: str = "number"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(number=NumberProp.from_dict(data.pop("number")), **data)
-
-
-@dataclass
-class NumberCell(DBCellBase):
-    id: str
-    number: Optional[int] = None
-    type: str = "number"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if number := self.number:
-            return Div([], str(number))
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/people.py b/src/unstructured/ingest/connector/notion/types/database_properties/people.py
deleted file mode 100644
index 44e66b2..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/people.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# https://developers.notion.com/reference/property-object#people
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.tags import Div, HtmlTag, Span
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-from unstructured.ingest.connector.notion.types.user import People as PeopleType
-
-
-@dataclass
-class People(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "people"
-    people: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class PeopleCell(DBCellBase):
-    id: str
-    people: List[PeopleType]
-    type: str = "people"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(people=[PeopleType.from_dict(p) for p in data.pop("people", {})], **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.people:
-            return None
-        people_spans = []
-        for person in self.people:
-            people_spans.append(Span([], person.get_html()))
-        return Div([], people_spans)
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/phone_number.py b/src/unstructured/ingest/connector/notion/types/database_properties/phone_number.py
deleted file mode 100644
index 58a5c91..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/phone_number.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# https://developers.notion.com/reference/property-object#phone-number
-from dataclasses import dataclass, field
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-
-
-@dataclass
-class PhoneNumber(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "phone_number"
-    phone_number: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class PhoneNumberCell(DBCellBase):
-    id: str
-    phone_number: Optional[str]
-    name: Optional[str] = None
-    type: str = "phone_number"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if phone_number := self.phone_number:
-            return Div([], phone_number)
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/relation.py b/src/unstructured/ingest/connector/notion/types/database_properties/relation.py
deleted file mode 100644
index 35c283a..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/relation.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# https://developers.notion.com/reference/property-object#relation
-from dataclasses import dataclass
-from typing import Optional
-from urllib.parse import unquote
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import (
-    DBCellBase,
-    DBPropertyBase,
-    FromJSONMixin,
-)
-
-
-@dataclass
-class DualProperty(FromJSONMixin):
-    synced_property_id: str
-    synced_property_name: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class RelationProp(FromJSONMixin):
-    database_id: str
-    type: str
-    dual_property: DualProperty
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        t = data.get("type")
-        if t == "dual_property":
-            dual_property = DualProperty.from_dict(data.pop(t))
-        else:
-            raise ValueError(f"{t} type not recognized")
-
-        return cls(dual_property=dual_property, **data)
-
-
-@dataclass
-class Relation(DBPropertyBase):
-    id: str
-    name: str
-    relation: RelationProp
-    type: str = "relation"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(relation=RelationProp.from_dict(data.pop("relation")), **data)
-
-
-@dataclass
-class RelationCell(DBCellBase):
-    id: str
-    has_more: bool
-    relation: list
-    type: str = "relation"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return Div([], unquote(self.id))
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/rich_text.py b/src/unstructured/ingest/connector/notion/types/database_properties/rich_text.py
deleted file mode 100644
index 2bd56c2..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/rich_text.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# https://developers.notion.com/reference/property-object#rich-text
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.tags import Div, HtmlTag, Span
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-from unstructured.ingest.connector.notion.types.rich_text import (
-    RichText as RichTextType,
-)
-
-
-@dataclass
-class RichText(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "rich_text"
-    rich_text: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class RichTextCell(DBCellBase):
-    id: str
-    rich_text: List[RichTextType]
-    name: Optional[str] = None
-    type: str = "rich_text"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(
-            rich_text=[RichTextType.from_dict(rt) for rt in data.pop("rich_text", [])],
-            **data,
-        )
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.rich_text:
-            return None
-        spans = [Span([], rt.get_html()) for rt in self.rich_text]
-        return Div([], spans)
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/rollup.py b/src/unstructured/ingest/connector/notion/types/database_properties/rollup.py
deleted file mode 100644
index 5134b40..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/rollup.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# https://developers.notion.com/reference/property-object#rollup
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag, Span
-
-from unstructured.ingest.connector.notion.interfaces import (
-    DBCellBase,
-    DBPropertyBase,
-    FromJSONMixin,
-)
-
-
-@dataclass
-class RollupProp(FromJSONMixin):
-    function: str
-    relation_property_id: str
-    relation_property_name: str
-    rollup_property_id: str
-    rollup_property_name: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class Rollup(DBPropertyBase):
-    id: str
-    name: str
-    rollup: RollupProp
-    type: str = "rollup"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(rollup=RollupProp.from_dict(data.pop("rollup")), **data)
-
-
-@dataclass
-class RollupCell(DBCellBase):
-    id: str
-    rollup: dict
-    type: str = "rollup"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        rollup = self.rollup
-        t = rollup.get("type")
-        v = rollup[t]
-        if isinstance(v, list):
-            return Div([], [Span([], str(x)) for x in v])
-        return Div([], str(v))
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/select.py b/src/unstructured/ingest/connector/notion/types/database_properties/select.py
deleted file mode 100644
index 550f2ff..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/select.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# https://developers.notion.com/reference/property-object#select
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.attributes import Style
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import (
-    DBCellBase,
-    DBPropertyBase,
-    FromJSONMixin,
-)
-
-
-@dataclass
-class SelectOption(FromJSONMixin):
-    color: str
-    id: str
-    name: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class SelectProp(FromJSONMixin):
-    options: List[SelectOption] = field(default_factory=list)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(options=[SelectOption.from_dict(o) for o in data.get("options", [])])
-
-
-@dataclass
-class Select(DBPropertyBase):
-    id: str
-    name: str
-    select: SelectProp
-    type: str = "select"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(select=SelectProp.from_dict(data.pop("select", {})), **data)
-
-
-@dataclass
-class SelectCell(DBCellBase):
-    id: str
-    select: Optional[SelectOption]
-    type: str = "select"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        select_data = data.pop("select")
-        select = None
-        if select_data:
-            select = SelectOption.from_dict(select_data)
-        return cls(select=select, **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if select := self.select:
-            select_attr = []
-            if select.color and select.color != "default":
-                select_attr.append(Style(f"color: {select.color}"))
-            return Div(select_attr, select.name)
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/status.py b/src/unstructured/ingest/connector/notion/types/database_properties/status.py
deleted file mode 100644
index 8139b98..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/status.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# https://developers.notion.com/reference/property-object#status
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.attributes import Style
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import (
-    DBCellBase,
-    DBPropertyBase,
-    FromJSONMixin,
-)
-
-
-@dataclass
-class StatusOption(FromJSONMixin):
-    color: str
-    id: str
-    name: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class StatusGroup(FromJSONMixin):
-    color: str
-    id: str
-    name: str
-    option_ids: List[str] = field(default_factory=List[str])
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class StatusProp(FromJSONMixin):
-    options: List[StatusOption] = field(default_factory=list)
-    groups: List[StatusGroup] = field(default_factory=list)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(
-            options=[StatusOption.from_dict(o) for o in data.get("options", [])],
-            groups=[StatusGroup.from_dict(g) for g in data.get("groups", [])],
-        )
-
-
-@dataclass
-class Status(DBPropertyBase):
-    id: str
-    name: str
-    status: StatusProp
-    type: str = "status"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(status=StatusProp.from_dict(data.pop("status", {})), **data)
-
-
-@dataclass
-class StatusCell(DBCellBase):
-    id: str
-    status: Optional[StatusOption]
-    type: str = "status"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(status=StatusOption.from_dict(data.pop("status", {})), **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if status := self.status:
-            select_attr = []
-            if status.color and status.color != "default":
-                select_attr.append(Style(f"color: {status.color}"))
-            return Div(select_attr, status.name)
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/title.py b/src/unstructured/ingest/connector/notion/types/database_properties/title.py
deleted file mode 100644
index aaee0e6..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/title.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# https://developers.notion.com/reference/property-object#title
-from dataclasses import dataclass, field
-from typing import List, Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-from unstructured.ingest.connector.notion.types.rich_text import RichText
-
-
-@dataclass
-class Title(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "title"
-    title: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class TitleCell(DBCellBase):
-    id: str
-    title: List[RichText]
-    type: str = "title"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(title=[RichText.from_dict(rt) for rt in data.pop("title", [])], **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if not self.title:
-            return None
-        return Div([], [rt.get_html() for rt in self.title])
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/unique_id.py b/src/unstructured/ingest/connector/notion/types/database_properties/unique_id.py
deleted file mode 100644
index 643f2c0..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/unique_id.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# https://developers.notion.com/reference/property-object#title
-from dataclasses import dataclass, field
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import (
-    DBCellBase,
-    DBPropertyBase,
-    FromJSONMixin,
-)
-
-
-@dataclass
-class UniqueID(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "unique_id"
-    unique_id: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class UniqueIDCellData(FromJSONMixin):
-    prefix: str
-    number: int
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class UniqueIDCell(DBCellBase):
-    id: str
-    unique_id: Optional[UniqueIDCellData]
-    type: str = "title"
-    name: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(unique_id=UniqueIDCellData.from_dict(data.pop("unique_id")), **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if unique_id := self.unique_id:
-            return Div([], f"{unique_id.prefix}-{unique_id.number}")
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/url.py b/src/unstructured/ingest/connector/notion/types/database_properties/url.py
deleted file mode 100644
index 8233ae9..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/url.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# https://developers.notion.com/reference/property-object#url
-from dataclasses import dataclass, field
-from typing import Optional
-
-from htmlBuilder.attributes import Href
-from htmlBuilder.tags import A, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import DBCellBase, DBPropertyBase
-
-
-@dataclass
-class URL(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "url"
-    url: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class URLCell(DBCellBase):
-    id: str
-    url: Optional[str] = None
-    name: Optional[str] = None
-    type: str = "url"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if url := self.url:
-            return A([Href(url)], url)
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/database_properties/verification.py b/src/unstructured/ingest/connector/notion/types/database_properties/verification.py
deleted file mode 100644
index 03ade8e..0000000
--- a/src/unstructured/ingest/connector/notion/types/database_properties/verification.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# https://developers.notion.com/reference/property-object#url
-from dataclasses import dataclass, field
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag, Span
-
-from unstructured.ingest.connector.notion.interfaces import (
-    DBCellBase,
-    DBPropertyBase,
-    FromJSONMixin,
-    GetHTMLMixin,
-)
-from unstructured.ingest.connector.notion.types.date import Date
-from unstructured.ingest.connector.notion.types.user import People
-
-
-@dataclass
-class Verification(DBPropertyBase):
-    id: str
-    name: str
-    type: str = "verification"
-    verification: dict = field(default_factory=dict)
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class VerificationData(FromJSONMixin, GetHTMLMixin):
-    state: Optional[str]
-    verified_by: Optional[People]
-    date: Optional[Date]
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        verified_by = data.pop("verified_by", None)
-        date = data.pop("date", None)
-        return cls(
-            verified_by=People.from_dict(data=verified_by) if verified_by else None,
-            date=Date.from_dict(data=date) if date else None,
-            **data,
-        )
-
-    def get_html(self) -> Optional[HtmlTag]:
-        elements = []
-        if state := self.state:
-            elements.append(Span([], state))
-        if (verified_by := self.verified_by) and (verified_by_html := verified_by.get_html()):
-            elements.append(verified_by_html)
-        if (date := self.date) and (date_html := date.get_html()):
-            elements.append(date_html)
-        if elements:
-            return Div([], elements)
-        return None
-
-
-@dataclass
-class VerificationCell(DBCellBase):
-    id: str
-    verification: Optional[VerificationData]
-    name: Optional[str] = None
-    type: str = "verification"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(verification=VerificationData.from_dict(data.pop("verification")), **data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        elements = []
-        if name := self.name:
-            elements.append(Span([], name))
-        if (verification := self.verification) and (verification_html := verification.get_html()):
-            elements.append(verification_html)
-
-        if elements:
-            return Div([], elements)
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/date.py b/src/unstructured/ingest/connector/notion/types/date.py
deleted file mode 100644
index 7c6dcf1..0000000
--- a/src/unstructured/ingest/connector/notion/types/date.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# https://developers.notion.com/reference/property-value-object#date-property-values
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.tags import Div, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin
-
-
-@dataclass
-class Date(FromJSONMixin, GetHTMLMixin):
-    start: str
-    end: Optional[str] = None
-    time_zone: Optional[str] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        text = f"{self.start}"
-        if end := self.end:
-            text += f" - {end}"
-        if self.time_zone:
-            text += f" {self.time_zone}"
-        return Div([], text)
diff --git a/src/unstructured/ingest/connector/notion/types/file.py b/src/unstructured/ingest/connector/notion/types/file.py
deleted file mode 100644
index 6ade2d1..0000000
--- a/src/unstructured/ingest/connector/notion/types/file.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# https://developers.notion.com/reference/file-object
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.attributes import Href
-from htmlBuilder.tags import A, HtmlTag
-
-from unstructured.ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin
-
-
-@dataclass
-class External(FromJSONMixin):
-    url: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class File(FromJSONMixin):
-    url: str
-    expiry_time: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class FileObject(FromJSONMixin, GetHTMLMixin):
-    type: str
-    external: Optional[External] = None
-    file: Optional[File] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        t = data["type"]
-        file_object = cls(type=t)
-        if t == "external":
-            file_object.external = External.from_dict(data["external"])
-        elif t == "file":
-            file_object.file = File.from_dict(data["file"])
-        return file_object
-
-    def get_html(self) -> Optional[HtmlTag]:
-        if self.file:
-            return A([Href(self.file.url)], self.file.url)
-        if self.external:
-            return A([Href(self.external.url)], self.external.url)
-        return None
diff --git a/src/unstructured/ingest/connector/notion/types/page.py b/src/unstructured/ingest/connector/notion/types/page.py
deleted file mode 100644
index 42bbb29..0000000
--- a/src/unstructured/ingest/connector/notion/types/page.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# https://developers.notion.com/reference/page
-from dataclasses import dataclass
-from typing import Optional
-
-from unstructured.ingest.connector.notion.interfaces import FromJSONMixin
-from unstructured.ingest.connector.notion.types.file import FileObject
-from unstructured.ingest.connector.notion.types.parent import Parent
-from unstructured.ingest.connector.notion.types.user import PartialUser
-
-
-@dataclass
-class Page(FromJSONMixin):
-    id: str
-    created_time: str
-    created_by: PartialUser
-    last_edited_time: str
-    last_edited_by: PartialUser
-    archived: bool
-    properties: dict
-    parent: Parent
-    url: str
-    public_url: str
-    object: str = "page"
-    icon: Optional[FileObject] = None
-    cover: Optional[FileObject] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        created_by = data.pop("created_by")
-        last_edited_by = data.pop("last_edited_by")
-        icon = data.pop("icon")
-        cover = data.pop("cover")
-        parent = data.pop("parent")
-        page = cls(
-            created_by=PartialUser.from_dict(created_by),
-            last_edited_by=PartialUser.from_dict(last_edited_by),
-            icon=FileObject.from_dict(icon) if icon else None,
-            cover=FileObject.from_dict(cover) if cover else None,
-            parent=Parent.from_dict(parent),
-            **data,
-        )
-
-        return page
diff --git a/src/unstructured/ingest/connector/notion/types/parent.py b/src/unstructured/ingest/connector/notion/types/parent.py
deleted file mode 100644
index f78c166..0000000
--- a/src/unstructured/ingest/connector/notion/types/parent.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# https://developers.notion.com/reference/parent-object
-from dataclasses import dataclass
-
-from unstructured.ingest.connector.notion.interfaces import FromJSONMixin
-
-
-# https://developers.notion.com/reference/parent-object#database-parent
-@dataclass
-class DatabaseParent(FromJSONMixin):
-    database_id: str
-    type: str = "database_id"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(database_id=data["database_id"])
-
-
-# https://developers.notion.com/reference/parent-object#page-parent
-@dataclass
-class PageParent(FromJSONMixin):
-    page_id: str
-    type: str = "page_id"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(page_id=data["page_id"])
-
-
-# https://developers.notion.com/reference/parent-object#workspace-parent
-@dataclass
-class WorkspaceParent(FromJSONMixin):
-    type: str = "workspace"
-    workspace: bool = True
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls()
-
-
-# https://developers.notion.com/reference/parent-object#block-parent
-@dataclass
-class BlockParent(FromJSONMixin):
-    block_id: str
-    type: str = "block_id"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(block_id=data["block_id"])
-
-
-@dataclass
-class Parent(FromJSONMixin):
-    block_id: str
-    type: str = "block_id"
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        t = data["type"]
-        if t == "database_id":
-            return DatabaseParent.from_dict(data)
-        elif t == "page_id":
-            return PageParent.from_dict(data)
-        elif t == "workspace":
-            return WorkspaceParent.from_dict(data)
-        elif t == "block_id":
-            return BlockParent.from_dict(data)
diff --git a/src/unstructured/ingest/connector/notion/types/rich_text.py b/src/unstructured/ingest/connector/notion/types/rich_text.py
deleted file mode 100644
index ae71a0a..0000000
--- a/src/unstructured/ingest/connector/notion/types/rich_text.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# https://developers.notion.com/reference/rich-text
-from dataclasses import dataclass
-from typing import Optional
-
-from htmlBuilder.attributes import Href, Style
-from htmlBuilder.tags import A, B, Code, Div, HtmlTag, I, S, Span, U
-from htmlBuilder.tags import Text as HtmlText
-
-from unstructured.ingest.connector.notion.interfaces import (
-    FromJSONMixin,
-    GetHTMLMixin,
-)
-from unstructured.ingest.connector.notion.types.date import Date
-from unstructured.ingest.connector.notion.types.user import People
-
-
-@dataclass
-class Annotations(FromJSONMixin):
-    bold: bool
-    code: bool
-    italic: bool
-    strikethrough: bool
-    underline: bool
-    color: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class Equation(FromJSONMixin, GetHTMLMixin):
-    expression: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return Code([], self.expression) if self.expression else None
-
-
-@dataclass
-class MentionDatabase(FromJSONMixin, GetHTMLMixin):
-    id: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return Div([], self.id) if self.id else None
-
-
-@dataclass
-class MentionLinkPreview(FromJSONMixin, GetHTMLMixin):
-    url: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return A([Href(self.url)], self.url) if self.url else None
-
-
-@dataclass
-class MentionPage(FromJSONMixin, GetHTMLMixin):
-    id: str
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-    def get_html(self) -> Optional[HtmlTag]:
-        return Div([], self.id) if self.id else None
-
-
-@dataclass
-class MentionTemplate(FromJSONMixin):
-    template_mention_date: Optional[str]
-    template_mention_user: Optional[str]
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        return cls(**data)
-
-
-@dataclass
-class Mention(FromJSONMixin, GetHTMLMixin):
-    type: str
-    database: Optional[MentionDatabase] = None
-    date: Optional[Date] = None
-    link_preview: Optional[MentionLinkPreview] = None
-    page: Optional[MentionPage] = None
-    template_mention: Optional[MentionTemplate] = None
-    user: Optional[People] = None
-
-    @classmethod
-    def from_dict(cls, data: dict):
-        t = data["type"]
-        mention = cls(type=t)
-        if t == "date":
-            mention.date = Date.from_dict(data["date"])
-        elif t == "database":
-            mention.database = MentionDatabase.from_dict(data["database"])
-        elif t == "link_preview":
-            mention.link_preview = MentionLinkPreview.from_dict(data["link_preview"])
-
elif t == "page": - mention.page = MentionPage.from_dict(data["page"]) - elif t == "template_mention": - mention.template_mention = MentionTemplate.from_dict(data["template_mention"]) - elif t == "user": - mention.user = People.from_dict(data["user"]) - - return mention - - def get_html(self) -> Optional[HtmlTag]: - t = self.type - if t == "date": - return self.date.get_html() if self.date else None - elif t == "database": - return self.database.get_html() if self.database else None - elif t == "link_preview": - return self.link_preview.get_html() if self.link_preview else None - elif t == "page": - return self.page.get_html() if self.page else None - elif t == "user": - return self.user.get_html() if self.user else None - return None - - -@dataclass -class Text(FromJSONMixin): - content: str - link: Optional[dict] - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - -@dataclass -class RichText(FromJSONMixin, GetHTMLMixin): - type: str - plain_text: str - annotations: Optional[Annotations] = None - href: Optional[str] = None - text: Optional[Text] = None - mention: Optional[Mention] = None - equation: Optional[Equation] = None - - def get_html(self) -> Optional[HtmlTag]: - text = HtmlText(self.plain_text) - if self.href: - text = A([Href(self.href)], text) - if self.annotations: - annotations = self.annotations - if annotations.bold: - text = B([], text) - if annotations.code: - text = Code([], text) - if annotations.italic: - text = I([], text) - if annotations.strikethrough: - text = S([], text) - if annotations.underline: - text = U([], text) - if annotations.color and annotations.color != "default": - if isinstance(text, HtmlText): - text = Span([], text) - text.attributes.append(Style(f"color:{annotations.color}")) - return text - - @classmethod - def from_dict(cls, data: dict): - t = data["type"] - rich_text = cls( - annotations=Annotations.from_dict(data.pop("annotations")), - **data, - ) - if t == "text": - rich_text.text = Text.from_dict(data["text"]) - elif t == "mention": - rich_text.mention = Mention.from_dict(data["mention"]) - elif t == "equation": - rich_text.equation = Equation.from_dict(data["equation"]) - - return rich_text diff --git a/src/unstructured/ingest/connector/notion/types/user.py b/src/unstructured/ingest/connector/notion/types/user.py deleted file mode 100644 index 4574c0b..0000000 --- a/src/unstructured/ingest/connector/notion/types/user.py +++ /dev/null @@ -1,76 +0,0 @@ -# https://developers.notion.com/reference/user -from dataclasses import dataclass, field -from typing import Optional - -from htmlBuilder.attributes import Href -from htmlBuilder.tags import A, Div, HtmlTag - -from unstructured.ingest.connector.notion.interfaces import FromJSONMixin, GetHTMLMixin - - -@dataclass -class PartialUser(FromJSONMixin): - id: str - object: str = "user" - - @classmethod - def from_dict(cls, data: dict): - return cls(id=data["id"]) - - -@dataclass -class User(FromJSONMixin, GetHTMLMixin): - object: dict - id: str - type: Optional[str] = None - name: Optional[str] = None - avatar_url: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_text(self) -> Optional[str]: - text = self.name - if self.avatar_url: - text = f"[{text}]({self.avatar_url})" - return text - - def get_html(self) -> Optional[HtmlTag]: - if self.avatar_url: - return A([Href(self.avatar_url)], self.name) - else: - return Div([], self.name) - - -@dataclass -class People(User): - person: dict = field(default_factory=dict) - - 
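The RichText.get_html() method removed above composes nested inline markup by starting from a plain HtmlText node and wrapping it once per active annotation flag; a non-default color is then attached as an inline style on whatever tag ended up outermost. A minimal sketch of that round trip, assuming htmlBuilder tags expose a render() method and using a hand-built payload (the dict values and the rendered output in the comment are illustrative, not taken from this diff):

    from unstructured.ingest.connector.notion.types.rich_text import RichText

    payload = {
        "type": "text",
        "plain_text": "hello",
        "href": None,
        "annotations": {
            "bold": True,
            "code": False,
            "italic": False,
            "strikethrough": False,
            "underline": False,
            "color": "red",
        },
        "text": {"content": "hello", "link": None},
    }

    # bold=True wraps the text node in <b>, and color="red" appends an inline
    # style to the outermost tag, so this should render along the lines of
    # <b style="color:red">hello</b>
    print(RichText.from_dict(payload).get_html().render())

Because the bold wrap is applied before the code wrap, a run with both flags set stacks as <code><b>…</b></code> rather than the reverse.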
-@dataclass -class Bots(FromJSONMixin, GetHTMLMixin): - object: dict - id: str - bot: dict - owner: dict - type: str - workspace_name: str - name: Optional[str] = None - avatar_url: Optional[str] = None - - @classmethod - def from_dict(cls, data: dict): - return cls(**data) - - def get_text(self) -> Optional[str]: - text = self.name - if self.avatar_url: - text = f"[{text}]({self.avatar_url})" - return text - - def get_html(self) -> Optional[HtmlTag]: - if self.avatar_url: - return A([Href(self.avatar_url)], self.name) - else: - return Div([], self.name) diff --git a/src/unstructured/ingest/connector/onedrive.py b/src/unstructured/ingest/connector/onedrive.py deleted file mode 100644 index 5f40ee0..0000000 --- a/src/unstructured/ingest/connector/onedrive.py +++ /dev/null @@ -1,158 +0,0 @@ -from dataclasses import dataclass, field -from pathlib import Path -from typing import TYPE_CHECKING, List, Optional - -from unstructured.file_utils.filetype import EXT_TO_FILETYPE -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from office365.onedrive.driveitems.driveItem import DriveItem - -MAX_MB_SIZE = 512_000_000 - - -@dataclass -class SimpleOneDriveConfig(BaseConnectorConfig): - client_id: str - client_credential: str = field(repr=False) - user_pname: str - tenant: str = field(repr=False) - authority_url: Optional[str] = field(repr=False) - path: Optional[str] = field(default="") - recursive: bool = False - - def __post_init__(self): - if not (self.client_id and self.client_credential and self.user_pname): - raise ValueError( - "Please provide all the following mandatory values:" - "\n-ms-client_id\n-ms-client_cred\n-ms-user-pname", - ) - self.token_factory = self._acquire_token - - @requires_dependencies(["msal"]) - def _acquire_token(self): - from msal import ConfidentialClientApplication - - try: - app = ConfidentialClientApplication( - authority=f"{self.authority_url}/{self.tenant}", - client_id=self.client_id, - client_credential=self.client_credential, - ) - token = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) - except ValueError as exc: - logger.error("Couldn't set up credentials for OneDrive") - raise exc - return token - - -@dataclass -class OneDriveIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - config: SimpleOneDriveConfig - file: "DriveItem" - - def __post_init__(self): - self.ext = "".join(Path(self.file.name).suffixes) - if not self.ext: - raise ValueError("Unsupported file without extension.") - - if self.ext not in EXT_TO_FILETYPE: - raise ValueError( - f"Extension not supported. 
" - f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.", - ) - self._set_download_paths() - - def _set_download_paths(self) -> None: - """Parses the folder structure from the source and creates the download and output paths""" - download_path = Path(f"{self.standard_config.download_dir}") - output_path = Path(f"{self.standard_config.output_dir}") - - if parent_ref := self.file.get_property("parentReference", "").path.split(":")[-1]: - odir = parent_ref[1:] if parent_ref[0] == "/" else parent_ref - download_path = download_path if odir == "" else (download_path / odir).resolve() - output_path = output_path if odir == "" else (output_path / odir).resolve() - - self.download_dir = download_path - self.download_filepath = (download_path / self.file.name).resolve() - oname = f"{self.file.name[:-len(self.ext)]}.json" - self.output_dir = output_path - self.output_filepath = (output_path / oname).resolve() - - @property - def filename(self): - return Path(self.download_filepath).resolve() - - @property - def _output_filename(self): - return Path(self.output_filepath).resolve() - - @BaseIngestDoc.skip_if_file_exists - @requires_dependencies(["office365"]) - def get_file(self): - try: - fsize = self.file.get_property("size", 0) - self.output_dir.mkdir(parents=True, exist_ok=True) - - if not self.download_dir.is_dir(): - logger.debug(f"Creating directory: {self.download_dir}") - self.download_dir.mkdir(parents=True, exist_ok=True) - - if fsize > MAX_MB_SIZE: - logger.info(f"Downloading file with size: {fsize} bytes in chunks") - with self.filename.open(mode="wb") as f: - self.file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query() - else: - with self.filename.open(mode="wb") as f: - self.file.download(f).execute_query() - except Exception as e: - logger.error(f"Error while downloading and saving file: {self.filename}.") - logger.error(e) - return - logger.info(f"File downloaded: {self.filename}") - return - - -class OneDriveConnector(ConnectorCleanupMixin, BaseConnector): - config: SimpleOneDriveConfig - - def __init__(self, standard_config: StandardConnectorConfig, config: SimpleOneDriveConfig): - super().__init__(standard_config, config) - self._set_client() - - @requires_dependencies(["office365"]) - def _set_client(self): - from office365.graph_client import GraphClient - - self.client = GraphClient(self.config.token_factory) - - def _list_objects(self, folder, recursive) -> List["DriveItem"]: - drive_items = folder.children.get().execute_query() - files = [d for d in drive_items if d.is_file] - if not recursive: - return files - folders = [d for d in drive_items if d.is_folder] - for f in folders: - files += self._list_objects(f, recursive) - return files - - def initialize(self): - pass - - def get_ingest_docs(self): - root = self.client.users[self.config.user_pname].drive.get().execute_query().root - if fpath := self.config.path: - root = root.get_by_path(fpath).get().execute_query() - if root is None or not root.is_folder: - raise ValueError(f"Unable to find directory, given: {fpath}") - files = self._list_objects(root, self.config.recursive) - return [OneDriveIngestDoc(self.standard_config, self.config, f) for f in files] diff --git a/src/unstructured/ingest/connector/outlook.py b/src/unstructured/ingest/connector/outlook.py deleted file mode 100644 index 1943f2a..0000000 --- a/src/unstructured/ingest/connector/outlook.py +++ /dev/null @@ -1,230 +0,0 @@ -import hashlib -import os -from collections import defaultdict -from dataclasses import 
dataclass, field -from itertools import chain -from pathlib import Path -from typing import List, Optional - -from office365.onedrive.driveitems.driveItem import DriveItem - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -MAX_NUM_EMAILS = 1000000 # Maximum number of emails per folder - - -class MissingFolderError(Exception): - """There are no root folders with those names.""" - - -@dataclass -class SimpleOutlookConfig(BaseConnectorConfig): - """This class is getting the token.""" - - client_id: Optional[str] - client_credential: Optional[str] = field(repr=False) - user_email: str - tenant: Optional[str] = field(repr=False) - authority_url: Optional[str] = field(repr=False) - ms_outlook_folders: List[str] - recursive: bool = False - - def __post_init__(self): - if not (self.client_id and self.client_credential and self.user_email): - raise ValueError( - "Please provide one of the following mandatory values:" - "\n--client_id\n--client_cred\n--user-email", - ) - self.token_factory = self._acquire_token - - @requires_dependencies(["msal"]) - def _acquire_token(self): - from msal import ConfidentialClientApplication - - try: - app = ConfidentialClientApplication( - authority=f"{self.authority_url}/{self.tenant}", - client_id=self.client_id, - client_credential=self.client_credential, - ) - token = app.acquire_token_for_client( - scopes=["https://graph.microsoft.com/.default"], - ) - except ValueError as exc: - logger.error("Couldn't set up credentials for Outlook") - raise exc - return token - - @staticmethod - def parse_folders(folder_str: str) -> List[str]: - """Parses a comma separated string of Outlook folders into a list.""" - return [x.strip() for x in folder_str.split(",")] - - -@dataclass -class OutlookIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - config: SimpleOutlookConfig - file: DriveItem - - def __post_init__(self): - self._set_download_paths() - - def hash_mail_name(self, id): - """Outlook email ids are 152 char long. 
Hash to shorten to 16.""" - return hashlib.sha256(id.encode("utf-8")).hexdigest()[:16] - - def _set_download_paths(self) -> None: - """Creates paths for downloading and parsing.""" - download_path = Path(f"{self.standard_config.download_dir}") - output_path = Path(f"{self.standard_config.output_dir}") - - self.download_dir = download_path - self.download_filepath = ( - download_path / f"{self.hash_mail_name(self.file.id)}.eml" - ).resolve() - oname = f"{self.hash_mail_name(self.file.id)}.eml.json" - self.output_dir = output_path - self.output_filepath = (output_path / oname).resolve() - - @property - def filename(self): - return Path(self.download_filepath).resolve() - - @property - def _output_filename(self): - return Path(self.output_filepath).resolve() - - @BaseIngestDoc.skip_if_file_exists - @requires_dependencies(["office365"]) - def get_file(self): - """Relies on Office365 python sdk message object to do the download.""" - try: - if not self.download_dir.is_dir(): - logger.debug(f"Creating directory: {self.download_dir}") - self.download_dir.mkdir(parents=True, exist_ok=True) - - with open( - os.path.join( - self.download_dir, - self.hash_mail_name(self.file.id) + ".eml", - ), - "wb", - ) as local_file: - self.file.download( - local_file, - ).execute_query() # download MIME representation of a message - - except Exception as e: - logger.error( - f"Error while downloading and saving file: {self.file.subject}.", - ) - logger.error(e) - return - logger.info(f"File downloaded: {self.file.subject}") - return - - -class OutlookConnector(ConnectorCleanupMixin, BaseConnector): - config: SimpleOutlookConfig - - def __init__( - self, - standard_config: StandardConnectorConfig, - config: SimpleOutlookConfig, - ): - super().__init__(standard_config, config) - self._set_client() - self.get_folder_ids() - - @requires_dependencies(["office365"]) - def _set_client(self): - from office365.graph_client import GraphClient - - self.client = GraphClient(self.config.token_factory) - - def initialize(self): - pass - - def recurse_folders(self, folder_id, main_folder_dict): - """We only get a count of subfolders for any folder. - Have to make additional calls to get subfolder ids.""" - subfolders = ( - self.client.users[self.config.user_email] - .mail_folders[folder_id] - .child_folders.get() - .execute_query() - ) - for subfolder in subfolders: - for k, v in main_folder_dict.items(): - if subfolder.get_property("parentFolderId") in v: - v.append(subfolder.id) - if subfolder.get_property("childFolderCount") > 0: - self.recurse_folders(subfolder.id, main_folder_dict) - - def get_folder_ids(self): - """Sets the mail folder ids and subfolder ids for requested root mail folders.""" - self.root_folders = defaultdict(list) - root_folders_with_subfolders = [] - get_root_folders = ( - self.client.users[self.config.user_email].mail_folders.get().execute_query() - ) - - for folder in get_root_folders: - self.root_folders[folder.display_name].append(folder.id) - if folder.get_property("childFolderCount") > 0: - root_folders_with_subfolders.append(folder.id) - - for folder in root_folders_with_subfolders: - self.recurse_folders(folder, self.root_folders) - - # Narrow down all mail folder ids (plus all subfolders) to the ones that were requested. 
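The recurse_folders()/get_folder_ids() pair above exists because the Graph API only reports a childFolderCount on each mail folder, so every level of nesting costs one extra child_folders request. A rough iterative equivalent of that traversal, assuming client and user_email are set up the same way as in OutlookConnector._set_client() (the function name and structure here are illustrative, not part of the original file):

    def collect_folder_ids(client, user_email: str, root_folder_id: str) -> list:
        """Collect a folder id plus all of its descendant folder ids."""
        folder_ids = [root_folder_id]
        stack = [root_folder_id]
        while stack:
            current = stack.pop()
            subfolders = (
                client.users[user_email]
                .mail_folders[current]
                .child_folders.get()
                .execute_query()
            )
            for subfolder in subfolders:
                folder_ids.append(subfolder.id)
                # a non-zero childFolderCount means another round trip is needed
                if subfolder.get_property("childFolderCount") > 0:
                    stack.append(subfolder.id)
        return folder_ids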
- self.selected_folder_ids = list( - chain.from_iterable( - [ - v - for k, v in self.root_folders.items() - if k.lower() in [x.lower() for x in self.config.ms_outlook_folders] - ], - ), - ) - if not self.selected_folder_ids: - raise MissingFolderError( - f"There are no root folders with the names: {self.config.ms_outlook_folders}", - ) - - def get_ingest_docs(self): - """Returns a list of all the message objects that are in the requested root folder(s).""" - filtered_messages = [] - - # Get all the relevant messages in the selected folders/subfolders. - for folder_id in self.selected_folder_ids: - messages = ( - self.client.users[self.config.user_email] - .mail_folders[folder_id] - .messages.get() - .top(MAX_NUM_EMAILS) # Prevents the return from paging - .execute_query() - ) - # Skip empty list if there are no messages in folder. - if messages: - filtered_messages.append(messages) - - # Filtered messages have an un-downloadable resource path. - # So we get each message object individually. - individual_messages = [] - for m in list(chain.from_iterable(filtered_messages)): - messages = ( - self.client.users[self.config.user_email].messages[m.id].get().execute_query() - ) - individual_messages.append(messages) - - return [OutlookIngestDoc(self.standard_config, self.config, f) for f in individual_messages] diff --git a/src/unstructured/ingest/connector/reddit.py b/src/unstructured/ingest/connector/reddit.py deleted file mode 100644 index 2561aeb..0000000 --- a/src/unstructured/ingest/connector/reddit.py +++ /dev/null @@ -1,85 +0,0 @@ -import os -from dataclasses import dataclass, field -from pathlib import Path -from typing import TYPE_CHECKING, Optional - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from praw.models import Submission - - -@dataclass -class SimpleRedditConfig(BaseConnectorConfig): - subreddit_name: str - client_id: Optional[str] - client_secret: Optional[str] - user_agent: str - search_query: Optional[str] - num_posts: int - - def __post_init__(self): - if self.num_posts <= 0: - raise ValueError("The number of Reddit posts to fetch must be positive.") - - -@dataclass -class RedditIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - config: SimpleRedditConfig = field(repr=False) - post: "Submission" - - @property - def filename(self) -> Path: - return (Path(self.standard_config.download_dir) / f"{self.post.id}.md").resolve() - - @property - def _output_filename(self): - return Path(self.standard_config.output_dir) / f"{self.post.id}.json" - - def _create_full_tmp_dir_path(self): - self.filename.parent.mkdir(parents=True, exist_ok=True) - - @BaseIngestDoc.skip_if_file_exists - def get_file(self): - """Fetches the "remote" doc and stores it locally on the filesystem.""" - self._create_full_tmp_dir_path() - logger.debug(f"Fetching {self} - PID: {os.getpid()}") - # Write the title plus the body, if any - text_to_write = f"# {self.post.title}\n{self.post.selftext}" - with open(self.filename, "w", encoding="utf8") as f: - f.write(text_to_write) - - -@requires_dependencies(["praw"], extras="reddit") -class RedditConnector(ConnectorCleanupMixin, BaseConnector): - config: SimpleRedditConfig - - def __init__(self, standard_config: StandardConnectorConfig, config: SimpleRedditConfig): - from praw import Reddit - - 
super().__init__(standard_config, config) - self.reddit = Reddit( - client_id=config.client_id, - client_secret=config.client_secret, - user_agent=config.user_agent, - ) - - def initialize(self): - pass - - def get_ingest_docs(self): - subreddit = self.reddit.subreddit(self.config.subreddit_name) - if self.config.search_query: - posts = subreddit.search(self.config.search_query, limit=self.config.num_posts) - else: - posts = subreddit.hot(limit=self.config.num_posts) - return [RedditIngestDoc(self.standard_config, self.config, post) for post in posts] diff --git a/src/unstructured/ingest/connector/s3.py b/src/unstructured/ingest/connector/s3.py deleted file mode 100644 index 224de3c..0000000 --- a/src/unstructured/ingest/connector/s3.py +++ /dev/null @@ -1,33 +0,0 @@ -from dataclasses import dataclass -from typing import Type - -from unstructured.ingest.connector.fsspec import ( - FsspecConnector, - FsspecIngestDoc, - SimpleFsspecConfig, -) -from unstructured.ingest.interfaces import StandardConnectorConfig -from unstructured.utils import requires_dependencies - - -@dataclass -class SimpleS3Config(SimpleFsspecConfig): - pass - - -class S3IngestDoc(FsspecIngestDoc): - @requires_dependencies(["s3fs", "fsspec"], extras="s3") - def get_file(self): - super().get_file() - - -@requires_dependencies(["s3fs", "fsspec"], extras="s3") -class S3Connector(FsspecConnector): - ingest_doc_cls: Type[S3IngestDoc] = S3IngestDoc - - def __init__( - self, - config: SimpleS3Config, - standard_config: StandardConnectorConfig, - ) -> None: - super().__init__(standard_config, config) diff --git a/src/unstructured/ingest/connector/sharepoint.py b/src/unstructured/ingest/connector/sharepoint.py deleted file mode 100644 index 840ccf8..0000000 --- a/src/unstructured/ingest/connector/sharepoint.py +++ /dev/null @@ -1,328 +0,0 @@ -from dataclasses import dataclass, field -from html import unescape -from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, List, Optional -from urllib.parse import urlparse - -from unstructured.file_utils.filetype import EXT_TO_FILETYPE -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import requires_dependencies - -if TYPE_CHECKING: - from office365.sharepoint.files.file import File - -MAX_MB_SIZE = 512_000_000 - - -@dataclass -class SimpleSharepointConfig(BaseConnectorConfig): - client_id: str - client_credential: str = field(repr=False) - site_url: str - path: str - process_pages: bool = False - recursive: bool = False - - def __post_init__(self): - if not (self.client_id and self.client_credential and self.site_url): - raise ValueError( - "Please provide one of the following mandatory values:" - "\n--client-id\n--client-cred\n--site", - ) - - -@dataclass -class SharepointIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - config: SimpleSharepointConfig - file: "File" - meta: dict - - def __post_init__(self): - self.ext = "".join(Path(self.file.name).suffixes) if not self.meta else ".html" - self.ext = self.ext if self.ext != ".aspx" else ".html" - - if not self.ext: - raise ValueError("Unsupported file without extension.") - - if self.ext not in EXT_TO_FILETYPE: - raise ValueError( - f"Extension {self.ext} not supported. 
" - f"Value MUST be one of {', '.join([k for k in EXT_TO_FILETYPE if k is not None])}.", - ) - self._set_download_paths() - - def _set_download_paths(self) -> None: - """Parses the folder structure from the source and creates the download and output paths""" - download_path = Path(f"{self.standard_config.download_dir}") - output_path = Path(f"{self.standard_config.output_dir}") - if self.meta: - page_url = self.meta["page"].get_property("Url", "") - parent = ( - Path(page_url).with_suffix(self.ext) - if (self.meta["site_path"] is None) - else Path(self.meta["site_path"] + "/" + page_url).with_suffix(self.ext) - ) - else: - parent = Path(self.file.serverRelativeUrl[1:]) - self.download_dir = (download_path / parent.parent).resolve() - self.download_filepath = (download_path / parent).resolve() - oname = f"{str(parent)[:-len(self.ext)]}.json" - self.output_dir = (output_path / parent.parent).resolve() - self.output_filepath = (output_path / oname).resolve() - - @property - def filename(self): - return Path(self.download_filepath).resolve() - - @property - def _output_filename(self): - return Path(self.output_filepath).resolve() - - @property - def date_created(self) -> Optional[str]: - if self.meta: - return self.meta["page"].properties.get("FirstPublished", None) - return self.file.time_created - - @property - def date_modified(self) -> Optional[str]: - if self.meta: - return self.meta["page"].properties.get("Modified", None) - return self.file.time_last_modified - - @property - def exists(self) -> Optional[bool]: - if self.meta: - return self.meta["page"].properties.get("FileName", None) and self.meta[ - "page" - ].properties.get("UniqueId", None) - return self.file.exists - - @property - def record_locator(self) -> Optional[Dict[str, Any]]: - if self.meta: - record_source = self.meta["page"] - property_name = "AbsoluteUrl" - resource_url_name = "absolute_url" - else: - record_source = self.file - property_name = "ServerRelativeUrl" - resource_url_name = "server_relative_url" - - return { - "site": self.config.site_url, - "unique_id": record_source.get_property("UniqueId", ""), - resource_url_name: record_source.get_property(property_name, ""), - } - - @property - def version(self) -> Optional[str]: - if self.meta: - return self.meta["page"].properties.get("Version", "") - - if (n_versions := len(self.file.versions)) > 0: - return self.file.versions[n_versions - 1].properties.get("id", None) - return None - - def _get_page(self): - """Retrieves HTML content of the Sharepoint site through the CanvasContent1 and - LayoutWebpartsContent1""" - - try: - content_labels = ["CanvasContent1", "LayoutWebpartsContent1"] - content = self.file.listItemAllFields.select(content_labels).get().execute_query() - pld = (content.properties.get("LayoutWebpartsContent1", "") or "") + ( - content.properties.get("CanvasContent1", "") or "" - ) - if pld != "": - pld = unescape(pld) - else: - logger.info( - f"Page {self.meta['page'].get_property('Url', '')} has no retrievable content. \ - Dumping empty doc.", - ) - pld = "
" - - self.output_dir.mkdir(parents=True, exist_ok=True) - if not self.download_dir.is_dir(): - logger.debug(f"Creating directory: {self.download_dir}") - self.download_dir.mkdir(parents=True, exist_ok=True) - with self.filename.open(mode="w") as f: - f.write(pld) - except Exception as e: - logger.error(f"Error while downloading and saving file: {self.filename}.") - logger.error(e) - return - logger.info(f"File downloaded: {self.filename}") - - def _get_file(self): - try: - fsize = self.file.length - self.output_dir.mkdir(parents=True, exist_ok=True) - - if not self.download_dir.is_dir(): - logger.debug(f"Creating directory: {self.download_dir}") - self.download_dir.mkdir(parents=True, exist_ok=True) - - if fsize > MAX_MB_SIZE: - logger.info(f"Downloading file with size: {fsize} bytes in chunks") - with self.filename.open(mode="wb") as f: - self.file.download_session(f, chunk_size=1024 * 1024 * 100).execute_query() - else: - with self.filename.open(mode="wb") as f: - self.file.download(f).execute_query() - except Exception as e: - logger.error(f"Error while downloading and saving file: {self.filename}.") - logger.error(e) - return - logger.info(f"File downloaded: {self.filename}") - - @BaseIngestDoc.skip_if_file_exists - @requires_dependencies(["office365"]) - def get_file(self): - if not self.meta: - self._get_file() - else: - self._get_page() - return - - -class SharepointConnector(ConnectorCleanupMixin, BaseConnector): - config: SimpleSharepointConfig - tenant: None - - def __init__(self, standard_config: StandardConnectorConfig, config: SimpleSharepointConfig): - super().__init__(standard_config, config) - self._setup_client() - - @requires_dependencies(["office365"]) - def _setup_client(self): - from office365.runtime.auth.client_credential import ClientCredential - from office365.sharepoint.client_context import ClientContext - - parsed_url = urlparse(self.config.site_url) - site_hostname = (parsed_url.hostname or "").split(".") - tenant_url = site_hostname[0].split("-") - self.process_all = False - self.base_site_url = "" - if tenant_url[-1] == "admin" and (parsed_url.path is None or parsed_url.path == "/"): - self.process_all = True - self.base_site_url = parsed_url._replace( - netloc=parsed_url.netloc.replace(site_hostname[0], tenant_url[0]), - ).geturl() - elif tenant_url[-1] == "admin": - raise ValueError( - "A site url in the form of https://[tenant]-admin.sharepoint.com \ - is required to process all sites within a tenant. 
", - ) - - self.client = ClientContext(self.config.site_url).with_credentials( - ClientCredential(self.config.client_id, self.config.client_credential), - ) - - @requires_dependencies(["office365"]) - def _list_files(self, folder, recursive) -> List["File"]: - from office365.runtime.client_request_exception import ClientRequestException - - try: - objects = folder.expand(["Files", "Folders"]).get().execute_query() - files = list(objects.files) - if not recursive: - return files - for f in objects.folders: - if "/Forms" in f.serverRelativeUrl: - continue - files += self._list_files(f, recursive) - return files - except ClientRequestException as e: - if e.response.status_code != 404: - logger.info("Caught an error while processing documents %s", e.response.text) - return [] - - @requires_dependencies(["office365"]) - def _list_pages(self, site_client) -> list: - from office365.runtime.client_request_exception import ClientRequestException - - try: - pages = site_client.site_pages.pages.get().execute_query() - page_files = [] - - for page_meta in pages: - page_url = page_meta.get_property("Url", None) - if page_url is None: - logger.info("Missing site_url. Omitting page... ") - break - page_url = f"/{page_url}" if page_url[0] != "/" else page_url - file_page = site_client.web.get_file_by_server_relative_path(page_url) - site_path = None - if (url_path := (urlparse(site_client.base_url).path)) and (url_path != "/"): - site_path = url_path[1:] - page_files.append( - [file_page, {"page": page_meta, "site_path": site_path}], - ) - except ClientRequestException as e: - logger.info("Caught an error while processing pages %s", e.response.text) - return [] - - return page_files - - def initialize(self): - pass - - def _ingest_site_docs(self, site_client) -> List["SharepointIngestDoc"]: - root_folder = site_client.web.get_folder_by_server_relative_path(self.config.path) - files = self._list_files(root_folder, self.config.recursive) - if not files: - logger.info( - f"Couldn't process files in path {self.config.path} \ - for site {site_client.base_url}", - ) - output = [SharepointIngestDoc(self.standard_config, self.config, f, {}) for f in files] - if self.config.process_pages: - page_files = self._list_pages(site_client) - if not page_files: - logger.info(f"Couldn't process pages for site {site_client.base_url}") - page_output = [ - SharepointIngestDoc(self.standard_config, self.config, f[0], f[1]) - for f in page_files - ] - output = output + page_output - return output - - def _filter_site_url(self, site): - if site.url is None: - return False - return (site.url[0 : len(self.base_site_url)] == self.base_site_url) and ( # noqa: E203 - "/sites/" in site.url - ) - - @requires_dependencies(["office365"]) - def get_ingest_docs(self): - if self.process_all: - logger.debug(self.base_site_url) - from office365.runtime.auth.client_credential import ClientCredential - from office365.sharepoint.client_context import ClientContext - from office365.sharepoint.tenant.administration.tenant import Tenant - - tenant = Tenant(self.client) - tenant_sites = tenant.get_site_properties_from_sharepoint_by_filters().execute_query() - tenant_sites = [s.url for s in tenant_sites if self._filter_site_url(s)] - tenant_sites.append(self.base_site_url) - ingest_docs: List[SharepointIngestDoc] = [] - for site_url in set(tenant_sites): - logger.info(f"Processing docs for site: {site_url}") - site_client = ClientContext(site_url).with_credentials( - ClientCredential(self.config.client_id, self.config.client_credential), - ) - 
ingest_docs = ingest_docs + self._ingest_site_docs(site_client) - return ingest_docs - else: - return self._ingest_site_docs(self.client) diff --git a/src/unstructured/ingest/connector/slack.py b/src/unstructured/ingest/connector/slack.py deleted file mode 100644 index da933cc..0000000 --- a/src/unstructured/ingest/connector/slack.py +++ /dev/null @@ -1,172 +0,0 @@ -import os -from dataclasses import dataclass -from datetime import datetime -from pathlib import Path -from typing import List, Optional - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger -from unstructured.utils import ( - requires_dependencies, - validate_date_args, -) - -DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z") - - -@dataclass -class SimpleSlackConfig(BaseConnectorConfig): - """Connector config to process all messages by channel id's.""" - - channels: List[str] - token: str - oldest: Optional[str] - latest: Optional[str] - verbose: bool = False - - def validate_inputs(self): - oldest_valid = True - latest_valid = True - - if self.oldest: - oldest_valid = validate_date_args(self.oldest) - - if self.latest: - latest_valid = validate_date_args(self.latest) - - return oldest_valid, latest_valid - - def __post_init__(self): - oldest_valid, latest_valid = self.validate_inputs() - if not oldest_valid and not latest_valid: - raise ValueError( - "Start and/or End dates are not valid. ", - ) - - @staticmethod - def parse_channels(channel_str: str) -> List[str]: - """Parses a comma separated list of channels into a list.""" - return [x.strip() for x in channel_str.split(",")] - - -@dataclass -class SlackIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - """Class encapsulating fetching a doc and writing processed results (but not - doing the processing!). - - Also includes a cleanup method. When things go wrong and the cleanup - method is not called, the file is left behind on the filesystem to assist debugging. 
- """ - - config: SimpleSlackConfig - channel: str - token: str - oldest: Optional[str] - latest: Optional[str] - - # NOTE(crag): probably doesn't matter, but intentionally not defining tmp_download_file - # __post_init__ for multiprocessing simplicity (no Path objects in initially - # instantiated object) - def _tmp_download_file(self): - channel_file = self.channel + ".txt" - return Path(self.standard_config.download_dir) / channel_file - - @property - def _output_filename(self): - output_file = self.channel + ".json" - return Path(self.standard_config.output_dir) / output_file - - def _create_full_tmp_dir_path(self): - self._tmp_download_file().parent.mkdir(parents=True, exist_ok=True) - - @BaseIngestDoc.skip_if_file_exists - @requires_dependencies(dependencies=["slack_sdk"], extras="slack") - def get_file(self): - from slack_sdk import WebClient - from slack_sdk.errors import SlackApiError - - """Fetches the data from a slack channel and stores it locally.""" - - self._create_full_tmp_dir_path() - - if self.config.verbose: - logger.debug(f"fetching channel {self.channel} - PID: {os.getpid()}") - - messages = [] - self.client = WebClient(token=self.token) - - try: - oldest = "0" - latest = "0" - if self.oldest: - oldest = self.convert_datetime(self.oldest) - - if self.latest: - latest = self.convert_datetime(self.latest) - - result = self.client.conversations_history( - channel=self.channel, - oldest=oldest, - latest=latest, - ) - messages.extend(result["messages"]) - while result["has_more"]: - result = self.client.conversations_history( - channel=self.channel, - oldest=oldest, - latest=latest, - cursor=result["response_metadata"]["next_cursor"], - ) - messages.extend(result["messages"]) - except SlackApiError as e: - logger.error(f"Error: {e}") - - with open(self._tmp_download_file(), "w") as channel_file: - for message in messages: - channel_file.write(message["text"] + "\n") - - def convert_datetime(self, date_time): - for format in DATE_FORMATS: - try: - return datetime.strptime(date_time, format).timestamp() - except ValueError: - pass - - @property - def filename(self): - """The filename of the file created from a slack channel""" - return self._tmp_download_file() - - -@requires_dependencies(dependencies=["slack_sdk"], extras="slack") -class SlackConnector(ConnectorCleanupMixin, BaseConnector): - """Objects of this class support fetching document(s) from""" - - config: SimpleSlackConfig - - def __init__(self, standard_config: StandardConnectorConfig, config: SimpleSlackConfig): - super().__init__(standard_config, config) - - def initialize(self): - """Verify that can get metadata for an object, validates connections info.""" - pass - - def get_ingest_docs(self): - return [ - SlackIngestDoc( - self.standard_config, - self.config, - channel, - self.config.token, - self.config.oldest, - self.config.latest, - ) - for channel in self.config.channels - ] diff --git a/src/unstructured/ingest/connector/wikipedia.py b/src/unstructured/ingest/connector/wikipedia.py deleted file mode 100644 index 90b97dc..0000000 --- a/src/unstructured/ingest/connector/wikipedia.py +++ /dev/null @@ -1,135 +0,0 @@ -import os -from dataclasses import dataclass, field -from pathlib import Path -from typing import TYPE_CHECKING - -from unstructured.ingest.interfaces import ( - BaseConnector, - BaseConnectorConfig, - BaseIngestDoc, - ConnectorCleanupMixin, - IngestDocCleanupMixin, - StandardConnectorConfig, -) -from unstructured.ingest.logger import logger - -if TYPE_CHECKING: - from wikipedia import 
WikipediaPage - - -@dataclass -class SimpleWikipediaConfig(BaseConnectorConfig): - title: str - auto_suggest: bool - - -@dataclass -class WikipediaIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): - config: SimpleWikipediaConfig = field(repr=False) - page: "WikipediaPage" - - @property - def filename(self) -> Path: - raise NotImplementedError() - - @property - def text(self) -> str: - raise NotImplementedError() - - @property - def _output_filename(self): - raise NotImplementedError() - - def _create_full_tmp_dir_path(self): - self.filename.parent.mkdir(parents=True, exist_ok=True) - - @BaseIngestDoc.skip_if_file_exists - def get_file(self): - """Fetches the "remote" doc and stores it locally on the filesystem.""" - self._create_full_tmp_dir_path() - logger.debug(f"Fetching {self} - PID: {os.getpid()}") - with open(self.filename, "w", encoding="utf8") as f: - f.write(self.text) - - -class WikipediaIngestHTMLDoc(WikipediaIngestDoc): - @property - def filename(self) -> Path: - return ( - Path(self.standard_config.download_dir) - / f"{self.page.title}-{self.page.revision_id}.html" - ).resolve() - - @property - def text(self): - return self.page.html() - - @property - def _output_filename(self): - return ( - Path(self.standard_config.output_dir) - / f"{self.page.title}-{self.page.revision_id}-html.json" - ) - - -class WikipediaIngestTextDoc(WikipediaIngestDoc): - @property - def filename(self) -> Path: - return ( - Path(self.standard_config.download_dir) - / f"{self.page.title}-{self.page.revision_id}.txt" - ).resolve() - - @property - def text(self): - return self.page.content - - @property - def _output_filename(self): - return ( - Path(self.standard_config.output_dir) - / f"{self.page.title}-{self.page.revision_id}-txt.json" - ) - - -class WikipediaIngestSummaryDoc(WikipediaIngestDoc): - @property - def filename(self) -> Path: - return ( - Path(self.standard_config.download_dir) - / f"{self.page.title}-{self.page.revision_id}-summary.txt" - ).resolve() - - @property - def text(self): - return self.page.summary - - @property - def _output_filename(self): - return ( - Path(self.standard_config.output_dir) - / f"{self.page.title}-{self.page.revision_id}-summary.json" - ) - - -class WikipediaConnector(ConnectorCleanupMixin, BaseConnector): - config: SimpleWikipediaConfig - - def __init__(self, config: SimpleWikipediaConfig, standard_config: StandardConnectorConfig): - super().__init__(standard_config, config) - - def initialize(self): - pass - - def get_ingest_docs(self): - import wikipedia - - page = wikipedia.page( - self.config.title, - auto_suggest=self.config.auto_suggest, - ) - return [ - WikipediaIngestTextDoc(self.standard_config, self.config, page), - WikipediaIngestHTMLDoc(self.standard_config, self.config, page), - WikipediaIngestSummaryDoc(self.standard_config, self.config, page), - ] diff --git a/src/unstructured/ingest/doc_processor/__init__.py b/src/unstructured/ingest/doc_processor/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/unstructured/ingest/doc_processor/generalized.py b/src/unstructured/ingest/doc_processor/generalized.py deleted file mode 100644 index 243d465..0000000 --- a/src/unstructured/ingest/doc_processor/generalized.py +++ /dev/null @@ -1,65 +0,0 @@ -"""Process arbitrary files with the Unstructured library""" - -import os -from typing import Any, Dict, List, Optional - -from unstructured_inference.models.base import get_model - -from unstructured.ingest.interfaces import BaseIngestDoc as IngestDoc -from unstructured.ingest.interfaces 
import ( - BaseSessionHandle, - IngestDocSessionHandleMixin, -) -from unstructured.ingest.logger import logger - -# module-level variable to store session handle -session_handle: Optional[BaseSessionHandle] = None - - -def initialize(): - """Download default model or model specified by UNSTRUCTURED_HI_RES_MODEL_NAME environment - variable (avoids subprocesses all doing the same)""" - - # If more than one model will be supported and left up to user selection - supported_model = os.environ.get("UNSTRUCTURED_HI_RES_SUPPORTED_MODEL", "") - if supported_model: - for model_name in supported_model.split(","): - get_model(model_name=model_name) - - get_model(os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")) - - -def process_document(doc: "IngestDoc", **partition_kwargs) -> Optional[List[Dict[str, Any]]]: - """Process any IngestDoc-like class of document with chosen Unstructured's partition logic. - - Parameters - ---------- - partition_kwargs - ultimately the parameters passed to partition() - """ - global session_handle - isd_elems_no_filename = None - try: - if isinstance(doc, IngestDocSessionHandleMixin): - if session_handle is None: - # create via doc.session_handle, which is a property that creates a - # session handle if one is not already defined - session_handle = doc.session_handle - else: - doc.session_handle = session_handle - # does the work necessary to load file into filesystem - # in the future, get_file_handle() could also be supported - doc.get_file() - - isd_elems_no_filename = doc.process_file(**partition_kwargs) - - # Note, this may be a no-op if the IngestDoc doesn't do anything to persist - # the results. Instead, the Processor (caller) may work with the aggregate - # results across all docs in memory. - doc.write_result() - except Exception: - # TODO(crag) save the exception instead of print? - logger.error(f"Failed to process {doc}", exc_info=True) - finally: - doc.cleanup_file() - return isd_elems_no_filename diff --git a/src/unstructured/ingest/interfaces.py b/src/unstructured/ingest/interfaces.py deleted file mode 100644 index cc686ab..0000000 --- a/src/unstructured/ingest/interfaces.py +++ /dev/null @@ -1,361 +0,0 @@ -"""Defines Abstract Base Classes (ABC's) core to batch processing documents -through Unstructured.""" - -import functools -import json -import os -from abc import ABC, abstractmethod -from dataclasses import dataclass -from datetime import datetime -from pathlib import Path -from typing import Any, Dict, List, Optional - -import requests - -from unstructured.documents.elements import DataSourceMetadata -from unstructured.ingest.logger import logger -from unstructured.partition.auto import partition -from unstructured.staging.base import convert_to_dict - - -@dataclass -class BaseSessionHandle(ABC): - """Abstract Base Class for sharing resources that are local to an individual process. 
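The module-level session_handle above is what keeps each worker subprocess down to a single shared resource: the first document processed creates the handle through its session_handle property, and every later document has that same handle assigned before fetching. A stripped-down sketch of the flow in process_document() (the empty docs list is a stand-in for whatever IngestDocSessionHandleMixin instances a connector yields):

    docs: list = []  # IngestDocSessionHandleMixin instances from a connector
    session_handle = None

    for doc in docs:
        if session_handle is None:
            # the property lazily calls config.create_session_handle()
            session_handle = doc.session_handle
        else:
            # reuse the existing handle instead of creating another one
            doc.session_handle = session_handle
        doc.get_file()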
- e.g., a connection for making a request for fetching documents.""" - - -@dataclass -class ProcessorConfigs: - """Common set of config required when running data connectors.""" - - partition_strategy: str - partition_ocr_languages: str - partition_pdf_infer_table_structure: bool - partition_encoding: str - num_processes: int - reprocess: bool - max_docs: int - - -@dataclass -class StandardConnectorConfig: - """Common set of config options passed to all connectors.""" - - # where raw documents are stored for processing, and then removed if not preserve_downloads - download_dir: str - # where to write structured data outputs - output_dir: str - download_only: bool = False - fields_include: str = "element_id,text,type,metadata" - flatten_metadata: bool = False - metadata_exclude: Optional[str] = None - metadata_include: Optional[str] = None - partition_by_api: bool = False - partition_endpoint: str = "https://api.unstructured.io/general/v0/general" - api_key: str = "" - preserve_downloads: bool = False - re_download: bool = False - - -class BaseConnectorConfig(ABC): - """Abstract definition on which to define connector-specific attributes.""" - - -@dataclass -class BaseConnector(ABC): - """Abstract Base Class for a connector to a remote source, e.g. S3 or Google Drive.""" - - standard_config: StandardConnectorConfig - config: BaseConnectorConfig - - def __init__(self, standard_config: StandardConnectorConfig, config: BaseConnectorConfig): - """Expects a standard_config object that implements StandardConnectorConfig - and a config object that implements BaseConnectorConfig.""" - self.standard_config = standard_config - self.config = config - - @abstractmethod - def cleanup(self, cur_dir=None): - """Any additional cleanup needed after processing is complete. E.g., removing - temporary download dirs that are empty. - - By convention, documents that failed to process are typically not cleaned up.""" - pass - - @abstractmethod - def initialize(self): - """Initializes the connector. Should also validate the connector is properly - configured: e.g., list a single document from the source.""" - pass - - @abstractmethod - def get_ingest_docs(self): - """Returns all ingest docs (derived from BaseIngestDoc). - This does not imply downloading all the raw documents themselves, - rather each IngestDoc is capable of fetching its content (in another process) - with IngestDoc.get_file().""" - pass - - -@dataclass -class BaseIngestDoc(ABC): - """An "ingest document" is specific to a connector, and provides - methods to fetch a single raw document, store it locally for processing, any cleanup - needed after successful processing of the doc, and the ability to write the doc's - structured outputs once processed. - - Crucially, it is not responsible for the actual processing of the raw document. - """ - - standard_config: StandardConnectorConfig - config: BaseConnectorConfig - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._date_processed = None - - @property - def date_created(self) -> Optional[str]: - """The date the document was created on the source system.""" - return None - - @property - def date_modified(self) -> Optional[str]: - """The date the document was last modified on the source system.""" - return None - - @property - def date_processed(self) -> Optional[str]: - """The date the document was last processed by Unstructured. 
- self._date_processed is assigned internally in self.partition_file()""" - return self._date_processed - - @property - def exists(self) -> Optional[bool]: - """Whether the document exists on the remote source.""" - return None - - @property - @abstractmethod - def filename(self): - """The local filename of the document after fetching from remote source.""" - - @property - @abstractmethod - def _output_filename(self): - """Filename of the structured output for this doc.""" - - @property - def record_locator(self) -> Optional[Dict[str, Any]]: # Values must be JSON-serializable - """A dictionary with any data necessary to uniquely identify the document on - the source system.""" - return None - - @property - def source_url(self) -> Optional[str]: - """The url of the source document.""" - return None - - @property - def version(self) -> Optional[str]: - """The version of the source document; this could be the last modified date, an - explicit version number, or anything else that can be used to uniquely identify - the version of the document.""" - return None - - @abstractmethod - def cleanup_file(self): - """Removes the local copy of the file (or anything else) after successful processing.""" - pass - - @staticmethod - def skip_if_file_exists(func): - """Decorator that checks if a file exists, is not empty, and should not re-download; - if so, log a message indicating as much and skip the decorated function.""" - - @functools.wraps(func) - def wrapper(self, *args, **kwargs): - if ( - not self.standard_config.re_download - and self.filename.is_file() - and self.filename.stat().st_size - ): - logger.debug(f"File exists: {self.filename}, skipping {func.__name__}") - return None - return func(self, *args, **kwargs) - - return wrapper - - # NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods - # in addition to or instead of get_file() - @abstractmethod - def get_file(self): - """Fetches the "remote" doc and stores it locally on the filesystem.""" - pass - - def has_output(self) -> bool: - """Determine if structured output for this doc already exists.""" - return self._output_filename.is_file() and self._output_filename.stat().st_size - - def write_result(self): - """Write the structured json result for this doc. 
result must be json serializable.""" - if self.standard_config.download_only: - return - self._output_filename.parent.mkdir(parents=True, exist_ok=True) - with open(self._output_filename, "w", encoding="utf8") as output_f: - json.dump(self.isd_elems_no_filename, output_f, ensure_ascii=False, indent=2) - logger.info(f"Wrote {self._output_filename}") - - def partition_file(self, **partition_kwargs) -> List[Dict[str, Any]]: - if not self.standard_config.partition_by_api: - logger.debug("Using local partition") - elements = partition( - filename=str(self.filename), - data_source_metadata=DataSourceMetadata( - url=self.source_url, - version=self.version, - record_locator=self.record_locator, - date_created=self.date_created, - date_modified=self.date_modified, - date_processed=self.date_processed, - ), - **partition_kwargs, - ) - return convert_to_dict(elements) - - else: - endpoint = self.standard_config.partition_endpoint - - logger.debug(f"Using remote partition ({endpoint})") - - with open(self.filename, "rb") as f: - headers_dict = {} - if len(self.standard_config.api_key) > 0: - headers_dict["UNSTRUCTURED-API-KEY"] = self.standard_config.api_key - response = requests.post( - f"{endpoint}", - files={"files": (str(self.filename), f)}, - headers=headers_dict, - # TODO: add m_data_source_metadata to unstructured-api pipeline_api and then - # pass the stringified json here - ) - - if response.status_code != 200: - raise RuntimeError(f"Caught {response.status_code} from API: {response.text}") - - return response.json() - - def process_file(self, **partition_kwargs) -> Optional[List[Dict[str, Any]]]: - self._date_processed = datetime.utcnow().isoformat() - if self.standard_config.download_only: - return None - logger.info(f"Processing {self.filename}") - - isd_elems = self.partition_file(**partition_kwargs) - - self.isd_elems_no_filename = [] - for elem in isd_elems: - # type: ignore - if ( - self.standard_config.metadata_exclude is not None - and self.standard_config.metadata_include is not None - ): - raise ValueError( - "Arguments `--metadata-include` and `--metadata-exclude` are " - "mutually exclusive with each other.", - ) - elif self.standard_config.metadata_exclude is not None: - ex_list = self.standard_config.metadata_exclude.split(",") - for ex in ex_list: - if "." 
in ex: # handle nested fields - nested_fields = ex.split(".") - current_elem = elem - for field in nested_fields[:-1]: - if field in current_elem: - current_elem = current_elem[field] - field_to_exclude = nested_fields[-1] - if field_to_exclude in current_elem: - current_elem.pop(field_to_exclude, None) - else: # handle top-level fields - elem["metadata"].pop(ex, None) # type: ignore[attr-defined] - elif self.standard_config.metadata_include is not None: - in_list = self.standard_config.metadata_include.split(",") - for k in list(elem["metadata"].keys()): # type: ignore[attr-defined] - if k not in in_list: - elem["metadata"].pop(k, None) # type: ignore[attr-defined] - - in_list = self.standard_config.fields_include.split(",") - elem = {k: v for k, v in elem.items() if k in in_list} - - if self.standard_config.flatten_metadata: - for k, v in elem["metadata"].items(): # type: ignore[attr-defined] - elem[k] = v - elem.pop("metadata") # type: ignore[attr-defined] - - self.isd_elems_no_filename.append(elem) - - return self.isd_elems_no_filename - - -class ConnectorCleanupMixin: - standard_config: StandardConnectorConfig - - def cleanup(self, cur_dir=None): - """Recursively clean up downloaded files and directories.""" - if self.standard_config.preserve_downloads or self.standard_config.download_only: - return - if cur_dir is None: - cur_dir = self.standard_config.download_dir - if cur_dir is None or not Path(cur_dir).is_dir(): - return - sub_dirs = os.listdir(cur_dir) - os.chdir(cur_dir) - for sub_dir in sub_dirs: - # don't traverse symlinks, not that there ever should be any - if os.path.isdir(sub_dir) and not os.path.islink(sub_dir): - self.cleanup(sub_dir) - os.chdir("..") - if len(os.listdir(cur_dir)) == 0: - os.rmdir(cur_dir) - - -class IngestDocCleanupMixin: - standard_config: StandardConnectorConfig - - @property - @abstractmethod - def filename(self): - """The local filename of the document after fetching from the remote source.""" - - def cleanup_file(self): - """Removes the local copy of the file after successful processing.""" - if ( - not self.standard_config.preserve_downloads - and self.filename.is_file() - and not self.standard_config.download_only - ): - logger.debug(f"Cleaning up {self}") - os.unlink(self.filename) - - -class ConfigSessionHandleMixin: - @abstractmethod - def create_session_handle(self) -> BaseSessionHandle: - """Creates a session handle that will be assigned to each IngestDoc to share - session-related resources across all document handling for a given subprocess.""" - - -class IngestDocSessionHandleMixin: - config: ConfigSessionHandleMixin - _session_handle: Optional[BaseSessionHandle] = None - - @property - def session_handle(self): - """If a session handle is not assigned, creates a new one and assigns it.""" - if self._session_handle is None: - self._session_handle = self.config.create_session_handle() - return self._session_handle - - @session_handle.setter - def session_handle(self, session_handle: BaseSessionHandle): - self._session_handle = session_handle diff --git a/src/unstructured/ingest/logger.py b/src/unstructured/ingest/logger.py deleted file mode 100644 index 752662c..0000000 --- a/src/unstructured/ingest/logger.py +++ /dev/null @@ -1,27 +0,0 @@ -import logging - -logger = logging.getLogger("unstructured.ingest") - - -def ingest_log_streaming_init(level: int) -> None: - handler = logging.StreamHandler() - handler.name = "ingest_log_handler" - formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s") -
handler.setFormatter(formatter) - - # Only want to add the handler once - if "ingest_log_handler" not in [h.name for h in logger.handlers]: - logger.addHandler(handler) - - logger.setLevel(level) - - -def make_default_logger(level: int) -> logging.Logger: - """Return a custom logger.""" - logger = logging.getLogger("unstructured.ingest") - handler = logging.StreamHandler() - handler.name = "ingest_log_handler" - formatter = logging.Formatter("%(asctime)s %(processName)-10s %(levelname)-8s %(message)s") - handler.setFormatter(formatter) - logger.addHandler(handler) - logger.setLevel(level) - return logger diff --git a/src/unstructured/ingest/main.py b/src/unstructured/ingest/main.py deleted file mode 100755 index ead616f..0000000 --- a/src/unstructured/ingest/main.py +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env python3 -from unstructured.ingest.cli.cli import get_cmd - - -def main(): - ingest_cmd = get_cmd() - ingest_cmd() - - -if __name__ == "__main__": - main() diff --git a/src/unstructured/ingest/processor.py b/src/unstructured/ingest/processor.py deleted file mode 100644 index dbf75d6..0000000 --- a/src/unstructured/ingest/processor.py +++ /dev/null @@ -1,112 +0,0 @@ -import logging -import multiprocessing as mp -from contextlib import suppress -from functools import partial - -from unstructured.ingest.doc_processor.generalized import initialize, process_document -from unstructured.ingest.interfaces import ( - BaseConnector, - ProcessorConfigs, -) -from unstructured.ingest.logger import ingest_log_streaming_init, logger - -with suppress(RuntimeError): - mp.set_start_method("spawn") - - -class Processor: - def __init__( - self, - doc_connector, - doc_processor_fn, - num_processes, - reprocess, - verbose, - max_docs, - ): - # initialize the reader and writer - self.doc_connector = doc_connector - self.doc_processor_fn = doc_processor_fn - self.num_processes = num_processes - self.reprocess = reprocess - self.verbose = verbose - self.max_docs = max_docs - - def initialize(self): - """Slower initialization things: check connections, load things into memory, etc.""" - ingest_log_streaming_init(logging.DEBUG if self.verbose else logging.INFO) - self.doc_connector.initialize() - initialize() - - def cleanup(self): - self.doc_connector.cleanup() - - def _filter_docs_with_outputs(self, docs): - num_docs_all = len(docs) - docs = [doc for doc in docs if not doc.has_output()] - if self.max_docs is not None: - if num_docs_all > self.max_docs: - num_docs_all = self.max_docs - docs = docs[: self.max_docs] - num_docs_to_process = len(docs) - if num_docs_to_process == 0: - logger.info( - "All docs have structured outputs, nothing to do.
Use --reprocess to process all.", - ) - return None - elif num_docs_to_process != num_docs_all: - logger.info( - f"Skipping processing for {num_docs_all - num_docs_to_process} docs out of " - f"{num_docs_all} since their structured outputs already exist; use --reprocess to " - "reprocess those in addition to the unprocessed ones.", - ) - return docs - - def run(self): - self.initialize() - - # fetch the list of lazily downloading IngestDoc objects - docs = self.doc_connector.get_ingest_docs() - - # remove docs that have already been processed - if not self.reprocess: - docs = self._filter_docs_with_outputs(docs) - if not docs: - return - - # Debugging tip: use the line below and comment out the mp.Pool block - # to remain in a single process - # self.doc_processor_fn(docs[0]) - logger.info(f"Processing {len(docs)} docs") - try: - with mp.Pool( - processes=self.num_processes, - initializer=ingest_log_streaming_init, - initargs=(logging.DEBUG if self.verbose else logging.INFO,), - ) as pool: - pool.map(self.doc_processor_fn, docs) - finally: - self.cleanup() - - -def process_documents( - doc_connector: BaseConnector, - processor_config: ProcessorConfigs, - verbose: bool = False, -) -> None: - process_document_with_partition_args = partial( - process_document, - strategy=processor_config.partition_strategy, - ocr_languages=processor_config.partition_ocr_languages, - encoding=processor_config.partition_encoding, - pdf_infer_table_structure=processor_config.partition_pdf_infer_table_structure, - ) - - Processor( - doc_connector=doc_connector, - doc_processor_fn=process_document_with_partition_args, - num_processes=processor_config.num_processes, - reprocess=processor_config.reprocess, - verbose=verbose, - max_docs=processor_config.max_docs, - ).run() diff --git a/src/unstructured/ingest/runner/__init__.py b/src/unstructured/ingest/runner/__init__.py deleted file mode 100644 index 5bf4285..0000000 --- a/src/unstructured/ingest/runner/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -from .airtable import airtable -from .azure import azure -from .biomed import biomed -from .box import box -from .confluence import confluence -from .discord import discord -from .dropbox import dropbox -from .elasticsearch import elasticsearch -from .fsspec import fsspec -from .gcs import gcs -from .github import github -from .gitlab import gitlab -from .google_drive import gdrive -from .local import local -from .notion import notion -from .onedrive import onedrive -from .outlook import outlook -from .reddit import reddit -from .s3 import s3 -from .sharepoint import sharepoint -from .slack import slack -from .wikipedia import wikipedia - -__all__ = [ - "airtable", - "azure", - "biomed", - "box", - "confluence", - "discord", - "dropbox", - "elasticsearch", - "fsspec", - "gcs", - "gdrive", - "github", - "gitlab", - "local", - "notion", - "onedrive", - "outlook", - "reddit", - "s3", - "sharepoint", - "slack", - "wikipedia", -]
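# NOTE: a condensed sketch, not part of this diff, of the call pattern every
# runner module below follows in the tree being removed: build a connector from
# its Simple*Config plus the shared StandardConnectorConfig, then hand it to
# process_documents(), which fans the connector's ingest docs out over a
# multiprocessing pool. The local runner is the simplest concrete case; the
# input_path value here is illustrative.
from unstructured.ingest.connector.local import LocalConnector, SimpleLocalConfig
from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig
from unstructured.ingest.processor import process_documents


def run_local_example(
    connector_config: StandardConnectorConfig,
    processor_config: ProcessorConfigs,
) -> None:
    doc_connector = LocalConnector(
        standard_config=connector_config,
        config=SimpleLocalConfig(
            input_path="example-docs/",  # illustrative path
            recursive=True,
            file_glob=None,
        ),
    )
    process_documents(doc_connector=doc_connector, processor_config=processor_config)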
diff --git a/src/unstructured/ingest/runner/airtable.py b/src/unstructured/ingest/runner/airtable.py deleted file mode 100644 index f1536f4..0000000 --- a/src/unstructured/ingest/runner/airtable.py +++ /dev/null @@ -1,43 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def airtable( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - personal_access_token: str, - list_of_paths: Optional[str], - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - personal_access_token.encode("utf-8"), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.airtable import ( - AirtableConnector, - SimpleAirtableConfig, - ) - - doc_connector = AirtableConnector( # type: ignore - standard_config=connector_config, - config=SimpleAirtableConfig( - personal_access_token=personal_access_token, - list_of_paths=list_of_paths, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/azure.py b/src/unstructured/ingest/runner/azure.py deleted file mode 100644 index 93bb796..0000000 --- a/src/unstructured/ingest/runner/azure.py +++ /dev/null @@ -1,57 +0,0 @@ -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_remote_url - - -def azure( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - account_name: Optional[str], - account_key: Optional[str], - connection_string: Optional[str], - remote_url: str, - recursive: bool, - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - if not account_name and not connection_string: - raise ValueError( - "missing either account-name or connection-string", - ) - - connector_config.download_dir = update_download_dir_remote_url( - connector_config=connector_config, - remote_url=remote_url, - logger=logger, - ) - - from unstructured.ingest.connector.azure import ( - AzureBlobStorageConnector, - SimpleAzureBlobStorageConfig, - ) - - if account_name: - access_kwargs = { - "account_name": account_name, - "account_key": account_key, - } - elif connection_string: - access_kwargs = {"connection_string": connection_string} - else: - access_kwargs = {} - doc_connector = AzureBlobStorageConnector( # type: ignore - standard_config=connector_config, - config=SimpleAzureBlobStorageConfig( - path=remote_url, - recursive=recursive, - access_kwargs=access_kwargs, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/biomed.py b/src/unstructured/ingest/runner/biomed.py deleted file mode 100644 index ca92e92..0000000 --- a/src/unstructured/ingest/runner/biomed.py +++ /dev/null @@ -1,63 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def biomed( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - path: Optional[str], - api_id: Optional[str], - api_from: Optional[str], - api_until: Optional[str], - max_retries: int, - max_request_time: int, - decay: float,
- **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - base_path = ( - path - if path - else "{}-{}-{}".format( - api_id if api_id else "", - api_from if api_from else "", - api_until if api_until else "", - ) - ) - - hashed_dir_name = hashlib.sha256( - base_path.encode("utf-8"), - ) - - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.biomed import ( - BiomedConnector, - SimpleBiomedConfig, - ) - - doc_connector = BiomedConnector( # type: ignore - standard_config=connector_config, - config=SimpleBiomedConfig( - path=path, - id_=api_id, - from_=api_from, - until=api_until, - max_retries=max_retries, - request_timeout=max_request_time, - decay=decay, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/box.py b/src/unstructured/ingest/runner/box.py deleted file mode 100644 index 0b9961b..0000000 --- a/src/unstructured/ingest/runner/box.py +++ /dev/null @@ -1,38 +0,0 @@ -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_remote_url - - -def box( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - remote_url: str, - recursive: bool, - box_app_config: Optional[str], - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - connector_config.download_dir = update_download_dir_remote_url( - connector_config=connector_config, - remote_url=remote_url, - logger=logger, - ) - - from unstructured.ingest.connector.box import BoxConnector, SimpleBoxConfig - - doc_connector = BoxConnector( # type: ignore - standard_config=connector_config, - config=SimpleBoxConfig( - path=remote_url, - recursive=recursive, - access_kwargs={"box_app_config": box_app_config}, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/confluence.py b/src/unstructured/ingest/runner/confluence.py deleted file mode 100644 index 366ecaf..0000000 --- a/src/unstructured/ingest/runner/confluence.py +++ /dev/null @@ -1,51 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def confluence( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - url: str, - user_email: str, - api_token: str, - list_of_spaces: Optional[str], - max_num_of_spaces: int, - max_num_of_docs_from_each_space: int, - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - url.encode("utf-8"), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.confluence import ( - ConfluenceConnector, - 
SimpleConfluenceConfig, - ) - - doc_connector = ConfluenceConnector( # type: ignore - standard_config=connector_config, - config=SimpleConfluenceConfig( - url=url, - user_email=user_email, - api_token=api_token, - list_of_spaces=list_of_spaces, - max_number_of_spaces=max_num_of_spaces, - max_number_of_docs_from_each_space=max_num_of_docs_from_each_space, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/discord.py b/src/unstructured/ingest/runner/discord.py deleted file mode 100644 index f94546a..0000000 --- a/src/unstructured/ingest/runner/discord.py +++ /dev/null @@ -1,46 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def discord( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - channels: str, - token: str, - period: Optional[int], - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - channels.encode("utf-8"), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.discord import ( - DiscordConnector, - SimpleDiscordConfig, - ) - - doc_connector = DiscordConnector( # type: ignore - standard_config=connector_config, - config=SimpleDiscordConfig( - channels=SimpleDiscordConfig.parse_channels(channels), - days=period, - token=token, - verbose=verbose, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/dropbox.py b/src/unstructured/ingest/runner/dropbox.py deleted file mode 100644 index 5e7aeee..0000000 --- a/src/unstructured/ingest/runner/dropbox.py +++ /dev/null @@ -1,41 +0,0 @@ -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_remote_url - - -def dropbox( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - remote_url: str, - recursive: bool, - token: Optional[str], - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - connector_config.download_dir = update_download_dir_remote_url( - connector_config=connector_config, - remote_url=remote_url, - logger=logger, - ) - - from unstructured.ingest.connector.dropbox import ( - DropboxConnector, - SimpleDropboxConfig, - ) - - doc_connector = DropboxConnector( # type: ignore - standard_config=connector_config, - config=SimpleDropboxConfig( - path=remote_url, - recursive=recursive, - access_kwargs={"token": token}, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/elasticsearch.py b/src/unstructured/ingest/runner/elasticsearch.py deleted file mode 100644 index f6b066b..0000000 --- a/src/unstructured/ingest/runner/elasticsearch.py +++ /dev/null @@ 
-1,47 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def elasticsearch( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - url: str, - index_name: str, - jq_query: Optional[str], - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - f"{url}_{index_name}".encode( - "utf-8", - ), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.elasticsearch import ( - ElasticsearchConnector, - SimpleElasticsearchConfig, - ) - - doc_connector = ElasticsearchConnector( # type: ignore - standard_config=connector_config, - config=SimpleElasticsearchConfig( - url=url, - index_name=index_name, - jq_query=jq_query, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/fsspec.py b/src/unstructured/ingest/runner/fsspec.py deleted file mode 100644 index a82b10b..0000000 --- a/src/unstructured/ingest/runner/fsspec.py +++ /dev/null @@ -1,48 +0,0 @@ -import logging -import warnings -from urllib.parse import urlparse - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_remote_url - - -def fsspec( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - remote_url: str, - recursive: bool, - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - connector_config.download_dir = update_download_dir_remote_url( - connector_config=connector_config, - remote_url=remote_url, - logger=logger, - ) - - protocol = urlparse(remote_url).scheme - warnings.warn( - f"`fsspec` protocol {protocol} is not directly supported by `unstructured`," - " so use it at your own risk. 
Supported protocols are `gcs`, `gs`, `s3`, `s3a`," - " `dropbox`, `abfs` and `az`.", - UserWarning, - ) - - from unstructured.ingest.connector.fsspec import ( - FsspecConnector, - SimpleFsspecConfig, - ) - - doc_connector = FsspecConnector( # type: ignore - standard_config=connector_config, - config=SimpleFsspecConfig( - path=remote_url, - recursive=recursive, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/gcs.py b/src/unstructured/ingest/runner/gcs.py deleted file mode 100644 index d309d23..0000000 --- a/src/unstructured/ingest/runner/gcs.py +++ /dev/null @@ -1,38 +0,0 @@ -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_remote_url - - -def gcs( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - remote_url: str, - recursive: bool, - token: Optional[str], - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - connector_config.download_dir = update_download_dir_remote_url( - connector_config=connector_config, - remote_url=remote_url, - logger=logger, - ) - - from unstructured.ingest.connector.gcs import GcsConnector, SimpleGcsConfig - - doc_connector = GcsConnector( # type: ignore - standard_config=connector_config, - config=SimpleGcsConfig( - path=remote_url, - recursive=recursive, - access_kwargs={"token": token}, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/github.py b/src/unstructured/ingest/runner/github.py deleted file mode 100644 index 46d8bc8..0000000 --- a/src/unstructured/ingest/runner/github.py +++ /dev/null @@ -1,49 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def github( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - url: str, - git_branch: str, - git_access_token: Optional[str], - git_file_glob: Optional[str], - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - f"{url}_{git_branch}".encode( - "utf-8", - ), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.github import ( - GitHubConnector, - SimpleGitHubConfig, - ) - - doc_connector = GitHubConnector( # type: ignore - standard_config=connector_config, - config=SimpleGitHubConfig( - url=url, - access_token=git_access_token, - branch=git_branch, - file_glob=git_file_glob, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/gitlab.py b/src/unstructured/ingest/runner/gitlab.py deleted file mode 100644 index a75bb0d..0000000 --- a/src/unstructured/ingest/runner/gitlab.py +++ /dev/null @@ -1,49 +0,0 @@ -import
hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def gitlab( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - url: str, - git_branch: str, - git_access_token: Optional[str], - git_file_glob: Optional[str], - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - f"{url}_{git_branch}".encode( - "utf-8", - ), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.gitlab import ( - GitLabConnector, - SimpleGitLabConfig, - ) - - doc_connector = GitLabConnector( # type: ignore - standard_config=connector_config, - config=SimpleGitLabConfig( - url=url, - access_token=git_access_token, - branch=git_branch, - file_glob=git_file_glob, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/google_drive.py b/src/unstructured/ingest/runner/google_drive.py deleted file mode 100644 index ed1863b..0000000 --- a/src/unstructured/ingest/runner/google_drive.py +++ /dev/null @@ -1,47 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def gdrive( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - service_account_key: str, - recursive: bool, - drive_id: str, - extension: Optional[str], - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - drive_id.encode("utf-8"), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.google_drive import ( - GoogleDriveConnector, - SimpleGoogleDriveConfig, - ) - - doc_connector = GoogleDriveConnector( # type: ignore - standard_config=connector_config, - config=SimpleGoogleDriveConfig( - drive_id=drive_id, - service_account_key=service_account_key, - recursive=recursive, - extension=extension, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/local.py b/src/unstructured/ingest/runner/local.py deleted file mode 100644 index 4cde508..0000000 --- a/src/unstructured/ingest/runner/local.py +++ /dev/null @@ -1,34 +0,0 @@ -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init -from unstructured.ingest.processor import process_documents - - -def local( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - input_path: str, - recursive: bool, - file_glob: Optional[str], - **kwargs, -): - 
ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - from unstructured.ingest.connector.local import ( - LocalConnector, - SimpleLocalConfig, - ) - - doc_connector = LocalConnector( # type: ignore - standard_config=connector_config, - config=SimpleLocalConfig( - input_path=input_path, - recursive=recursive, - file_glob=file_glob, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/notion.py b/src/unstructured/ingest/runner/notion.py deleted file mode 100644 index 00f5c03..0000000 --- a/src/unstructured/ingest/runner/notion.py +++ /dev/null @@ -1,62 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def notion( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - api_key: str, - recursive: bool, - page_ids: Optional[str] = "", - database_ids: Optional[str] = "", - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - if not page_ids and not database_ids: - raise ValueError("neither page ids nor database ids provided") - - if page_ids and database_ids: - hashed_dir_name = hashlib.sha256( - f"{page_ids},{database_ids}".encode("utf-8"), - ) - elif page_ids: - hashed_dir_name = hashlib.sha256( - page_ids.encode("utf-8"), - ) - elif database_ids: - hashed_dir_name = hashlib.sha256( - database_ids.encode("utf-8"), - ) - else: - raise ValueError("could not create local cache directory name") - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.notion.connector import ( - NotionConnector, - SimpleNotionConfig, - ) - - doc_connector = NotionConnector( # type: ignore - standard_config=connector_config, - config=SimpleNotionConfig( - page_ids=SimpleNotionConfig.parse_ids(ids_str=page_ids) if page_ids else [], - database_ids=SimpleNotionConfig.parse_ids(ids_str=database_ids) if database_ids else [], - api_key=api_key, - verbose=verbose, - recursive=recursive, - logger=logger, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/onedrive.py b/src/unstructured/ingest/runner/onedrive.py deleted file mode 100644 index 4bc6cf9..0000000 --- a/src/unstructured/ingest/runner/onedrive.py +++ /dev/null @@ -1,53 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def onedrive( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - tenant: str, - user_pname: str, - client_id: str, - client_cred: str, - authority_url: Optional[str], - path: Optional[str], - recursive: bool, - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - f"{tenant}_{user_pname}".encode("utf-8"), - )
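# NOTE: illustrative aside, not part of this diff. update_download_dir_hash
# (defined in runner/utils.py, deleted further below) turns a digest like the
# one computed above into a per-connection cache directory whenever no
# --download-dir is given: ~/.cache/unstructured/ingest/ plus the first 10 hex
# chars of the digest. A self-contained equivalent, with made-up tenant/user
# values:
import hashlib
from pathlib import Path

hashed = hashlib.sha256("contoso_user@contoso.com".encode("utf-8"))
cache_dir = Path.home() / ".cache" / "unstructured" / "ingest" / hashed.hexdigest()[:10]
print(cache_dir)  # e.g. /home/user/.cache/unstructured/ingest/<10 hex chars>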
- connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.onedrive import ( - OneDriveConnector, - SimpleOneDriveConfig, - ) - - doc_connector = OneDriveConnector( # type: ignore - standard_config=connector_config, - config=SimpleOneDriveConfig( - client_id=client_id, - client_credential=client_cred, - user_pname=user_pname, - tenant=tenant, - authority_url=authority_url, - path=path, - recursive=recursive, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/outlook.py b/src/unstructured/ingest/runner/outlook.py deleted file mode 100644 index 9afb65a..0000000 --- a/src/unstructured/ingest/runner/outlook.py +++ /dev/null @@ -1,53 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def outlook( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - user_email: str, - client_id: Optional[str], - client_cred: Optional[str], - tenant: Optional[str], - authority_url: Optional[str], - outlook_folders: Optional[str], - recursive: bool, - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256(user_email.encode("utf-8")) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.outlook import ( - OutlookConnector, - SimpleOutlookConfig, - ) - - doc_connector = OutlookConnector( # type: ignore - standard_config=connector_config, - config=SimpleOutlookConfig( - client_id=client_id, - client_credential=client_cred, - user_email=user_email, - tenant=tenant, - authority_url=authority_url, - ms_outlook_folders=SimpleOutlookConfig.parse_folders(outlook_folders) - if outlook_folders - else [], - recursive=recursive, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/reddit.py b/src/unstructured/ingest/runner/reddit.py deleted file mode 100644 index 1ea5220..0000000 --- a/src/unstructured/ingest/runner/reddit.py +++ /dev/null @@ -1,51 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def reddit( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - subreddit_name: str, - client_id: Optional[str], - client_secret: Optional[str], - user_agent: str, - search_query: Optional[str], - num_posts: int, - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - subreddit_name.encode("utf-8"), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - 
logger=logger, - ) - - from unstructured.ingest.connector.reddit import ( - RedditConnector, - SimpleRedditConfig, - ) - - doc_connector = RedditConnector( # type: ignore - standard_config=connector_config, - config=SimpleRedditConfig( - subreddit_name=subreddit_name, - client_id=client_id, - client_secret=client_secret, - user_agent=user_agent, - search_query=search_query, - num_posts=num_posts, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/s3.py b/src/unstructured/ingest/runner/s3.py deleted file mode 100644 index f07a7ab..0000000 --- a/src/unstructured/ingest/runner/s3.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_remote_url - - -def s3( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - remote_url: str, - recursive: bool, - anonymous: bool, - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - connector_config.download_dir = update_download_dir_remote_url( - connector_config=connector_config, - remote_url=remote_url, - logger=logger, - ) - - from unstructured.ingest.connector.s3 import S3Connector, SimpleS3Config - - doc_connector = S3Connector( # type: ignore - standard_config=connector_config, - config=SimpleS3Config( - path=remote_url, - recursive=recursive, - access_kwargs={"anon": anonymous}, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/sharepoint.py b/src/unstructured/ingest/runner/sharepoint.py deleted file mode 100644 index 6eb7c8e..0000000 --- a/src/unstructured/ingest/runner/sharepoint.py +++ /dev/null @@ -1,50 +0,0 @@ -import hashlib -import logging - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def sharepoint( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - site: str, - client_id: str, - client_cred: str, - files_only: bool, - path: str, - recursive: bool, - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - f"{site}_{path}".encode("utf-8"), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.sharepoint import ( - SharepointConnector, - SimpleSharepointConfig, - ) - - doc_connector = SharepointConnector( # type: ignore - standard_config=connector_config, - config=SimpleSharepointConfig( - client_id=client_id, - client_credential=client_cred, - site_url=site, - path=path, - process_pages=(not files_only), - recursive=recursive, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/slack.py b/src/unstructured/ingest/runner/slack.py deleted file mode 100644 index 0607eb0..0000000 --- 
a/src/unstructured/ingest/runner/slack.py +++ /dev/null @@ -1,48 +0,0 @@ -import hashlib -import logging -from typing import Optional - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def slack( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - channels: str, - token: str, - start_date: Optional[str], - end_date: Optional[str], - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - channels.encode("utf-8"), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.slack import ( - SimpleSlackConfig, - SlackConnector, - ) - - doc_connector = SlackConnector( # type: ignore - standard_config=connector_config, - config=SimpleSlackConfig( - channels=SimpleSlackConfig.parse_channels(channels), - token=token, - oldest=start_date, - latest=end_date, - verbose=verbose, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/ingest/runner/utils.py b/src/unstructured/ingest/runner/utils.py deleted file mode 100644 index 9e59b38..0000000 --- a/src/unstructured/ingest/runner/utils.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import annotations - -import hashlib -import logging -from pathlib import Path - -from unstructured.ingest.interfaces import ( - StandardConnectorConfig, -) - - -def update_download_dir_remote_url( - connector_config: StandardConnectorConfig, - remote_url: str, - logger: logging.Logger, -) -> str: - hashed_dir_name = hashlib.sha256(remote_url.encode("utf-8")) - return update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - -def update_download_dir_hash( - connector_config: StandardConnectorConfig, - hashed_dir_name: hashlib._Hash, - logger: logging.Logger, -) -> str: - new_download_dir = connector_config.download_dir - if not connector_config.download_dir: - cache_path = Path.home() / ".cache" / "unstructured" / "ingest" - if not cache_path.exists(): - cache_path.mkdir(parents=True, exist_ok=True) - download_dir = cache_path / hashed_dir_name.hexdigest()[:10] - if connector_config.preserve_downloads: - logger.warning( - f"Preserving downloaded files but download_dir is not specified," - f" using {download_dir}", - ) - new_download_dir = str(download_dir) - logger.debug(f"updating download directory to: {new_download_dir}") - return new_download_dir diff --git a/src/unstructured/ingest/runner/wikipedia.py b/src/unstructured/ingest/runner/wikipedia.py deleted file mode 100644 index f5fce83..0000000 --- a/src/unstructured/ingest/runner/wikipedia.py +++ /dev/null @@ -1,42 +0,0 @@ -import hashlib -import logging - -from unstructured.ingest.interfaces import ProcessorConfigs, StandardConnectorConfig -from unstructured.ingest.logger import ingest_log_streaming_init, logger -from unstructured.ingest.processor import process_documents -from unstructured.ingest.runner.utils import update_download_dir_hash - - -def wikipedia( - verbose: bool, - connector_config: StandardConnectorConfig, - processor_config: ProcessorConfigs, - page_title: str, - 
auto_suggest: bool, - **kwargs, -): - ingest_log_streaming_init(logging.DEBUG if verbose else logging.INFO) - - hashed_dir_name = hashlib.sha256( - page_title.encode("utf-8"), - ) - connector_config.download_dir = update_download_dir_hash( - connector_config=connector_config, - hashed_dir_name=hashed_dir_name, - logger=logger, - ) - - from unstructured.ingest.connector.wikipedia import ( - SimpleWikipediaConfig, - WikipediaConnector, - ) - - doc_connector = WikipediaConnector( # type: ignore - standard_config=connector_config, - config=SimpleWikipediaConfig( - title=page_title, - auto_suggest=auto_suggest, - ), - ) - - process_documents(doc_connector=doc_connector, processor_config=processor_config) diff --git a/src/unstructured/nlp/__init__.py b/src/unstructured/nlp/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/unstructured/nlp/partition.py b/src/unstructured/nlp/partition.py deleted file mode 100644 index 3ffa991..0000000 --- a/src/unstructured/nlp/partition.py +++ /dev/null @@ -1,7 +0,0 @@ -# flake8: noqa -from unstructured.partition.pdf import partition_pdf -from unstructured.partition.text_type import ( - is_bulleted_text, - is_possible_narrative_text, - is_possible_title, -) diff --git a/src/unstructured/partition/__init__.py b/src/unstructured/partition/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/unstructured/staging/__init__.py b/src/unstructured/staging/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/test_image.py b/tests/test_image.py index 083f540..6f192c5 100644 --- a/tests/test_image.py +++ b/tests/test_image.py @@ -1,5 +1,5 @@ -from unstructured.documents.pdf_parser.image import ImageDocument -from unstructured.documents.html_utils import visualize_html, save_to_txt +from bisheng_unstructured.documents.pdf_parser.image import ImageDocument +from bisheng_unstructured.documents.html_utils import visualize_html, save_to_txt TEST_RT_URL = 'http://192.168.106.12:9001/v2.1/models/' diff --git a/tests/test_partition.py b/tests/test_partition.py index 4292287..117cae0 100644 --- a/tests/test_partition.py +++ b/tests/test_partition.py @@ -1,5 +1,5 @@ -from unstructured.partition.html import partition_html -from unstructured.documents.html_utils import visualize_html, save_to_txt +from bisheng_unstructured.partition.html import partition_html +from bisheng_unstructured.documents.html_utils import visualize_html, save_to_txt def test_html1(): diff --git a/tests/test_partition_image.py b/tests/test_partition_image.py index 809cfdb..7175e01 100644 --- a/tests/test_partition_image.py +++ b/tests/test_partition_image.py @@ -1,5 +1,5 @@ -from unstructured.partition.image import partition_image -from unstructured.documents.markdown import ( +from bisheng_unstructured.partition.image import partition_image +from bisheng_unstructured.documents.markdown import ( transform_html_table_to_md, merge_html_tables) @@ -42,12 +42,18 @@ def test3(): def test4(): html_text = """ - - - - - - + + + + + + + + + + + +
DatasetBase Model|Large Model|Notes
PubLayNet[38]F/MMLayouts of modern scientific documents
PRImA [3]M:Layouts of scanned modern magaxines and sciertific reports
AALayouts of scanned US newspapers from the 20th century
TableBank[18]Table region on modern scientific and business document
HJDataset [31]F/MLayouts of history Japanese documents
DatasetBase Model|Large Model|Notes
PubLayNet[38]F/MMLayouts of modern scientific documents
PRImA [3]M:Layouts of scanned modern magaxines and sciertific reports
AALayouts of scanned US newspapers from the 20th century
TableBank[18]Table region on modern scientific and business document
HJDataset [31]F/MLayouts of history Japanese documents
""" @@ -58,23 +64,35 @@ def test4(): def test5(): html_text1 = """ - - - - - - + + + + + + + + + + + +
DatasetBase Model|Large Model|Notes
PubLayNet[38]F/MMLayouts of modern scientific documents
PRImA [3]M:Layouts of scanned modern magaxines and sciertific reports
AALayouts of scanned US newspapers from the 20th century
TableBank[18]Table region on modern scientific and business document
HJDataset [31]F/MLayouts of history Japanese documents
DatasetBase Model|Large Model|Notes
PubLayNet[38]F/MMLayouts of modern scientific documents
PRImA [3]M:Layouts of scanned modern magaxines and sciertific reports
AALayouts of scanned US newspapers from the 20th century
TableBank[18]Table region on modern scientific and business document
HJDataset [31]F/MLayouts of history Japanese documents
""" html_text2 = """ - - - - - - + + + + + + + + + + + +
DatasetBase Model|Large Model|Notes
PubLayNet[38]F/MMLayouts of modern scientific documents
PRImA [3]M:Layouts of scanned modern magaxines and sciertific reports
AALayouts of scanned US newspapers from the 20th century
TableBank[18]Table region on modern scientific and business document
HJDataset [31]F/MLayouts of history Japanese documents
DatasetBase Model|Large Model|Notes
PubLayNet[38]F/MMLayouts of modern scientific documents
PRImA [3]M:Layouts of scanned modern magaxines and sciertific reports
AALayouts of scanned US newspapers from the 20th century
TableBank[18]Table region on modern scientific and business document
HJDataset [31]F/MLayouts of history Japanese documents
""" diff --git a/tests/test_pdf.py b/tests/test_pdf.py index 203ae80..a896772 100644 --- a/tests/test_pdf.py +++ b/tests/test_pdf.py @@ -1,5 +1,5 @@ -from unstructured.partition.pdf import partition_pdf -from unstructured.documents.html_utils import visualize_html, save_to_txt +from bisheng_unstructured.partition.pdf import partition_pdf +from bisheng_unstructured.documents.html_utils import visualize_html, save_to_txt def test1(): diff --git a/tests/test_pdf_parser.py b/tests/test_pdf_parser.py index 284f188..2826f63 100644 --- a/tests/test_pdf_parser.py +++ b/tests/test_pdf_parser.py @@ -1,8 +1,9 @@ -from unstructured.documents.pdf_parser.pdf import PDFDocument -from unstructured.documents.html_utils import visualize_html, save_to_txt +from bisheng_unstructured.documents.pdf_parser.pdf import PDFDocument +from bisheng_unstructured.documents.html_utils import visualize_html, save_to_txt TEST_RT_URL = 'http://192.168.106.12:9001/v2.1/models/' + def test_pdf_doc(): url = TEST_RT_URL layout_ep = url + 'elem_layout_v1/infer' @@ -10,7 +11,6 @@ def test_pdf_doc(): rowcol_model_ep = url + 'elem_table_rowcol_detect_v1/infer' table_model_ep = url + 'elem_table_detect_v1/infer' - model_params = { 'layout_ep': layout_ep, 'cell_model_ep': cell_model_ep, @@ -37,7 +37,6 @@ def test_pdf_doc2(): rowcol_model_ep = url + 'elem_table_rowcol_detect_v1/infer' table_model_ep = url + 'elem_table_detect_v1/infer' - model_params = { 'layout_ep': layout_ep, 'cell_model_ep': cell_model_ep, @@ -70,7 +69,6 @@ def test_pdf_doc3(): 'table_model_ep': table_model_ep, } - filename = "examples/docs/sw-flp-1965-v1.pdf" pdf_doc = PDFDocument( file=filename,