From a3fdca563cf1eda1b743c7ff95fb93a7834afa0b Mon Sep 17 00:00:00 2001 From: rvztz Date: Fri, 1 Sep 2023 14:13:23 -0600 Subject: [PATCH 01/23] Adds data source properties to git connectors --- unstructured/ingest/connector/git.py | 47 ++++++++++++++++++++++++- unstructured/ingest/connector/github.py | 37 +++++++++++++++++-- unstructured/ingest/connector/gitlab.py | 32 +++++++++++++---- 3 files changed, 106 insertions(+), 10 deletions(-) diff --git a/unstructured/ingest/connector/git.py b/unstructured/ingest/connector/git.py index 85268a9203..76f391807f 100644 --- a/unstructured/ingest/connector/git.py +++ b/unstructured/ingest/connector/git.py @@ -2,7 +2,7 @@ import os from dataclasses import dataclass, field from pathlib import Path -from typing import Optional +from typing import Any, Dict, Optional from unstructured.ingest.interfaces import ( BaseConnector, @@ -23,10 +23,26 @@ class SimpleGitConfig(BaseConnectorConfig): repo_path: str = field(init=False, repr=False) +@dataclass +class GitFileMeta: + date_created: str + date_modified: str + version: str + + @dataclass class GitIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): config: SimpleGitConfig = field(repr=False) path: str + file_exists: Optional[bool] = None + file_metadata: Optional[GitFileMeta] = None + + def __post_init__(self): + self.file_created_at = None + self.file_updated_at = None + self.file_version = None + self.file_exists = False + self.file_download_url = None @property def filename(self): @@ -36,6 +52,32 @@ def filename(self): def _output_filename(self): return Path(self.standard_config.output_dir) / f"{self.path}.json" + @property + def date_modified(self) -> Optional[str]: + if self.file_metadata is None: + self.get_file_metadata() + return self.file_metadata.date_modified + + @property + def exists(self) -> Optional[bool]: + if self.file_exists is None: + self.get_file_metadata() + return self.file_exists + + @property + def version(self) -> Optional[str]: + if self.file_metadata is None: 
+ self.get_file_metadata() + return self.file_metadata.version + + @property + def record_locator(self) -> Dict[str, Any]: + return { + "url": self.config.url, + "repo_path": self.config.repo_path, + "file_path": self.path, + } + def _create_full_tmp_dir_path(self): """includes directories in in the gitlab repository""" self.filename.parent.mkdir(parents=True, exist_ok=True) @@ -47,6 +89,9 @@ def get_file(self): logger.debug(f"Fetching {self} - PID: {os.getpid()}") self._fetch_and_write() + def _fetch_content(self) -> None: + raise NotImplementedError() + def _fetch_and_write(self) -> None: raise NotImplementedError() diff --git a/unstructured/ingest/connector/github.py b/unstructured/ingest/connector/github.py index 5a1e20cd91..eeca201e1b 100644 --- a/unstructured/ingest/connector/github.py +++ b/unstructured/ingest/connector/github.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from datetime import datetime from typing import TYPE_CHECKING from urllib.parse import urlparse @@ -6,6 +7,7 @@ from unstructured.ingest.connector.git import ( GitConnector, + GitFileMeta, GitIngestDoc, SimpleGitConfig, ) @@ -50,8 +52,17 @@ class GitHubIngestDoc(GitIngestDoc): config: SimpleGitHubConfig registry_name: str = "github" - def _fetch_and_write(self) -> None: - content_file = self.config._get_repo().get_contents(self.path) + def _fetch_content(self, is_content_file=False): + try: + content_file = self.config._get_repo().get_contents(self.path) + self.file_exists = True + except Exception as e: + logger.error(f"Couldn't retrieve file {self.path}") + self.file_exists = False + raise + if is_content_file: + return content_file + contents = b"" if ( not content_file.content # type: ignore @@ -59,14 +70,28 @@ def _fetch_and_write(self) -> None: and content_file.size # type: ignore ): logger.info("File too large for the GitHub API, using direct download link instead.") + # NOTE: Maybe add a raise_for_status to catch connection timeout or HTTP Errors? 
response = requests.get(content_file.download_url) # type: ignore if response.status_code != 200: + self.file_exists = False logger.info("Direct download link has failed... Skipping this file.") else: contents = response.content else: contents = content_file.decoded_content # type: ignore + return contents + def get_file_metadata(self): + content_file = self.config._get_repo().get_contents(self.path) + self.file_metadata = GitFileMeta( + None, + datetime.strptime(content_file.last_modified, "%a, %d %b %Y %H:%M:%S %Z").isoformat(), + content_file.etag, + ) + + def _fetch_and_write(self) -> None: + contents = self._fetch_content() + self.get_file_metadata() with open(self.filename, "wb") as f: f.write(contents) @@ -77,7 +102,13 @@ class GitHubConnector(GitConnector): config: SimpleGitHubConfig def get_ingest_docs(self): - repo = self.config._get_repo() + from github.GithubException import UnknownObjectException + + try: + repo = self.config._get_repo() + except UnknownObjectException: + logger.error(f"Repository {self.config.repo_path} does not exist.") + return [] # Load the Git tree with all files, and then create Ingest docs # for all blobs, i.e. 
all files, ignoring directories sha = self.config.branch or repo.default_branch diff --git a/unstructured/ingest/connector/gitlab.py b/unstructured/ingest/connector/gitlab.py index 82e6591489..3cd337b159 100644 --- a/unstructured/ingest/connector/gitlab.py +++ b/unstructured/ingest/connector/gitlab.py @@ -4,6 +4,7 @@ from unstructured.ingest.connector.git import ( GitConnector, + GitFileMeta, GitIngestDoc, SimpleGitConfig, ) @@ -39,17 +40,36 @@ class GitLabIngestDoc(GitIngestDoc): config: SimpleGitLabConfig registry_name: str = "gitlab" + def _fetch_content(self): + try: + project = self.config._get_project() + content_file = project.files.get( + self.path, + ref=self.config.branch or project.default_branch, + ) + except Exception as e: + self.file_exists = False + raise + self.file_exists = True + return content_file + def _fetch_and_write(self) -> None: - project = self.config._get_project() - content_file = project.files.get( - self.path, - ref=self.config.branch or project.default_branch, - ) + content_file = self._fetch_content() contents = content_file.decode() - + self.file_exists = True + self.get_file_metadata(content_file) with open(self.filename, "wb") as f: f.write(contents) + def get_file_metadata(self, content_file=None): + if content_file is None: + content_file = self._fetch_content() + self.file_metadata = GitFileMeta( + None, + None, + content_file.attributes.get("commit_id", ""), + ) + @requires_dependencies(["gitlab"], extras="gitlab") @dataclass From 128b90a361852203276d66caaaed6ca3f2ab3948 Mon Sep 17 00:00:00 2001 From: rvztz Date: Thu, 7 Sep 2023 04:16:49 -0600 Subject: [PATCH 02/23] Sets file_metadata as a functools.cached_property --- unstructured/ingest/connector/git.py | 34 +++++++++++-------------- unstructured/ingest/connector/github.py | 34 ++++++++++++++++++------- unstructured/ingest/connector/gitlab.py | 33 ++++++++++++++++-------- 3 files changed, 62 insertions(+), 39 deletions(-) diff --git a/unstructured/ingest/connector/git.py 
b/unstructured/ingest/connector/git.py index de3c8eaa5c..f6723206ae 100644 --- a/unstructured/ingest/connector/git.py +++ b/unstructured/ingest/connector/git.py @@ -1,6 +1,7 @@ import fnmatch import os from dataclasses import dataclass, field +from functools import cached_property from pathlib import Path from typing import Any, Dict, Optional @@ -26,24 +27,17 @@ class SimpleGitConfig(BaseConnectorConfig): @dataclass class GitFileMeta: - date_created: str - date_modified: str - version: str + date_created: Optional[str] = None + date_modified: Optional[str] = None + version: Optional[str] = None + source_url: Optional[str] = None + exists: Optional[bool] = None @dataclass class GitIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): config: SimpleGitConfig = field(repr=False) path: str - file_exists: Optional[bool] = None - file_metadata: Optional[GitFileMeta] = None - - def __post_init__(self): - self.file_created_at = None - self.file_updated_at = None - self.file_version = None - self.file_exists = False - self.file_download_url = None @property def filename(self): @@ -55,22 +49,20 @@ def _output_filename(self): @property def date_modified(self) -> Optional[str]: - if self.file_metadata is None: - self.get_file_metadata() return self.file_metadata.date_modified @property def exists(self) -> Optional[bool]: - if self.file_exists is None: - self.get_file_metadata() - return self.file_exists + return self.file_metadata.exists @property def version(self) -> Optional[str]: - if self.file_metadata is None: - self.get_file_metadata() return self.file_metadata.version + @property + def source_url(self) -> Optional[str]: + return self.file_metadata.source_url + @property def record_locator(self) -> Dict[str, Any]: return { @@ -97,6 +89,10 @@ def _fetch_content(self) -> None: def _fetch_and_write(self) -> None: raise NotImplementedError() + @cached_property + def file_metadata(self) -> GitFileMeta: + raise NotImplementedError() + @dataclass class 
GitConnector(ConnectorCleanupMixin, BaseConnector): diff --git a/unstructured/ingest/connector/github.py b/unstructured/ingest/connector/github.py index 81617075f8..8116f74239 100644 --- a/unstructured/ingest/connector/github.py +++ b/unstructured/ingest/connector/github.py @@ -1,5 +1,6 @@ from dataclasses import dataclass from datetime import datetime +from functools import cached_property from typing import TYPE_CHECKING from urllib.parse import urlparse @@ -54,14 +55,19 @@ class GitHubIngestDoc(GitIngestDoc): config: SimpleGitHubConfig registry_name: str = "github" + @requires_dependencies(["github"], extras="github") def _fetch_content(self, is_content_file=False): + from github.GithubException import UnknownObjectException + try: content_file = self.config._get_repo().get_contents(self.path) - self.file_exists = True - except Exception as e: - logger.error(f"Couldn't retrieve file {self.path}") - self.file_exists = False + except UnknownObjectException: + logger.error(f"File doesn't exists {self.config.url}/{self.path}") + return None + except Exception: + logger.error(f"Error processing {self.config.url}/{self.path}") raise + if is_content_file: return content_file @@ -75,25 +81,35 @@ def _fetch_content(self, is_content_file=False): # NOTE: Maybe add a raise_for_status to catch connection timeout or HTTP Errors? response = requests.get(content_file.download_url) # type: ignore if response.status_code != 200: - self.file_exists = False logger.info("Direct download link has failed... 
Skipping this file.") + return None else: contents = response.content else: contents = content_file.decoded_content # type: ignore return contents - def get_file_metadata(self): - content_file = self.config._get_repo().get_contents(self.path) - self.file_metadata = GitFileMeta( + @cached_property + def file_metadata(self) -> GitFileMeta: + content_file = self._fetch_content(True) + if content_file is None: + return GitFileMeta( + exists=False, + ) + return GitFileMeta( None, datetime.strptime(content_file.last_modified, "%a, %d %b %Y %H:%M:%S %Z").isoformat(), content_file.etag, + content_file.download_url, + True, ) def _fetch_and_write(self) -> None: contents = self._fetch_content() - self.get_file_metadata() + if contents is None: + raise ValueError( + f"Failed to retrieve file from repo " f"{self.config.url}/{self.path}. Check logs", + ) with open(self.filename, "wb") as f: f.write(contents) diff --git a/unstructured/ingest/connector/gitlab.py b/unstructured/ingest/connector/gitlab.py index 0170584601..cb028489b9 100644 --- a/unstructured/ingest/connector/gitlab.py +++ b/unstructured/ingest/connector/gitlab.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from functools import cached_property from typing import TYPE_CHECKING from urllib.parse import urlparse @@ -9,6 +10,7 @@ SimpleGitConfig, ) from unstructured.ingest.error import SourceConnectionError +from unstructured.ingest.logger import logger from unstructured.utils import requires_dependencies if TYPE_CHECKING: @@ -42,34 +44,43 @@ class GitLabIngestDoc(GitIngestDoc): config: SimpleGitLabConfig registry_name: str = "gitlab" + @requires_dependencies(["gitlab"], extras="gitlab") def _fetch_content(self): + from gitlab.exceptions import GitlabHttpError + try: project = self.config._get_project() content_file = project.files.get( self.path, ref=self.config.branch or project.default_branch, ) - except Exception as e: - self.file_exists = False + except GitlabHttpError as e: + if e.response_code == 404: + 
logger.error(f"File doesn't exists {self.config.url}/{self.path}") + return None raise - self.file_exists = True return content_file def _fetch_and_write(self) -> None: content_file = self._fetch_content() + if content_file is None: + raise ValueError( + f"Failed to retrieve file from repo " f"{self.config.url}/{self.path}. Check logs.", + ) contents = content_file.decode() - self.file_exists = True - self.get_file_metadata(content_file) with open(self.filename, "wb") as f: f.write(contents) - def get_file_metadata(self, content_file=None): + @cached_property + def file_metadata(self): + content_file = self._fetch_content() if content_file is None: - content_file = self._fetch_content() - self.file_metadata = GitFileMeta( - None, - None, - content_file.attributes.get("commit_id", ""), + return GitFileMeta( + exists=None, + ) + return GitFileMeta( + version=content_file.attributes.get("commit_id", ""), + exists=True, ) From 5104b26a3ec28e5bb916444d1f427cc2da2ab2f4 Mon Sep 17 00:00:00 2001 From: rvztz Date: Fri, 8 Sep 2023 02:49:43 -0600 Subject: [PATCH 03/23] Sets Gitlab version to last_commit_id --- unstructured/ingest/connector/gitlab.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unstructured/ingest/connector/gitlab.py b/unstructured/ingest/connector/gitlab.py index cb028489b9..57065fc192 100644 --- a/unstructured/ingest/connector/gitlab.py +++ b/unstructured/ingest/connector/gitlab.py @@ -63,6 +63,7 @@ def _fetch_content(self): def _fetch_and_write(self) -> None: content_file = self._fetch_content() + logger.debug(content_file.attributes) if content_file is None: raise ValueError( f"Failed to retrieve file from repo " f"{self.config.url}/{self.path}. 
Check logs.", @@ -79,7 +80,7 @@ def file_metadata(self): exists=None, ) return GitFileMeta( - version=content_file.attributes.get("commit_id", ""), + version=content_file.attributes.get("last_commit_id", ""), exists=True, ) From 906b5f17e28a6fda8d4cd359a9ca0d7ad5809191 Mon Sep 17 00:00:00 2001 From: rvztz Date: Fri, 8 Sep 2023 02:51:55 -0600 Subject: [PATCH 04/23] Removes debugging logger --- unstructured/ingest/connector/gitlab.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unstructured/ingest/connector/gitlab.py b/unstructured/ingest/connector/gitlab.py index 57065fc192..c8f0e95f9e 100644 --- a/unstructured/ingest/connector/gitlab.py +++ b/unstructured/ingest/connector/gitlab.py @@ -63,7 +63,6 @@ def _fetch_content(self): def _fetch_and_write(self) -> None: content_file = self._fetch_content() - logger.debug(content_file.attributes) if content_file is None: raise ValueError( f"Failed to retrieve file from repo " f"{self.config.url}/{self.path}. Check logs.", From d7c36f95202725ca3cfa6c1a4a6f553afa5ee5f1 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Fri, 8 Sep 2023 02:00:49 -0700 Subject: [PATCH 05/23] Adds data source properties to git connectors <- Ingest test fixtures update (#1344) This pull request includes updated ingest test fixtures. Please review and merge if appropriate. 
Co-authored-by: rvztz --- .../github/LICENSE.txt.json | 55 +++++++++++-- .../github/test.html.json | 77 +++++++++++++++++-- 2 files changed, 120 insertions(+), 12 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json b/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json index b0b077a342..6759a07eea 100644 --- a/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json +++ b/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json @@ -3,7 +3,16 @@ "type": "Title", "element_id": "e3e5334b595ef9b648bf7f1f6c1a60c4", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", + "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/plain" }, "text": "Downloadify: Client Side File Creation JavaScript + Flash Library" @@ -12,7 +21,16 @@ "type": "Title", "element_id": "8dc8800e5660b2558bb7f5f5416ca498", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", + "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/plain" }, "text": "Copyright (c) 2009 Douglas C. 
Neiner" @@ -21,7 +39,16 @@ "type": "NarrativeText", "element_id": "fa3ff462f020dcadaf3c44b61f0df757", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", + "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/plain" }, "text": "Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:" @@ -30,7 +57,16 @@ "type": "NarrativeText", "element_id": "70760316a66259dc346c891a2b964556", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", + "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/plain" }, "text": "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." 
@@ -39,7 +75,16 @@ "type": "NarrativeText", "element_id": "1da9072633b5e4291608b205a664d5af", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", + "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/plain" }, "text": "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE." diff --git a/test_unstructured_ingest/expected-structured-output/github/test.html.json b/test_unstructured_ingest/expected-structured-output/github/test.html.json index 1aecea366e..305c99dc32 100644 --- a/test_unstructured_ingest/expected-structured-output/github/test.html.json +++ b/test_unstructured_ingest/expected-structured-output/github/test.html.json @@ -3,7 +3,16 @@ "type": "Title", "element_id": "56a9f768a0968be676f9addd5ec3032e", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -13,7 +22,16 @@ "type": "Title", "element_id": "d551bbfc9477547e4dce6264d8196c7b", "metadata": { - "data_source": {}, + "data_source": { + "url": 
"https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1, "link_urls": [ @@ -29,7 +47,16 @@ "type": "Title", "element_id": "971b974235a86ca628dcc713d6e2e8d9", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -39,7 +66,16 @@ "type": "NarrativeText", "element_id": "43f65b1c5bd47774b25c72e2f96de300", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -49,7 +85,16 @@ "type": "NarrativeText", "element_id": "53a4db70c6d40ed5206711ed8a255e03", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -59,7 +104,16 @@ "type": "Title", "element_id": "839973fba0c850f1729fad098b031203", "metadata": { - 
"data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -69,7 +123,16 @@ "type": "NarrativeText", "element_id": "b7db0dffb05f01f3f13d34420b82c261", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, From 6dafd0565759da90e771c01d0e7afd5e77d33b44 Mon Sep 17 00:00:00 2001 From: rvztz Date: Fri, 8 Sep 2023 20:04:45 -0600 Subject: [PATCH 06/23] Changelog bump --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83d8f7c89a..9bb48622d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ * Updated documentation: Added back support doc types for partitioning, more Python codes in the API page, RAG definition, and use case. * Updated Hi-Res Metadata: PDFs and Images using Hi-Res strategy now have layout model class probabilities added ot metadata. * Updated the `_detect_filetype_from_octet_stream()` function to use libmagic to infer the content type of file when it is not a zip file. +* Adds data source properties (date_created, date_modified, version, exists, source_url, record_locator) to the git base interface. 
+ ### Features * Add Jira Connector to be able to pull issues from a Jira organization From b99b644ac04310730b415814222c6160c621f84d Mon Sep 17 00:00:00 2001 From: rvztz Date: Sat, 9 Sep 2023 02:52:08 -0600 Subject: [PATCH 07/23] Removes updates fixtures to re-process --- .../github/LICENSE.txt.json | 55 ++----------- .../github/test.html.json | 77 ++----------------- 2 files changed, 12 insertions(+), 120 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json b/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json index 6759a07eea..b0b077a342 100644 --- a/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json +++ b/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json @@ -3,16 +3,7 @@ "type": "Title", "element_id": "e3e5334b595ef9b648bf7f1f6c1a60c4", "metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", - "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/plain" }, "text": "Downloadify: Client Side File Creation JavaScript + Flash Library" @@ -21,16 +12,7 @@ "type": "Title", "element_id": "8dc8800e5660b2558bb7f5f5416ca498", "metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", - "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/plain" }, "text": "Copyright (c) 2009 Douglas C. 
Neiner" @@ -39,16 +21,7 @@ "type": "NarrativeText", "element_id": "fa3ff462f020dcadaf3c44b61f0df757", "metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", - "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/plain" }, "text": "Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:" @@ -57,16 +30,7 @@ "type": "NarrativeText", "element_id": "70760316a66259dc346c891a2b964556", "metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", - "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/plain" }, "text": "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." 
@@ -75,16 +39,7 @@ "type": "NarrativeText", "element_id": "1da9072633b5e4291608b205a664d5af", "metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", - "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/plain" }, "text": "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE." diff --git a/test_unstructured_ingest/expected-structured-output/github/test.html.json b/test_unstructured_ingest/expected-structured-output/github/test.html.json index bebcd07251..3af78803aa 100644 --- a/test_unstructured_ingest/expected-structured-output/github/test.html.json +++ b/test_unstructured_ingest/expected-structured-output/github/test.html.json @@ -3,16 +3,7 @@ "type": "Title", "element_id": "56a9f768a0968be676f9addd5ec3032e", "metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", - "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/html", "page_number": 1 }, @@ -22,16 +13,7 @@ "type": "Title", "element_id": "d551bbfc9477547e4dce6264d8196c7b", "metadata": { - "data_source": { - "url": 
"https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", - "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/html", "page_number": 1, "link_urls": [ @@ -47,16 +29,7 @@ "type": "Title", "element_id": "971b974235a86ca628dcc713d6e2e8d9", "metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", - "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/html", "page_number": 1 }, @@ -66,16 +39,7 @@ "type": "Title", "element_id": "4112a488690bdbc1d39d5b78068eae9f", "metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", - "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/html", "page_number": 1 }, @@ -95,16 +59,7 @@ "type": "NarrativeText", "element_id": "53a4db70c6d40ed5206711ed8a255e03", "metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", - "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/html", "page_number": 1 }, @@ -114,16 +69,7 @@ "type": "Title", "element_id": "839973fba0c850f1729fad098b031203", 
"metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", - "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/html", "page_number": 1 }, @@ -133,16 +79,7 @@ "type": "NarrativeText", "element_id": "b7db0dffb05f01f3f13d34420b82c261", "metadata": { - "data_source": { - "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", - "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", - "record_locator": { - "url": "dcneiner/Downloadify", - "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" - }, - "date_modified": "2010-01-23T23:18:40" - }, + "data_source": {}, "filetype": "text/html", "page_number": 1 }, From e407706d1076e21961b9ceba91fba49a109a2383 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Sat, 9 Sep 2023 02:40:58 -0700 Subject: [PATCH 08/23] Adds data source properties to git connectors <- Ingest test fixtures update (#1358) This pull request includes updated ingest test fixtures. Please review and merge if appropriate. 
Co-authored-by: rvztz --- .../github/LICENSE.txt.json | 55 ++++++++++-- .../github/test.html.json | 88 +++++++++++++++++-- 2 files changed, 130 insertions(+), 13 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json b/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json index b0b077a342..6759a07eea 100644 --- a/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json +++ b/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json @@ -3,7 +3,16 @@ "type": "Title", "element_id": "e3e5334b595ef9b648bf7f1f6c1a60c4", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", + "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/plain" }, "text": "Downloadify: Client Side File Creation JavaScript + Flash Library" @@ -12,7 +21,16 @@ "type": "Title", "element_id": "8dc8800e5660b2558bb7f5f5416ca498", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", + "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/plain" }, "text": "Copyright (c) 2009 Douglas C. 
Neiner" @@ -21,7 +39,16 @@ "type": "NarrativeText", "element_id": "fa3ff462f020dcadaf3c44b61f0df757", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", + "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/plain" }, "text": "Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:" @@ -30,7 +57,16 @@ "type": "NarrativeText", "element_id": "70760316a66259dc346c891a2b964556", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", + "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/plain" }, "text": "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." 
@@ -39,7 +75,16 @@ "type": "NarrativeText", "element_id": "1da9072633b5e4291608b205a664d5af", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", + "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "LICENSE.txt" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/plain" }, "text": "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE." diff --git a/test_unstructured_ingest/expected-structured-output/github/test.html.json b/test_unstructured_ingest/expected-structured-output/github/test.html.json index 3af78803aa..4aecfd4cfb 100644 --- a/test_unstructured_ingest/expected-structured-output/github/test.html.json +++ b/test_unstructured_ingest/expected-structured-output/github/test.html.json @@ -3,7 +3,16 @@ "type": "Title", "element_id": "56a9f768a0968be676f9addd5ec3032e", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -13,7 +22,16 @@ "type": "Title", "element_id": "d551bbfc9477547e4dce6264d8196c7b", "metadata": { - "data_source": {}, + "data_source": { + "url": 
"https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1, "link_urls": [ @@ -29,7 +47,16 @@ "type": "Title", "element_id": "971b974235a86ca628dcc713d6e2e8d9", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -39,7 +66,16 @@ "type": "Title", "element_id": "4112a488690bdbc1d39d5b78068eae9f", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -49,7 +85,16 @@ "type": "NarrativeText", "element_id": "f89c9cf63bd2e72f560ee043d942a1e7", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -59,7 +104,16 @@ "type": "NarrativeText", "element_id": "53a4db70c6d40ed5206711ed8a255e03", "metadata": { - 
"data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -69,7 +123,16 @@ "type": "Title", "element_id": "839973fba0c850f1729fad098b031203", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, @@ -79,7 +142,16 @@ "type": "NarrativeText", "element_id": "b7db0dffb05f01f3f13d34420b82c261", "metadata": { - "data_source": {}, + "data_source": { + "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", + "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", + "record_locator": { + "url": "dcneiner/Downloadify", + "repo_path": "dcneiner/Downloadify", + "file_path": "test.html" + }, + "date_modified": "2010-01-23T23:18:40" + }, "filetype": "text/html", "page_number": 1 }, From c32f51a573563801e9955a6af3019bf56f9285b7 Mon Sep 17 00:00:00 2001 From: rvztz Date: Mon, 11 Sep 2023 11:55:34 -0600 Subject: [PATCH 09/23] Solves merge issues --- unstructured/__version__.py | 2 +- unstructured/ingest/connector/git.py | 4 ++-- unstructured/ingest/connector/github.py | 7 ++++--- unstructured/ingest/connector/gitlab.py | 5 +++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ce8de91302..e514376f02 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ 
-1 +1 @@ -__version__ = "0.10.14" # pragma: no cover +__version__ = "0.10.15-dev0" # pragma: no cover diff --git a/unstructured/ingest/connector/git.py b/unstructured/ingest/connector/git.py index 222bf8d3fb..e619812434 100644 --- a/unstructured/ingest/connector/git.py +++ b/unstructured/ingest/connector/git.py @@ -66,8 +66,8 @@ def source_url(self) -> t.Optional[str]: @property def record_locator(self) -> t.Dict[str, t.Any]: return { - "url": self.config.url, - "repo_path": self.config.repo_path, + "url": self.connector_config.url, + "repo_path": self.connector_config.repo_path, "file_path": self.path, } diff --git a/unstructured/ingest/connector/github.py b/unstructured/ingest/connector/github.py index e86d27b7b0..e114c8d9c1 100644 --- a/unstructured/ingest/connector/github.py +++ b/unstructured/ingest/connector/github.py @@ -60,7 +60,7 @@ def _fetch_content(self, is_content_file=False): from github.GithubException import UnknownObjectException try: - content_file = self.connector_config._get_repo().get_contents(self.path) + content_file = self.connector_config.get_repo().get_contents(self.path) except UnknownObjectException: logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}") return None @@ -108,7 +108,8 @@ def _fetch_and_write(self) -> None: contents = self._fetch_content() if contents is None: raise ValueError( - f"Failed to retrieve file from repo " f"{self.connector_config.url}/{self.path}. Check logs", + f"Failed to retrieve file from repo " + f"{self.connector_config.url}/{self.path}. 
Check logs", ) with open(self.filename, "wb") as f: f.write(contents) @@ -123,7 +124,7 @@ def get_ingest_docs(self): from github.GithubException import UnknownObjectException try: - repo = self.connector_config._get_repo() + repo = self.connector_config.get_repo() except UnknownObjectException: logger.error(f"Repository {self.connector_config.repo_path} does not exist.") return [] diff --git a/unstructured/ingest/connector/gitlab.py b/unstructured/ingest/connector/gitlab.py index 3432d90a40..76409080af 100644 --- a/unstructured/ingest/connector/gitlab.py +++ b/unstructured/ingest/connector/gitlab.py @@ -49,7 +49,7 @@ def _fetch_content(self): from gitlab.exceptions import GitlabHttpError try: - project = self.connector_config._get_project() + project = self.connector_config.get_project() content_file = project.files.get( self.path, ref=self.connector_config.branch or project.default_branch, @@ -65,7 +65,8 @@ def _fetch_and_write(self) -> None: content_file = self._fetch_content() if content_file is None: raise ValueError( - f"Failed to retrieve file from repo " f"{self.connector_config.url}/{self.path}. Check logs.", + f"Failed to retrieve file from repo " + f"{self.connector_config.url}/{self.path}. 
Check logs.", ) contents = content_file.decode() with open(self.filename, "wb") as f: From 0412dabcc46d631150ebc41e57f30b15846614d1 Mon Sep 17 00:00:00 2001 From: rvztz Date: Wed, 13 Sep 2023 23:56:41 -0600 Subject: [PATCH 10/23] Adds `update_source_metadata` method to git-based connectors --- unstructured/ingest/connector/git.py | 33 ++----------------- unstructured/ingest/connector/github.py | 30 ++++++++++------- unstructured/ingest/connector/gitlab.py | 44 ++++++++++++++++--------- unstructured/ingest/interfaces.py | 36 ++++++++++++++++---- 4 files changed, 80 insertions(+), 63 deletions(-) diff --git a/unstructured/ingest/connector/git.py b/unstructured/ingest/connector/git.py index e619812434..148623fed0 100644 --- a/unstructured/ingest/connector/git.py +++ b/unstructured/ingest/connector/git.py @@ -2,7 +2,6 @@ import os import typing as t from dataclasses import dataclass, field -from functools import cached_property from pathlib import Path from unstructured.ingest.error import SourceConnectionError @@ -25,15 +24,6 @@ class SimpleGitConfig(BaseConnectorConfig): repo_path: str = field(init=False, repr=False) -@dataclass -class GitFileMeta: - date_created: t.Optional[str] = None - date_modified: t.Optional[str] = None - version: t.Optional[str] = None - source_url: t.Optional[str] = None - exists: t.Optional[bool] = None - - @dataclass class GitIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): connector_config: SimpleGitConfig = field(repr=False) @@ -47,22 +37,6 @@ def filename(self): def _output_filename(self): return Path(self.partition_config.output_dir) / f"{self.path}.json" - @property - def date_modified(self) -> t.Optional[str]: - return self.file_metadata.date_modified - - @property - def exists(self) -> t.Optional[bool]: - return self.file_metadata.exists - - @property - def version(self) -> t.Optional[str]: - return self.file_metadata.version - - @property - def source_url(self) -> t.Optional[str]: - return self.file_metadata.source_url - @property 
def record_locator(self) -> t.Dict[str, t.Any]: return { @@ -75,6 +49,9 @@ def _create_full_tmp_dir_path(self): """includes directories in in the gitlab repository""" self.filename.parent.mkdir(parents=True, exist_ok=True) + def update_source_metadata(self, **kwargs): + raise NotImplementedError() + @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): @@ -89,10 +66,6 @@ def _fetch_content(self) -> None: def _fetch_and_write(self) -> None: raise NotImplementedError() - @cached_property - def file_metadata(self) -> GitFileMeta: - raise NotImplementedError() - @dataclass class GitSourceConnector(SourceConnectorCleanupMixin, BaseSourceConnector): diff --git a/unstructured/ingest/connector/github.py b/unstructured/ingest/connector/github.py index e114c8d9c1..3d4ea55d51 100644 --- a/unstructured/ingest/connector/github.py +++ b/unstructured/ingest/connector/github.py @@ -1,18 +1,17 @@ import typing as t from dataclasses import dataclass from datetime import datetime -from functools import cached_property from urllib.parse import urlparse import requests from unstructured.ingest.connector.git import ( - GitFileMeta, GitIngestDoc, GitSourceConnector, SimpleGitConfig, ) from unstructured.ingest.error import SourceConnectionError +from unstructured.ingest.interfaces import SourceMetadata from unstructured.ingest.logger import logger from unstructured.utils import requires_dependencies @@ -55,6 +54,10 @@ class GitHubIngestDoc(GitIngestDoc): connector_config: SimpleGitHubConfig registry_name: str = "github" + @property + def date_created(self) -> t.Optional[str]: + return None + @requires_dependencies(["github"], extras="github") def _fetch_content(self, is_content_file=False): from github.GithubException import UnknownObjectException @@ -89,23 +92,28 @@ def _fetch_content(self, is_content_file=False): contents = content_file.decoded_content # type: ignore return contents - @cached_property - def file_metadata(self) -> GitFileMeta: + def 
update_source_metadata(self, **kwargs): content_file = self._fetch_content(True) if content_file is None: - return GitFileMeta( + self.source_metadata = SourceMetadata( exists=False, ) - return GitFileMeta( - None, - datetime.strptime(content_file.last_modified, "%a, %d %b %Y %H:%M:%S %Z").isoformat(), - content_file.etag, - content_file.download_url, - True, + return + + date_modified = datetime.strptime( + content_file.last_modified, + "%a, %d %b %Y %H:%M:%S %Z", + ).isoformat() + self.source_metadata = SourceMetadata( + date_modified=date_modified, + version=content_file.etag, + source_url=content_file.download_url, + exists=True, ) def _fetch_and_write(self) -> None: contents = self._fetch_content() + self.update_source_metadata() if contents is None: raise ValueError( f"Failed to retrieve file from repo " diff --git a/unstructured/ingest/connector/gitlab.py b/unstructured/ingest/connector/gitlab.py index 76409080af..cea6b4397e 100644 --- a/unstructured/ingest/connector/gitlab.py +++ b/unstructured/ingest/connector/gitlab.py @@ -1,19 +1,18 @@ +import typing as t from dataclasses import dataclass -from functools import cached_property -from typing import TYPE_CHECKING from urllib.parse import urlparse from unstructured.ingest.connector.git import ( - GitFileMeta, GitIngestDoc, GitSourceConnector, SimpleGitConfig, ) from unstructured.ingest.error import SourceConnectionError +from unstructured.ingest.interfaces import SourceMetadata from unstructured.ingest.logger import logger from unstructured.utils import requires_dependencies -if TYPE_CHECKING: +if t.TYPE_CHECKING: from gitlab.v4.objects.projects import Project @@ -44,6 +43,18 @@ class GitLabIngestDoc(GitIngestDoc): connector_config: SimpleGitLabConfig registry_name: str = "gitlab" + @property + def date_created(self) -> t.Optional[str]: + return None + + @property + def date_modified(self) -> t.Optional[str]: + return None + + @property + def source_url(self) -> t.Optional[str]: + return None + 
@requires_dependencies(["gitlab"], extras="gitlab") def _fetch_content(self): from gitlab.exceptions import GitlabHttpError @@ -61,8 +72,21 @@ def _fetch_content(self): raise return content_file + def update_source_metadata(self, **kwargs): + content_file = kwargs.get("content_file", self._fetch_content()) + if content_file is None: + self.source_metadata = SourceMetadata( + exists=None, + ) + return + self.source_metadata = SourceMetadata( + version=content_file.attributes.get("last_commit_id", ""), + exists=True, + ) + def _fetch_and_write(self) -> None: content_file = self._fetch_content() + self.update_source_metadata(content_file=content_file) if content_file is None: raise ValueError( f"Failed to retrieve file from repo " @@ -72,18 +96,6 @@ def _fetch_and_write(self) -> None: with open(self.filename, "wb") as f: f.write(contents) - @cached_property - def file_metadata(self): - content_file = self._fetch_content() - if content_file is None: - return GitFileMeta( - exists=None, - ) - return GitFileMeta( - version=content_file.attributes.get("last_commit_id", ""), - exists=True, - ) - @requires_dependencies(["gitlab"], extras="gitlab") @dataclass diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index 9efab2e52f..6c0dcaf722 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -70,6 +70,15 @@ class BaseConnectorConfig(ABC): """Abstract definition on which to define connector-specific attributes.""" +@dataclass +class SourceMetadata(DataClassJsonMixin, ABC): + date_created: t.Optional[str] = None + date_modified: t.Optional[str] = None + version: t.Optional[str] = None + source_url: t.Optional[str] = None + exists: t.Optional[bool] = None + + @dataclass class BaseIngestDoc(DataClassJsonMixin, ABC): """An "ingest document" is specific to a connector, and provides @@ -83,6 +92,7 @@ class BaseIngestDoc(DataClassJsonMixin, ABC): read_config: ReadConfig partition_config: PartitionConfig 
connector_config: BaseConnectorConfig + source_metadata: t.Optional[SourceMetadata] = field(init=False, default=None) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -91,23 +101,29 @@ def __init__(self, *args, **kwargs): @property def date_created(self) -> t.Optional[str]: """The date the document was created on the source system.""" - return None + if self.source_metadata is None: + self.update_source_metadata() + return self.source_metadata.date_created # type: ignore @property def date_modified(self) -> t.Optional[str]: """The date the document was last modified on the source system.""" - return None + if self.source_metadata is None: + self.update_source_metadata() + return self.source_metadata.date_modified # type: ignore @property def date_processed(self) -> t.Optional[str]: """The date the document was last processed by Unstructured. self._date_processed is assigned internally in self.partition_file()""" - return self._date_processed + return self._date_processed # type: ignore @property def exists(self) -> t.Optional[bool]: """Whether the document exists on the remote source.""" - return None + if self.source_metadata is None: + self.update_source_metadata() + return self.source_metadata.exists # type: ignore @property @abstractmethod @@ -128,14 +144,18 @@ def record_locator(self) -> t.Optional[t.Dict[str, t.Any]]: # Values must be JS @property def source_url(self) -> t.Optional[str]: """The url of the source document.""" - return None + if self.source_metadata is None: + self.update_source_metadata() + return self.source_metadata.source_url # type: ignore @property def version(self) -> t.Optional[str]: """The version of the source document, this could be the last modified date, an explicit version number, or anything else that can be used to uniquely identify the version of the document.""" - return None + if self.source_metadata is None: + self.update_source_metadata() + return self.source_metadata.version # type: ignore 
@abstractmethod def cleanup_file(self): @@ -160,6 +180,10 @@ def wrapper(self, *args, **kwargs): return wrapper + def update_source_metadata(self, **kwargs) -> None: + """Sets the SourceMetadata and the properties for the doc""" + pass + # NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods # in addition to or instead of get_file() @abstractmethod From 1f2f0f1c005f52d972fde10a3589004965e8404b Mon Sep 17 00:00:00 2001 From: rvztz Date: Thu, 14 Sep 2023 02:29:00 -0600 Subject: [PATCH 11/23] sets default `source_metadata` --- unstructured/ingest/interfaces.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index 6c0dcaf722..3a887874e5 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -180,9 +180,10 @@ def wrapper(self, *args, **kwargs): return wrapper + #TODO: set as @abstractmethod and pass or raise NotImplementedError def update_source_metadata(self, **kwargs) -> None: """Sets the SourceMetadata and the properties for the doc""" - pass + self.source_metadata = SourceMetadata() # NOTE(crag): Future BaseIngestDoc classes could define get_file_object() methods # in addition to or instead of get_file() From 54e2381a18265a971841c83985c26210d9793109 Mon Sep 17 00:00:00 2001 From: rvztz Date: Thu, 14 Sep 2023 02:37:01 -0600 Subject: [PATCH 12/23] linting --- unstructured/ingest/interfaces.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index 3a887874e5..cc9a020997 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -180,7 +180,7 @@ def wrapper(self, *args, **kwargs): return wrapper - #TODO: set as @abstractmethod and pass or raise NotImplementedError + # TODO: set as @abstractmethod and pass or raise NotImplementedError def update_source_metadata(self, **kwargs) -> None: """Sets the 
SourceMetadata and the properties for the doc""" self.source_metadata = SourceMetadata() From 56675af58239ebf46d4ef354746d9c8c1e1e34ea Mon Sep 17 00:00:00 2001 From: rvztz Date: Fri, 15 Sep 2023 08:32:58 -0600 Subject: [PATCH 13/23] Removes redundant exceptions --- unstructured/ingest/connector/github.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/unstructured/ingest/connector/github.py b/unstructured/ingest/connector/github.py index 3d4ea55d51..e8aefb9d92 100644 --- a/unstructured/ingest/connector/github.py +++ b/unstructured/ingest/connector/github.py @@ -67,9 +67,6 @@ def _fetch_content(self, is_content_file=False): except UnknownObjectException: logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}") return None - except Exception: - logger.error(f"Error processing {self.connector_config.url}/{self.path}") - raise if is_content_file: return content_file @@ -129,13 +126,7 @@ class GitHubSourceConnector(GitSourceConnector): connector_config: SimpleGitHubConfig def get_ingest_docs(self): - from github.GithubException import UnknownObjectException - - try: - repo = self.connector_config.get_repo() - except UnknownObjectException: - logger.error(f"Repository {self.connector_config.repo_path} does not exist.") - return [] + repo = self.connector_config.get_repo() # Load the Git tree with all files, and then create Ingest docs # for all blobs, i.e. 
all files, ignoring directories sha = self.connector_config.branch or repo.default_branch From 53b20805dbe3552398cf6f5b7fd57da7964337ad Mon Sep 17 00:00:00 2001 From: rvztz Date: Fri, 15 Sep 2023 08:39:55 -0600 Subject: [PATCH 14/23] decouples logic between fetching repo file and actual content --- unstructured/ingest/connector/github.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/unstructured/ingest/connector/github.py b/unstructured/ingest/connector/github.py index e8aefb9d92..b09817c7fd 100644 --- a/unstructured/ingest/connector/github.py +++ b/unstructured/ingest/connector/github.py @@ -59,7 +59,7 @@ def date_created(self) -> t.Optional[str]: return None @requires_dependencies(["github"], extras="github") - def _fetch_content(self, is_content_file=False): + def _fetch_file(self): from github.GithubException import UnknownObjectException try: @@ -68,9 +68,9 @@ def _fetch_content(self, is_content_file=False): logger.error(f"File doesn't exists {self.connector_config.url}/{self.path}") return None - if is_content_file: - return content_file + return content_file + def _fetch_content(self, content_file): contents = b"" if ( not content_file.content # type: ignore @@ -90,7 +90,7 @@ def _fetch_content(self, is_content_file=False): return contents def update_source_metadata(self, **kwargs): - content_file = self._fetch_content(True) + content_file = kwargs.get("content_file", self._fetch_file()) if content_file is None: self.source_metadata = SourceMetadata( exists=False, @@ -109,8 +109,9 @@ def update_source_metadata(self, **kwargs): ) def _fetch_and_write(self) -> None: - contents = self._fetch_content() - self.update_source_metadata() + content_file = self._fetch_file() + self.update_source_metadata(content_file=content_file) + contents = self._fetch_content(content_file) if contents is None: raise ValueError( f"Failed to retrieve file from repo " From 9ae8515fe1a2f1ddd58547a59640e521ebc03d91 Mon Sep 17 00:00:00 2001 From: 
ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Fri, 15 Sep 2023 16:54:04 -0700 Subject: [PATCH 15/23] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 205c6e063a..79e8ab2471 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ * **Better debug output related to sentence counting internals**. Clarify message when sentence is not counted toward sentence count because there aren't enough words, relevant for developers focused on `unstructured`s NLP internals. * **Faster ocr_only speed for partitioning PDF and images.** Use `unstructured_pytesseract.run_and_get_multiple_output` function to reduce the number of calls to `tesseract` by half when partitioning pdf or image with `tesseract` * **Adds data source properties to fsspec connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. -* **Adds data source properties (date_created, date_modified, version, exists, source_url, record_locator) to the git base interface.** Implements `update_source_metadata` method in git-based connectors. +**Adds data source properties to git connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. 
### Features From 6c4945d0060d623a1fc3a17372fc8aace384b46c Mon Sep 17 00:00:00 2001 From: rvztz Date: Wed, 20 Sep 2023 10:49:29 -0600 Subject: [PATCH 16/23] Version bump --- CHANGELOG.md | 2 +- unstructured/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fdf7ef56b2..3921e84ed3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.10.17-dev2 +## 0.10.17-dev3 ### Enhancements diff --git a/unstructured/__version__.py b/unstructured/__version__.py index bbc523ae99..ae5d8af264 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev2" # pragma: no cover +__version__ = "0.10.17-dev3" # pragma: no cover From 06d0396464593ead9967138e36897f2bce604f66 Mon Sep 17 00:00:00 2001 From: rvztz Date: Wed, 20 Sep 2023 23:35:23 -0600 Subject: [PATCH 17/23] Removes url property from `record_locator`. Adds branch property to `record_locator` --- unstructured/ingest/connector/git.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unstructured/ingest/connector/git.py b/unstructured/ingest/connector/git.py index 148623fed0..a6d811d0f0 100644 --- a/unstructured/ingest/connector/git.py +++ b/unstructured/ingest/connector/git.py @@ -40,9 +40,9 @@ def _output_filename(self): @property def record_locator(self) -> t.Dict[str, t.Any]: return { - "url": self.connector_config.url, "repo_path": self.connector_config.repo_path, "file_path": self.path, + "branch": self.connector_config.branch, } def _create_full_tmp_dir_path(self): @@ -55,6 +55,7 @@ def update_source_metadata(self, **kwargs): @SourceConnectionError.wrap @BaseIngestDoc.skip_if_file_exists def get_file(self): + print(self) """Fetches the "remote" doc and stores it locally on the filesystem.""" self._create_full_tmp_dir_path() logger.debug(f"Fetching {self} - PID: {os.getpid()}") From 404638586dc4ec2ebf7241671788dca5c3416aba Mon Sep 17 00:00:00 2001 From: ryannikolaidis 
<1208590+ryannikolaidis@users.noreply.github.com> Date: Thu, 21 Sep 2023 13:08:14 -0700 Subject: [PATCH 18/23] Adds data source properties to git connectors <- Ingest test fixtures update (#1491) This pull request includes updated ingest test fixtures. Please review and merge if appropriate. Co-authored-by: rvztz --- .../github/LICENSE.txt.json | 20 ++++++------ .../github/test.html.json | 32 +++++++++---------- .../d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json | 16 ---------- 3 files changed, 26 insertions(+), 42 deletions(-) delete mode 100644 test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json diff --git a/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json b/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json index 6759a07eea..d5355cc1ff 100644 --- a/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json +++ b/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json @@ -7,9 +7,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt" + "file_path": "LICENSE.txt", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -25,9 +25,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt" + "file_path": "LICENSE.txt", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -43,9 +43,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": 
"dcneiner/Downloadify", - "file_path": "LICENSE.txt" + "file_path": "LICENSE.txt", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -61,9 +61,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt" + "file_path": "LICENSE.txt", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -79,9 +79,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/LICENSE.txt", "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt" + "file_path": "LICENSE.txt", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, diff --git a/test_unstructured_ingest/expected-structured-output/github/test.html.json b/test_unstructured_ingest/expected-structured-output/github/test.html.json index 4aecfd4cfb..a0d0eaabfc 100644 --- a/test_unstructured_ingest/expected-structured-output/github/test.html.json +++ b/test_unstructured_ingest/expected-structured-output/github/test.html.json @@ -7,9 +7,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" + "file_path": "test.html", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -26,9 +26,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" + "file_path": "test.html", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -51,9 
+51,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" + "file_path": "test.html", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -70,9 +70,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" + "file_path": "test.html", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -89,9 +89,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" + "file_path": "test.html", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -108,9 +108,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" + "file_path": "test.html", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -127,9 +127,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" + "file_path": "test.html", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, @@ -146,9 +146,9 @@ "url": "https://raw.githubusercontent.com/dcneiner/Downloadify/master/test.html", "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", 
"record_locator": { - "url": "dcneiner/Downloadify", "repo_path": "dcneiner/Downloadify", - "file_path": "test.html" + "file_path": "test.html", + "branch": null }, "date_modified": "2010-01-23T23:18:40" }, diff --git a/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json b/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json deleted file mode 100644 index cede094b13..0000000000 --- a/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json +++ /dev/null @@ -1,16 +0,0 @@ -[ - { - "type": "Table", - "element_id": "2026c17673ac794e40e78d1c8e28df5c", - "metadata": { - "data_source": { - "date_created": "2023-08-02T20:36:00.000Z", - "date_modified": "2023-08-17T18:49:00.000Z" - }, - "filetype": "text/html", - "page_number": 1, - "text_as_html": "






Completed tasks DatesMy Checkbox My Created ByMy Created TimeMy DateMy EmailMy Person My TextMy formula My multiselectMy number My phone numMy select Sprint IDSprint nameSprint statusTasksTotal tasks URL
0 2023-08-14 - 2023-08-27 Roman Isecke 2023-08-02T20:36:00.000Z 2023-08-31 email@custom.domaine DevOps-Bot False Option 1 12 SPRI1-2 Next notion://sprints/sprint_task_relation 1
0 2023-08-28 - 2023-09-10 Roman Isecke 2023-08-02T20:36:00.000Z 2023-08-29T00:00:00.000-04:00 - 2023-08-31T00:00:00.000-04:00 text More text with link False 45666645345465454 option 1 SPRI1-3 Sprint 3 Future notion://sprints/sprint_task_relation 1
0.25 2023-07-31 - 2023-08-13 Roman Isecke 2023-08-02T20:36:00.000Z 2023-08-07 roman@unstructured.io Roman Isecke Jason Scheirer This is someformattedtext TrueOption 2 Option 1 32 1234 option 2 SPRI1-1 Sprint 1 Currentnotion://sprints/sprint_task_relation
4
www.google.com
" - }, - "text": "Completed tasks\n \n \n Dates\n \n \n My Checkbox\n \n \n My Created By\n \n \n My Created Time\n \n \n My Date\n \n \n My Email\n \n \n My Person\n \n \n My Text\n \n \n My formula\n \n \n My multiselect\n \n \n My number\n \n \n My phone num\n \n \n My select\n \n \n Sprint ID\n \n \n Sprint name\n \n \n Sprint status\n \n \n Tasks\n \n \n Total tasks\n \n \n URL\n \n \n \n \n 0\n \n \n \n \n 2023-08-14 - 2023-08-27\n \n \n \n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n 2023-08-02T20:36:00.000Z\n \n \n \n \n 2023-08-31\n \n \n \n \n email@custom.domaine\n \n \n \n \n \n \n DevOps-Bot\n \n \n \n \n \n \n \n \n \n False\n \n \n \n \n \n Option 1\n \n \n \n \n \n 12\n \n \n \n \n \n \n \n \n \n \n SPRI1-2\n \n \n \n \n \n \n \n Next\n \n \n \n \n notion://sprints/sprint_task_relation\n \n \n \n \n 1\n \n \n \n \n \n \n \n \n 0\n \n \n \n \n 2023-08-28 - 2023-09-10\n \n \n \n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n 2023-08-02T20:36:00.000Z\n \n \n \n \n 2023-08-29T00:00:00.000-04:00 - 2023-08-31T00:00:00.000-04:00\n \n \n \n \n text\n \n \n \n \n \n \n \n \n More \n \n \n \n text\n \n \n \n with \n \n \n \n link\n \n \n \n \n \n \n False\n \n \n \n \n \n \n \n \n \n \n 45666645345465454\n \n \n \n \n option 1\n \n \n \n \n SPRI1-3\n \n \n \n \n Sprint 3\n \n \n \n \n Future\n \n \n \n \n notion://sprints/sprint_task_relation\n \n \n \n \n 1\n \n \n \n \n \n \n \n \n 0.25\n \n \n \n \n 2023-07-31 - 2023-08-13\n \n \n \n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n 2023-08-02T20:36:00.000Z\n \n \n \n \n 2023-08-07\n \n \n \n \n roman@unstructured.io\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n Jason Scheirer\n \n \n \n \n \n \n \n This is some \n \n \n \n formatted\n \n \n \n text\n \n \n \n \n \n True\n \n \n \n \n \n Option 2\n \n \n Option 1\n \n \n \n \n \n 32\n \n \n \n \n 1234\n \n \n \n \n option 2\n \n \n \n \n SPRI1-1\n \n \n \n \n Sprint 1\n \n \n \n \n Current\n \n \n \n \n notion://sprints/sprint_task_relation\n \n \n \n \n 4\n \n 
\n \n \n www.google.com" - } -] \ No newline at end of file From 4fe653e281d4ce653e6e16cefd7ff29cd8c4feb0 Mon Sep 17 00:00:00 2001 From: rvztz Date: Wed, 27 Sep 2023 04:10:58 -0600 Subject: [PATCH 19/23] Avoids setting `branch` on `record_locator`if its value is None --- unstructured/ingest/connector/git.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/unstructured/ingest/connector/git.py b/unstructured/ingest/connector/git.py index a6d811d0f0..8807436f45 100644 --- a/unstructured/ingest/connector/git.py +++ b/unstructured/ingest/connector/git.py @@ -39,11 +39,13 @@ def _output_filename(self): @property def record_locator(self) -> t.Dict[str, t.Any]: - return { + record_locator = { "repo_path": self.connector_config.repo_path, "file_path": self.path, - "branch": self.connector_config.branch, } + if self.connector_config.branch is not None: + record_locator["branch"] = self.connector_config.branch + return record_locator def _create_full_tmp_dir_path(self): """includes directories in in the gitlab repository""" From ff02748309d600e15b71ee7eaf631c21f7ae6bd5 Mon Sep 17 00:00:00 2001 From: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Date: Wed, 27 Sep 2023 12:48:21 -0700 Subject: [PATCH 20/23] Adds data source properties to git connectors <- Ingest test fixtures update (#1554) This pull request includes updated ingest test fixtures. Please review and merge if appropriate. 
Co-authored-by: rvztz --- .../github/LICENSE.txt.json | 40 ++++++++++++------- .../github/test.html.json | 24 ++++------- .../60377009-e6b2-47f3-a8ff-f159fd8b69f5.json | 28 ------------- .../d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json | 16 ++++++++ 4 files changed, 49 insertions(+), 59 deletions(-) delete mode 100644 test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json create mode 100644 test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json diff --git a/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json b/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json index d5355cc1ff..6c1bd7d9c5 100644 --- a/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json +++ b/test_unstructured_ingest/expected-structured-output/github/LICENSE.txt.json @@ -8,12 +8,14 @@ "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt", - "branch": null + "file_path": "LICENSE.txt" }, "date_modified": "2010-01-23T23:18:40" }, - "filetype": "text/plain" + "filetype": "text/plain", + "languages": [ + "eng" + ] }, "text": "Downloadify: Client Side File Creation JavaScript + Flash Library" }, @@ -26,12 +28,14 @@ "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt", - "branch": null + "file_path": "LICENSE.txt" }, "date_modified": "2010-01-23T23:18:40" }, - "filetype": "text/plain" + "filetype": "text/plain", + "languages": [ + "eng" + ] }, "text": "Copyright (c) 2009 Douglas C. 
Neiner" }, @@ -44,12 +48,14 @@ "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt", - "branch": null + "file_path": "LICENSE.txt" }, "date_modified": "2010-01-23T23:18:40" }, - "filetype": "text/plain" + "filetype": "text/plain", + "languages": [ + "eng" + ] }, "text": "Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:" }, @@ -62,12 +68,14 @@ "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt", - "branch": null + "file_path": "LICENSE.txt" }, "date_modified": "2010-01-23T23:18:40" }, - "filetype": "text/plain" + "filetype": "text/plain", + "languages": [ + "eng" + ] }, "text": "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." }, @@ -80,12 +88,14 @@ "version": "W/\"2c4f1ab8689a6dfef4ee7d13d4d935cb6663a7e4\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "LICENSE.txt", - "branch": null + "file_path": "LICENSE.txt" }, "date_modified": "2010-01-23T23:18:40" }, - "filetype": "text/plain" + "filetype": "text/plain", + "languages": [ + "eng" + ] }, "text": "THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE." } diff --git a/test_unstructured_ingest/expected-structured-output/github/test.html.json b/test_unstructured_ingest/expected-structured-output/github/test.html.json index a0d0eaabfc..40ce4ce970 100644 --- a/test_unstructured_ingest/expected-structured-output/github/test.html.json +++ b/test_unstructured_ingest/expected-structured-output/github/test.html.json @@ -8,8 +8,7 @@ "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "test.html", - "branch": null + "file_path": "test.html" }, "date_modified": "2010-01-23T23:18:40" }, @@ -27,8 +26,7 @@ "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "test.html", - "branch": null + "file_path": "test.html" }, "date_modified": "2010-01-23T23:18:40" }, @@ -52,8 +50,7 @@ "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "test.html", - "branch": null + "file_path": "test.html" }, "date_modified": "2010-01-23T23:18:40" }, @@ -71,8 +68,7 @@ "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "test.html", - "branch": null + "file_path": "test.html" }, "date_modified": "2010-01-23T23:18:40" }, @@ -90,8 +86,7 @@ "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "test.html", - "branch": null + "file_path": "test.html" }, "date_modified": "2010-01-23T23:18:40" }, @@ -109,8 +104,7 @@ "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { "repo_path": 
"dcneiner/Downloadify", - "file_path": "test.html", - "branch": null + "file_path": "test.html" }, "date_modified": "2010-01-23T23:18:40" }, @@ -128,8 +122,7 @@ "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "test.html", - "branch": null + "file_path": "test.html" }, "date_modified": "2010-01-23T23:18:40" }, @@ -147,8 +140,7 @@ "version": "W/\"c63c8fc21d46d44de85a14a3ed4baec0348ce344\"", "record_locator": { "repo_path": "dcneiner/Downloadify", - "file_path": "test.html", - "branch": null + "file_path": "test.html" }, "date_modified": "2010-01-23T23:18:40" }, diff --git a/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json b/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json deleted file mode 100644 index a12a7d39d6..0000000000 --- a/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json +++ /dev/null @@ -1,28 +0,0 @@ -[ - { - "type": "Title", - "element_id": "c911244e369f9ee203656a820c260e4d", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "Vacation Policy" - }, - { - "type": "NarrativeText", - "element_id": "94bc9e2e465cfac3060a7f7ab8082e89", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 - }, - "text": "💡\n \n Notion Tip: Vacation policies are crucial for employee well-being and productivity. They provide rest and recharge, reduce burnout and increase job satisfaction." 
- } -] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json b/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json new file mode 100644 index 0000000000..cede094b13 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/d3d87fc6-61cc-4bb5-89ed-e9dff0df1526.json @@ -0,0 +1,16 @@ +[ + { + "type": "Table", + "element_id": "2026c17673ac794e40e78d1c8e28df5c", + "metadata": { + "data_source": { + "date_created": "2023-08-02T20:36:00.000Z", + "date_modified": "2023-08-17T18:49:00.000Z" + }, + "filetype": "text/html", + "page_number": 1, + "text_as_html": "






Completed tasks DatesMy Checkbox My Created ByMy Created TimeMy DateMy EmailMy Person My TextMy formula My multiselectMy number My phone numMy select Sprint IDSprint nameSprint statusTasksTotal tasks URL
0 2023-08-14 - 2023-08-27 Roman Isecke 2023-08-02T20:36:00.000Z 2023-08-31 email@custom.domaine DevOps-Bot False Option 1 12 SPRI1-2 Next notion://sprints/sprint_task_relation 1
0 2023-08-28 - 2023-09-10 Roman Isecke 2023-08-02T20:36:00.000Z 2023-08-29T00:00:00.000-04:00 - 2023-08-31T00:00:00.000-04:00 text More text with link False 45666645345465454 option 1 SPRI1-3 Sprint 3 Future notion://sprints/sprint_task_relation 1
0.25 2023-07-31 - 2023-08-13 Roman Isecke 2023-08-02T20:36:00.000Z 2023-08-07 roman@unstructured.io Roman Isecke Jason Scheirer This is someformattedtext TrueOption 2 Option 1 32 1234 option 2 SPRI1-1 Sprint 1 Currentnotion://sprints/sprint_task_relation
4
www.google.com
" + }, + "text": "Completed tasks\n \n \n Dates\n \n \n My Checkbox\n \n \n My Created By\n \n \n My Created Time\n \n \n My Date\n \n \n My Email\n \n \n My Person\n \n \n My Text\n \n \n My formula\n \n \n My multiselect\n \n \n My number\n \n \n My phone num\n \n \n My select\n \n \n Sprint ID\n \n \n Sprint name\n \n \n Sprint status\n \n \n Tasks\n \n \n Total tasks\n \n \n URL\n \n \n \n \n 0\n \n \n \n \n 2023-08-14 - 2023-08-27\n \n \n \n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n 2023-08-02T20:36:00.000Z\n \n \n \n \n 2023-08-31\n \n \n \n \n email@custom.domaine\n \n \n \n \n \n \n DevOps-Bot\n \n \n \n \n \n \n \n \n \n False\n \n \n \n \n \n Option 1\n \n \n \n \n \n 12\n \n \n \n \n \n \n \n \n \n \n SPRI1-2\n \n \n \n \n \n \n \n Next\n \n \n \n \n notion://sprints/sprint_task_relation\n \n \n \n \n 1\n \n \n \n \n \n \n \n \n 0\n \n \n \n \n 2023-08-28 - 2023-09-10\n \n \n \n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n 2023-08-02T20:36:00.000Z\n \n \n \n \n 2023-08-29T00:00:00.000-04:00 - 2023-08-31T00:00:00.000-04:00\n \n \n \n \n text\n \n \n \n \n \n \n \n \n More \n \n \n \n text\n \n \n \n with \n \n \n \n link\n \n \n \n \n \n \n False\n \n \n \n \n \n \n \n \n \n \n 45666645345465454\n \n \n \n \n option 1\n \n \n \n \n SPRI1-3\n \n \n \n \n Sprint 3\n \n \n \n \n Future\n \n \n \n \n notion://sprints/sprint_task_relation\n \n \n \n \n 1\n \n \n \n \n \n \n \n \n 0.25\n \n \n \n \n 2023-07-31 - 2023-08-13\n \n \n \n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n 2023-08-02T20:36:00.000Z\n \n \n \n \n 2023-08-07\n \n \n \n \n roman@unstructured.io\n \n \n \n \n \n \n Roman Isecke\n \n \n \n \n Jason Scheirer\n \n \n \n \n \n \n \n This is some \n \n \n \n formatted\n \n \n \n text\n \n \n \n \n \n True\n \n \n \n \n \n Option 2\n \n \n Option 1\n \n \n \n \n \n 32\n \n \n \n \n 1234\n \n \n \n \n option 2\n \n \n \n \n SPRI1-1\n \n \n \n \n Sprint 1\n \n \n \n \n Current\n \n \n \n \n notion://sprints/sprint_task_relation\n \n \n \n \n 4\n \n 
\n \n \n www.google.com" + } +] \ No newline at end of file From a76a1409fabc50a423f8da9abf9eafb9a2826ea9 Mon Sep 17 00:00:00 2001 From: rvztz Date: Thu, 28 Sep 2023 21:17:39 -0600 Subject: [PATCH 21/23] Adds previously removed expected-structured-output for notion --- .../60377009-e6b2-47f3-a8ff-f159fd8b69f5.json | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json diff --git a/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json b/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json new file mode 100644 index 0000000000..0b592fcd63 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json @@ -0,0 +1,28 @@ +[ + { + "type": "Title", + "element_id": "c911244e369f9ee203656a820c260e4d", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "Vacation Policy" + }, + { + "type": "NarrativeText", + "element_id": "94bc9e2e465cfac3060a7f7ab8082e89", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" + }, + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: Vacation policies are crucial for employee well-being and productivity. They provide rest and recharge, reduce burnout and increase job satisfaction." 
+ } + ] \ No newline at end of file From 04ad11f3ed86c14e7a4bbffcf3cff4a97d913987 Mon Sep 17 00:00:00 2001 From: rvztz Date: Thu, 28 Sep 2023 21:22:00 -0600 Subject: [PATCH 22/23] removes additional spaces --- .../60377009-e6b2-47f3-a8ff-f159fd8b69f5.json | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json b/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json index 0b592fcd63..a12a7d39d6 100644 --- a/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json +++ b/test_unstructured_ingest/expected-structured-output/notion/60377009-e6b2-47f3-a8ff-f159fd8b69f5.json @@ -1,28 +1,28 @@ [ - { - "type": "Title", - "element_id": "c911244e369f9ee203656a820c260e4d", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 + { + "type": "Title", + "element_id": "c911244e369f9ee203656a820c260e4d", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" }, - "text": "Vacation Policy" + "filetype": "text/html", + "page_number": 1 }, - { - "type": "NarrativeText", - "element_id": "94bc9e2e465cfac3060a7f7ab8082e89", - "metadata": { - "data_source": { - "date_created": "2023-08-04T18:31:00.000Z", - "date_modified": "2023-08-04T18:31:00.000Z" - }, - "filetype": "text/html", - "page_number": 1 + "text": "Vacation Policy" + }, + { + "type": "NarrativeText", + "element_id": "94bc9e2e465cfac3060a7f7ab8082e89", + "metadata": { + "data_source": { + "date_created": "2023-08-04T18:31:00.000Z", + "date_modified": "2023-08-04T18:31:00.000Z" }, - "text": "💡\n \n Notion Tip: Vacation policies are crucial for employee well-being and productivity. 
They provide rest and recharge, reduce burnout and increase job satisfaction." - } - ] \ No newline at end of file + "filetype": "text/html", + "page_number": 1 + }, + "text": "💡\n \n Notion Tip: Vacation policies are crucial for employee well-being and productivity. They provide rest and recharge, reduce burnout and increase job satisfaction." + } +] \ No newline at end of file From fa77878c6cbfe5899f4cbf2f746b33fa2132e519 Mon Sep 17 00:00:00 2001 From: rvztz Date: Tue, 3 Oct 2023 16:16:43 -0600 Subject: [PATCH 23/23] version bump --- CHANGELOG.md | 6 +++--- unstructured/__version__.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d5ae77ffe1..5d77b62931 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,11 @@ -## 0.10.19-dev9 +## 0.10.19-dev10 ### Enhancements * **bump `unstructured-inference` to `0.6.6`** The updated version of `unstructured-inference` makes table extraction in `hi_res` mode configurable to fine tune table extraction performance; it also improves element detection by adding a deduplication post processing step in the `hi_res` partitioning of pdfs and images. * **Detect text in HTML Heading Tags as Titles** This will increase the accuracy of hierarchies in HTML documents and provide more accurate element categorization. If text is in an HTML heading tag and is not a list item, address, or narrative text, categorize it as a title. * **Update python-based docs** Refactor docs to use the actual unstructured code rather than using the subprocess library to run the cli command itself. -* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. 
a link to a GDrive doc, Salesforce record, etc. +* **Adds data source properties to Github and Gitlab connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Adds Table support for the `add_chunking_strategy` decorator to partition functions.** In addition to combining elements under Title elements, users can now specify the `max_characters=` argument to chunk Table elements into TableChunk elements with `text` and `text_as_html` of length `max_characters` characters. This means partitioned Table results are ready for use in downstream applications without any post processing. * **Expose endpoint url for s3 connectors** By allowing for the endpoint url to be explicitly overwritten, this allows for any non-AWS data providers supporting the s3 protocol to be supported (e.g. MinIO). @@ -27,7 +27,7 @@ * **Better detection of natural reading order in images and PDFs** The elements returned by partition better reflect natural reading order in some cases, particularly in complicated multi-column layouts, leading to better chunking and retrieval for downstream applications. Achieved by improving the `xy-cut` sorting to preprocess bboxes, shrinking all bounding boxes by 90% along x and y axes (still centered around the same center point), which allows projection lines to be drawn where not possible before if layout bboxes overlapped. * **Improves `partition_xml` to be faster and more memory efficient when partitioning large XML files** The new behavior is to partition iteratively to prevent loading the entire XML tree into memory at once in most use cases. 
-* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, Slack, DeltaTable, Github and Gitlab connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. +* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, Slack, and DeltaTable connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Add functionality to save embedded images in PDFs separately as images** This allows users to save embedded images in PDFs separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits." * **Azure Cognitive Search destination connector** New Azure Cognitive Search destination connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data from over 20 data sources (so far) to an Azure Cognitive Search index. * **Improves salesforce partitioning** Partitions Salesforce data as xml instead of text for improved detail and flexibility. Partitions htmlbody instead of textbody for Salesforce emails. Importance: Allows all Salesforce fields to be ingested and gives Salesforce emails more detailed partitioning. 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d71d465e92..3d63527b85 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.19-dev9" # pragma: no cover +__version__ = "0.10.19-dev10" # pragma: no cover