Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
Klaijan committed Oct 6, 2023
2 parents cfb766a + e450342 commit 31b3098
Show file tree
Hide file tree
Showing 16 changed files with 802 additions and 803 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

### Enhancements

* **Align to top left when shrinking bounding boxes for `xy-cut` sorting:** Update `shrink_bbox()` to keep the top-left corner fixed rather than the center
* **Add visualization script to annotate elements** This script is often used to analyze/visualize elements with coordinates (e.g. partition_pdf()).
* **Adds data source properties to the Jira connector** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
* **Improve title detection in pptx documents** The default title textboxes on a pptx slide are now categorized as titles.
Expand Down
8 changes: 4 additions & 4 deletions test_unstructured/partition/pdf-image/test_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,9 @@ def test_partition_pdf_with_auto_strategy(
):
elements = pdf.partition_pdf(filename=filename, strategy="auto")
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[7].text == title
assert elements[7].metadata.filename == "layout-parser-paper-fast.pdf"
assert elements[7].metadata.file_directory == "example-docs"
assert elements[6].text == title
assert elements[6].metadata.filename == "layout-parser-paper-fast.pdf"
assert elements[6].metadata.file_directory == "example-docs"


def test_partition_pdf_with_page_breaks(
Expand Down Expand Up @@ -521,7 +521,7 @@ def test_partition_pdf_with_auto_strategy_exclude_metadata(
include_metadata=False,
)
title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
assert elements[7].text == title
assert elements[6].text == title
for i in range(len(elements)):
assert elements[i].metadata.to_dict() == {}

Expand Down
10 changes: 5 additions & 5 deletions test_unstructured/partition/utils/test_sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,12 +116,12 @@ def test_coordinates_to_bbox():


def test_shrink_bbox():
bbox = (0, 0, 100, 100)
shrink_factor = 0.5
expected_result = (25, 25, 75, 75)
bbox = (0, 0, 200, 100)
shrink_factor = 0.9
expected_result = (0, 0, 180, 90)
assert shrink_bbox(bbox, shrink_factor) == expected_result

bbox = (0, 0, 200, 100)
bbox = (20, 20, 320, 120)
shrink_factor = 0.9
expected_result = (10, 5, 190, 95)
expected_result = (20, 20, 290, 110)
assert shrink_bbox(bbox, shrink_factor) == expected_result

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,23 +1,4 @@
[
{
"type": "UncategorizedText",
"element_id": "e16bce609163ec96985ae522ca81502a",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
"version": 328871203465633719836776597535876541325,
"record_locator": {
"protocol": "abfs",
"remote_file_path": "container1/IRS-form-1987.png"
},
"date_created": "2023-03-10T09:44:55+00:00",
"date_modified": "2023-03-10T09:44:55+00:00"
},
"filetype": "image/png",
"page_number": 1
},
"text": "‘A."
},
{
"type": "Title",
"element_id": "92405c82f76df8b2cbbc6047bd10e0ff",
Expand Down Expand Up @@ -341,6 +322,25 @@
},
"text": "Generally, applicants must complete Section In addition, complete the appropriate sections (B:1 through H) for which a change is desired."
},
{
"type": "UncategorizedText",
"element_id": "e16bce609163ec96985ae522ca81502a",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
"version": 328871203465633719836776597535876541325,
"record_locator": {
"protocol": "abfs",
"remote_file_path": "container1/IRS-form-1987.png"
},
"date_created": "2023-03-10T09:44:55+00:00",
"date_modified": "2023-03-10T09:44:55+00:00"
},
"filetype": "image/png",
"page_number": 1
},
"text": "‘A."
},
{
"type": "NarrativeText",
"element_id": "bf2a070cb9d03d056e70b26bebf1ef79",
Expand Down Expand Up @@ -513,8 +513,8 @@
"text": "You should normally receive an acknowledgment of receipt of your application within 30 days. If you do not hear from IRS within 30 days of submitting your completed Form 3115, you may inquire as to the receipt of your application by writing to: Control Clerk, CC:C:4, Internal Revenue Service, Room 5040, 1111 Constitution Avenue, NW, Washington, DC 20224."
},
{
"type": "Title",
"element_id": "ea325d761f98c6b73320e442b67f2a35",
"type": "NarrativeText",
"element_id": "e3e2ccf4f0d1524d4f5ce42e8f2d1efa",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
Expand All @@ -529,11 +529,11 @@
"filetype": "image/png",
"page_number": 1
},
"text": "an"
"text": "See section 5.03 of Rev. Proc. 84-74 for filing early application,"
},
{
"type": "NarrativeText",
"element_id": "e3e2ccf4f0d1524d4f5ce42e8f2d1efa",
"type": "Title",
"element_id": "ea325d761f98c6b73320e442b67f2a35",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
Expand All @@ -548,7 +548,7 @@
"filetype": "image/png",
"page_number": 1
},
"text": "See section 5.03 of Rev. Proc. 84-74 for filing early application,"
"text": "an"
},
{
"type": "NarrativeText",
Expand Down Expand Up @@ -646,8 +646,8 @@
"text": "Individuals. —An individual should enter his or her social security number in this block. If the application is made on behalf of a husband and wife who file their income tax return jointly, enter the social security numbers of both."
},
{
"type": "Title",
"element_id": "ea325d761f98c6b73320e442b67f2a35",
"type": "NarrativeText",
"element_id": "e72d9c8a779a47796c4362b7885aa80b",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
Expand All @@ -662,11 +662,11 @@
"filetype": "image/png",
"page_number": 1
},
"text": "an"
"text": "Others.-—The employer identification number applicant other than an individual should be entered in this block,"
},
{
"type": "NarrativeText",
"element_id": "e72d9c8a779a47796c4362b7885aa80b",
"type": "Title",
"element_id": "ea325d761f98c6b73320e442b67f2a35",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
Expand All @@ -681,7 +681,7 @@
"filetype": "image/png",
"page_number": 1
},
"text": "Others.-—The employer identification number applicant other than an individual should be entered in this block,"
"text": "an"
},
{
"type": "Title",
Expand Down Expand Up @@ -855,8 +855,8 @@
"text": "Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page 6."
},
{
"type": "Title",
"element_id": "ca978112ca1bbdcafac231b39a23dc4d",
"type": "NarrativeText",
"element_id": "8200352b4e91b1be4f14e9248d50380a",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
Expand All @@ -871,11 +871,11 @@
"filetype": "image/png",
"page_number": 1
},
"text": "a"
"text": "Ifthe individual or firm is also authorized to represent the applicant before the IRS, receive copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s)."
},
{
"type": "NarrativeText",
"element_id": "8200352b4e91b1be4f14e9248d50380a",
"type": "Title",
"element_id": "ca978112ca1bbdcafac231b39a23dc4d",
"metadata": {
"data_source": {
"url": "abfs://container1/IRS-form-1987.png",
Expand All @@ -890,7 +890,7 @@
"filetype": "image/png",
"page_number": 1
},
"text": "Ifthe individual or firm is also authorized to represent the applicant before the IRS, receive copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s)."
"text": "a"
},
{
"type": "Title",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,24 +250,24 @@
"text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457"
},
{
"type": "NarrativeText",
"element_id": "6928b78d26af54b6acb804ed319b5c05",
"type": "Table",
"element_id": "5eb814dac721c11581f011fbca57a17e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
"text": "How data were acquired"
"text": "How data were acquired Data format Experimental factors Experimental features Data source location Accessibility Related research article The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO, solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225-230."
},
{
"type": "Table",
"element_id": "5eb814dac721c11581f011fbca57a17e",
"type": "NarrativeText",
"element_id": "6928b78d26af54b6acb804ed319b5c05",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
"text": "How data were acquired Data format Experimental factors Experimental features Data source location Accessibility Related research article The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO, solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225-230."
"text": "How data were acquired"
},
{
"type": "NarrativeText",
Expand Down Expand Up @@ -419,6 +419,16 @@
},
"text": "O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457"
},
{
"type": "Image",
"element_id": "84d160dc9075c76de6f6d6c3f2651fe3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": " Corrosion rate (mm/year) 24 48 72 96 120 144 168 192 Exposure time"
},
{
"type": "NarrativeText",
"element_id": "4f0139b605dfdd9eb93e920a6115e1b5",
Expand Down Expand Up @@ -449,16 +459,6 @@
},
"text": "i"
},
{
"type": "Image",
"element_id": "84d160dc9075c76de6f6d6c3f2651fe3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": " Corrosion rate (mm/year) 24 48 72 96 120 144 168 192 Exposure time"
},
{
"type": "Title",
"element_id": "239bb77f5ec344ce5e614979b8c49742",
Expand Down Expand Up @@ -621,43 +621,43 @@
},
{
"type": "Title",
"element_id": "bcf00b4904f5661d6baef52e7e09e9b1",
"element_id": "362d4a20958df0c88550b9e5d1f2ef5b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "bc (V/dec)"
"text": "Inhibitor concentration (g)"
},
{
"type": "Title",
"element_id": "12e486f4a9b3a1805bf7e95b5d01847b",
"element_id": "bcf00b4904f5661d6baef52e7e09e9b1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "ba (V/dec)"
"text": "bc (V/dec)"
},
{
"type": "Title",
"element_id": "7bc31ed7ab5a625735657499f636c1f2",
"element_id": "12e486f4a9b3a1805bf7e95b5d01847b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "Ecorr (V)"
"text": "ba (V/dec)"
},
{
"type": "Title",
"element_id": "362d4a20958df0c88550b9e5d1f2ef5b",
"element_id": "7bc31ed7ab5a625735657499f636c1f2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "Inhibitor concentration (g)"
"text": "Ecorr (V)"
},
{
"type": "Title",
Expand Down Expand Up @@ -971,23 +971,23 @@
},
{
"type": "UncategorizedText",
"element_id": "33a2b57b388470db1cb13defbe73dc18",
"element_id": "825c6ae49ec498c873be5355109ca093",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 6
},
"text": "(cid:3)"
"text": "(cid:1) Þ ¼ 87:6W DAT"
},
{
"type": "UncategorizedText",
"element_id": "825c6ae49ec498c873be5355109ca093",
"element_id": "33a2b57b388470db1cb13defbe73dc18",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 6
},
"text": "(cid:1) Þ ¼ 87:6W DAT"
"text": "(cid:3)"
},
{
"type": "NarrativeText",
Expand Down
Loading

0 comments on commit 31b3098

Please sign in to comment.