Skip to content

Commit

Permalink
Merge branch 'main' into klaijan/remove-files-copy-from-ingest
Browse files Browse the repository at this point in the history
  • Loading branch information
Klaijan authored Nov 3, 2023
2 parents a886abf + ba4477a commit f943496
Show file tree
Hide file tree
Showing 26 changed files with 1,196 additions and 1,854 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
## 0.10.29-dev11
## 0.10.29-dev12

### Enhancements

* **Add include_header argument for partition_csv and partition_tsv** Now supports retaining header rows when partitioning CSV and TSV documents into elements.
* **Add retry logic for all source connectors** All http calls being made by the ingest source connectors have been isolated and wrapped by the `SourceConnectionNetworkError` custom error, which triggers the retry logic, if enabled, in the ingest pipeline.
* **Google Drive source connector supports credentials from memory** Originally, the connector expected a filepath to pull the credentials from when creating the client. This was expanded to support passing that information from memory as a dict if access to the file system might not be available.
* **Add support for generic partition configs in ingest cli** Along with the explicit partition options supported by the cli, an `additional_partition_args` arg was added to allow users to pass in any other arguments that should be added when calling partition(). This helps keep any changes to the input parameters of partition() exposed in the CLI.
* **Map full output schema for table-based destination connectors** A full schema was introduced to map the type of all output content from the JSON partition output, which is then mapped to a flattened table structure to leverage table-based destination connectors. The delta table destination connector has been updated to take advantage of this.

### Features

Expand Down
74 changes: 74 additions & 0 deletions test_unstructured/staging/test_base_staging.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,17 @@
from unstructured.documents.elements import (
Address,
CheckBox,
CoordinatesMetadata,
CoordinateSystem,
DataSourceMetadata,
ElementMetadata,
FigureCaption,
Image,
Link,
ListItem,
NarrativeText,
PageBreak,
RegexMetadata,
Text,
Title,
)
Expand Down Expand Up @@ -102,6 +107,75 @@ def test_convert_to_dataframe_maintains_fields(
assert "regex_metadata_punc" in df.columns


def test_default_pandas_dtypes():
    """
    Make sure that all the values that can exist on an element have a corresponding dtype
    mapped in the dict returned by get_default_pandas_dtypes()

    A `Text` element is built with a value for each field listed below, serialized with
    `to_dict()`, and its nested metadata is flattened into the top-level dict. Every key of
    the flattened dict must then appear in the default dtype mapping. All keys without a
    dtype are reported together so a single run shows the full gap.
    """
    full_element = Text(
        text="some text",
        element_id="123",
        coordinates=((1, 2), (3, 4)),
        coordinate_system=CoordinateSystem(width=12.3, height=99.4),
        detection_origin="some origin",
        embeddings=[1.1, 2.2, 3.3, 4.4],
        metadata=ElementMetadata(
            coordinates=CoordinatesMetadata(
                points=((1, 2), (3, 4)),
                system=CoordinateSystem(width=12.3, height=99.4),
            ),
            data_source=DataSourceMetadata(
                url="http://mysite.com",
                version="123",
                record_locator={"some": "data", "value": 3},
                date_created="then",
                date_processed="now",
                date_modified="before",
                permissions_data=[{"data": 1}, {"data": 2}],
            ),
            filename="filename",
            file_directory="file_directory",
            last_modified="last_modified",
            filetype="filetype",
            attached_to_filename="attached_to_filename",
            parent_id="parent_id",
            category_depth=1,
            image_path="image_path",
            languages=["eng", "spa"],
            page_number=1,
            page_name="page_name",
            url="url",
            link_urls=["links", "url"],
            link_texts=["links", "texts"],
            links=[Link(text="text", url="url", start_index=1)],
            sent_from=["sent", "from"],
            sent_to=["sent", "to"],
            subject="subject",
            section="section",
            header_footer_type="header_footer_type",
            emphasized_text_contents=["emphasized", "text", "contents"],
            emphasized_text_tags=["emphasized", "text", "tags"],
            text_as_html="text_as_html",
            regex_metadata={"key": [RegexMetadata(text="text", start=0, end=4)]},
            max_characters=2,
            is_continuation=True,
            detection_class_prob=0.5,
        ),
    )
    element_as_dict = full_element.to_dict()
    # Replace the nested "metadata" entry with its flattened key/value pairs.
    # NOTE(review): keys_to_omit presumably keeps the free-form record_locator dict from
    # being expanded into per-key columns -- confirm against base.flatten_dict.
    flattened_metadata = base.flatten_dict(
        element_as_dict.pop("metadata"),
        keys_to_omit=["data_source_record_locator"],
    )
    element_as_dict.update(flattened_metadata)
    default_dtypes = base.get_default_pandas_dtypes()
    # Gather every missing key before asserting, instead of stopping at the first one.
    missing_keys = [key for key in element_as_dict if key not in default_dtypes]
    assert not missing_keys, f"keys without a default pandas dtype: {missing_keys}"


@pytest.mark.skipif(
platform.system() == "Windows",
reason="Posix Paths are not available on Windows",
Expand Down
1 change: 0 additions & 1 deletion test_unstructured_ingest/dest/delta-table.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--input-path example-docs/fake-memo.pdf \
--work-dir "$WORK_DIR" \
delta-table \
--write-column json_data \
--table-uri "$DESTINATION_TABLE"

python "$SCRIPT_DIR"/python/test-ingest-delta-table-output.py --table-uri "$DESTINATION_TABLE"
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "Core Skills for Biomedical Data Scientists"
},
Expand All @@ -30,8 +29,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "Maryam Zaringhalam, PhD, AAAS Science & Technology Policy Fellow"
},
Expand All @@ -48,8 +46,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "Lisa Federer, MLIS, Data Science Training Coordinator"
},
Expand All @@ -66,8 +63,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "Michael F. Huerta, PhD, Associate Director of NLM for Program Development and NLM Coordinator of Data Science and Open Science Initiatives"
},
Expand All @@ -84,8 +80,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "Executive Summary"
},
Expand All @@ -102,8 +97,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs. Suggested high-level core skills include:"
},
Expand All @@ -120,8 +114,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;"
},
Expand All @@ -138,8 +131,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "2. Programming language expertise: biomedical data scientists should be fluent in at"
},
Expand All @@ -156,8 +148,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "least one programming language (typically R and/or Python);"
},
Expand All @@ -174,8 +165,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;"
},
Expand All @@ -192,8 +182,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science."
},
Expand All @@ -210,8 +199,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy."
},
Expand All @@ -228,8 +216,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "The report further details specific skills and expertise relevant to biomedical data scientists."
},
Expand All @@ -246,8 +233,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "Motivation"
},
Expand All @@ -264,8 +250,7 @@
]
},
"filetype": "application/pdf",
"page_number": 1,
"links": []
"page_number": 1
},
"text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with"
},
Expand All @@ -282,8 +267,7 @@
]
},
"filetype": "application/pdf",
"page_number": 2,
"links": []
"page_number": 2
},
"text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________"
},
Expand All @@ -300,8 +284,7 @@
]
},
"filetype": "application/pdf",
"page_number": 2,
"links": []
"page_number": 2
},
"text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce."
},
Expand All @@ -318,8 +301,7 @@
]
},
"filetype": "application/pdf",
"page_number": 2,
"links": []
"page_number": 2
},
"text": "Methodology"
},
Expand All @@ -336,8 +318,7 @@
]
},
"filetype": "application/pdf",
"page_number": 2,
"links": []
"page_number": 2
},
"text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:"
},
Expand All @@ -354,8 +335,7 @@
]
},
"filetype": "application/pdf",
"page_number": 2,
"links": []
"page_number": 2
},
"text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use."
},
Expand All @@ -372,8 +352,7 @@
]
},
"filetype": "application/pdf",
"page_number": 2,
"links": []
"page_number": 2
},
"text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A."
},
Expand All @@ -390,8 +369,7 @@
]
},
"filetype": "application/pdf",
"page_number": 2,
"links": []
"page_number": 2
},
"text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad."
},
Expand All @@ -408,8 +386,7 @@
]
},
"filetype": "application/pdf",
"page_number": 2,
"links": []
"page_number": 2
},
"text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist."
},
Expand All @@ -426,8 +403,7 @@
]
},
"filetype": "application/pdf",
"page_number": 2,
"links": []
"page_number": 2
},
"text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017"
},
Expand All @@ -444,8 +420,7 @@
]
},
"filetype": "application/pdf",
"page_number": 2,
"links": []
"page_number": 2
},
"text": "2"
}
Expand Down
Loading

0 comments on commit f943496

Please sign in to comment.