Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
MthwRobinson authored May 16, 2024
2 parents 66fc7ec + 0de9215 commit 7898923
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 16 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.13.8-dev13
## 0.13.8-dev14

### Enhancements

Expand All @@ -19,6 +19,7 @@
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
* **Improve CSV delimiter detection.** `partition_csv()` would raise on CSV files with very long lines.
* **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.
* **Fix possible `SyntaxError` or `SyntaxWarning` on regex patterns.** Change regex patterns to raw strings to avoid these warnings/errors in Python 3.11+.

## 0.13.7

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ lint.select = [
"UP018", # -- Unnecessary {literal_type} call like `str("abc")`. (rewrite as a literal) --
"UP032", # -- Use f-string instead of `.format()` call --
"UP034", # -- Avoid extraneous parentheses --
"W", # -- Warnings, including invalid escape-sequence --
]
lint.ignore = [
"COM812", # -- over aggressively insists on trailing commas where not desireable --
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.8-dev13" # pragma: no cover
__version__ = "0.13.8-dev14" # pragma: no cover
19 changes: 7 additions & 12 deletions unstructured/nlp/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@
"\u29BF",
"\u002D",
"",
"\*", # noqa: W605 NOTE(robinson) - skipping qa because we need the escape for the regex
r"\*",
"\x95",
"·",
]
Expand All @@ -76,7 +76,7 @@

# Helps split text by paragraphs. There must be one newline, with potential whitespace
# (including \r and \n chars) on either side
PARAGRAPH_PATTERN = r"\s*\n\s*" # noqa: W605 NOTE(harrell)
PARAGRAPH_PATTERN = r"\s*\n\s*"

PARAGRAPH_PATTERN_RE = re.compile(
f"((?:{BULLETS_PATTERN})|{PARAGRAPH_PATTERN})(?!{BULLETS_PATTERN}|$)",
Expand All @@ -94,28 +94,23 @@

# IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
IP_ADDRESS_PATTERN = (
"[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}", # noqa: W605 NOTE(harrell)
# - skipping qa because we need the escape for the regex
r"[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}",
"[a-z0-9]{4}::[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}%?[0-9]*",
)
IP_ADDRESS_PATTERN_RE = re.compile(f"({'|'.join(IP_ADDRESS_PATTERN)})")

IP_ADDRESS_NAME_PATTERN = "[a-zA-Z0-9-]*\.[a-zA-Z]*\.[a-zA-Z]*" # noqa: W605 NOTE(harrell)
# - skipping qa because we need the escape for the regex
IP_ADDRESS_NAME_PATTERN = r"[a-zA-Z0-9-]*\.[a-zA-Z]*\.[a-zA-Z]*"

# Mapi ID example: 32.88.5467.123
MAPI_ID_PATTERN = "[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;" # noqa: W605 NOTE(harrell)
# - skipping qa because we need the escape for the regex
MAPI_ID_PATTERN = r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;"

# Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
# NOTE(harrell) - skipping qa because we need the escape for the regex
EMAIL_DATETIMETZ_PATTERN = (
r"[A-Za-z]{3},\s\d{1,2}\s[A-Za-z]{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s[+-]\d{4}" # noqa: W605,E501
r"[A-Za-z]{3},\s\d{1,2}\s[A-Za-z]{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s[+-]\d{4}"
)
EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)

EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" # noqa: W605 NOTE(harrell)
# - skipping qa because we need the escape for the regex
EMAIL_ADDRESS_PATTERN = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN)

ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
Expand Down
4 changes: 2 additions & 2 deletions unstructured/partition/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def _parse_received_data(data: str) -> list[Element]:
def _parse_email_address(data: str) -> tuple[str, str]:
email_address = extract_email_address(data)

PATTERN = "<[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+>" # noqa: W605 Note(harrell)
PATTERN = r"<[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+>"
name = re.split(PATTERN, data.lower())[0].title().strip()

return name, email_address[0]
Expand Down Expand Up @@ -224,7 +224,7 @@ def extract_attachment_info(


def has_embedded_image(element):
PATTERN = re.compile("\[image: .+\]") # noqa: W605 NOTE(harrell)
PATTERN = re.compile(r"\[image: .+\]")
return PATTERN.search(element.text)


Expand Down

0 comments on commit 7898923

Please sign in to comment.