fix: use raw strings for regex patterns

Avoid `SyntaxWarning` and/or `SyntaxError` messages when importing `unstructured.nlp.patterns` by using raw strings (the `r` prefix, e.g. `r"\."`) for regex patterns. These patterns contain backslash sequences such as `\.` and `\*`, which the Python parser does not recognize as valid escape sequences in normal (non-raw) strings.
Unstructured-IO · May 15, 2024 · 8d5a0bc · 8d5a0bc
1 parent a164b01
commit 8d5a0bc
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 16 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.13.8-dev10
+## 0.13.8-dev11
 
 ### Enhancements
 
@@ -16,6 +16,7 @@
 * **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
 * **Improve CSV delimiter detection.** `partition_csv()` would raise on CSV files with very long lines.
 * **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.
+* **Fix possible `SyntaxError` or `SyntaxWarning` on regex patterns.** Change regex patterns to raw strings to avoid these warnings/errors in Python 3.11+.
 
 ## 0.13.7
 

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.8-dev10"  # pragma: no cover
+__version__ = "0.13.8-dev11"  # pragma: no cover
diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py
@@ -53,7 +53,7 @@
     "\u29BF",
     "\u002D",
     "",
-    "\*",  # noqa: W605 NOTE(robinson) - skipping qa because we need the escape for the regex
+    r"\*",
     "\x95",
     "·",
 ]
@@ -76,7 +76,7 @@
 
 # Helps split text by paragraphs. There must be one newline, with potential whitespace
 # (including \r and \n chars) on either side
-PARAGRAPH_PATTERN = r"\s*\n\s*"  # noqa: W605 NOTE(harrell)
+PARAGRAPH_PATTERN = r"\s*\n\s*"
 
 PARAGRAPH_PATTERN_RE = re.compile(
     f"((?:{BULLETS_PATTERN})|{PARAGRAPH_PATTERN})(?!{BULLETS_PATTERN}|$)",
@@ -94,28 +94,23 @@
 
 # IP Address examples: ba23::58b5:2236:45g2:88h2 or 10.0.2.01
 IP_ADDRESS_PATTERN = (
-    "[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}",  # noqa: W605 NOTE(harrell)
-    # - skipping qa because we need the escape for the regex
+    r"[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}\.[0-9]{1,2}",
     "[a-z0-9]{4}::[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}:[a-z0-9]{4}%?[0-9]*",
 )
 IP_ADDRESS_PATTERN_RE = re.compile(f"({'|'.join(IP_ADDRESS_PATTERN)})")
 
-IP_ADDRESS_NAME_PATTERN = "[a-zA-Z0-9-]*\.[a-zA-Z]*\.[a-zA-Z]*"  # noqa: W605 NOTE(harrell)
-# - skipping qa because we need the escape for the regex
+IP_ADDRESS_NAME_PATTERN = r"[a-zA-Z0-9-]*\.[a-zA-Z]*\.[a-zA-Z]*"
 
 # Mapi ID example: 32.88.5467.123
-MAPI_ID_PATTERN = "[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;"  # noqa: W605 NOTE(harrell)
-# - skipping qa because we need the escape for the regex
+MAPI_ID_PATTERN = r"[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*;"
 
 # Date, time, timezone example: Fri, 26 Mar 2021 11:04:09 +1200
-# NOTE(harrell) - skipping qa because we need the escape for the regex
 EMAIL_DATETIMETZ_PATTERN = (
-    r"[A-Za-z]{3},\s\d{1,2}\s[A-Za-z]{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s[+-]\d{4}"  # noqa: W605,E501
+    r"[A-Za-z]{3},\s\d{1,2}\s[A-Za-z]{3}\s\d{4}\s\d{2}:\d{2}:\d{2}\s[+-]\d{4}"
 )
 EMAIL_DATETIMETZ_PATTERN_RE = re.compile(EMAIL_DATETIMETZ_PATTERN)
 
-EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"  # noqa: W605 NOTE(harrell)
-# - skipping qa because we need the escape for the regex
+EMAIL_ADDRESS_PATTERN = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
 EMAIL_ADDRESS_PATTERN_RE = re.compile(EMAIL_ADDRESS_PATTERN)
 
 ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"

diff --git a/unstructured/partition/email.py b/unstructured/partition/email.py
@@ -91,7 +91,7 @@ def _parse_received_data(data: str) -> list[Element]:
 def _parse_email_address(data: str) -> tuple[str, str]:
     email_address = extract_email_address(data)
 
-    PATTERN = "<[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+>"  # noqa: W605 Note(harrell)
+    PATTERN = r"<[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+>"
     name = re.split(PATTERN, data.lower())[0].title().strip()
 
     return name, email_address[0]
@@ -224,7 +224,7 @@ def extract_attachment_info(
 
 
 def has_embedded_image(element):
-    PATTERN = re.compile("\[image: .+\]")  # noqa: W605 NOTE(harrell)
+    PATTERN = re.compile(r"\[image: .+\]")
     return PATTERN.search(element.text)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.13.8-dev10" # pragma: no cover
		__version__ = "0.13.8-dev11" # pragma: no cover