Skip to content

Commit

Permalink
Add XML tag detection function
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilipMay committed Jan 5, 2024
1 parent 7b88367 commit ffb97a5
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 0 deletions.
13 changes: 13 additions & 0 deletions mltb2/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,19 @@

# Matches a run of two or more consecutive space characters.
MULTI_SPACE_PATTERN: Pattern = re.compile(r" {2,}")

# Matches a single XML tag: opening (``<tag>``), closing (``</tag>``) or
# self-closing (``<tag/>``, ``<tag />``), optionally carrying quoted attributes.
# The first name character is kept as permissive as before (``[\w:]``) so that
# everything the earlier, attribute-less pattern matched is still matched.
XML_TAG_PATTERN: Pattern = re.compile(
    r"<\/?[\w:][\w:.\-]*"  # "<" or "</" followed by the (optionally namespaced) tag name
    r"""(?:\s+[\w:.\-]+\s*=\s*(?:"[^"<]*"|'[^'<]*'))*"""  # zero or more name="value" attributes
    r"\s*\/?>"  # optional whitespace, optional "/" of a self-closing tag, then ">"
)


def has_xml_tag(text: str) -> bool:
    """Check if text contains XML tags (one or multiple).

    Opening, closing and self-closing tags are detected, including tags whose
    name contains ``:``, ``-`` or ``.`` and tags that carry single- or
    double-quoted attributes. Comments, processing instructions, CDATA
    sections and attributes with unquoted values are not treated as tags.

    Args:
        text: The text to check.

    Returns:
        ``True`` if the text contains XML tags, ``False`` otherwise.
    """
    # ``search`` scans the whole string, so a tag anywhere in the text counts.
    return XML_TAG_PATTERN.search(text) is not None


def remove_invisible_characters(text: str) -> str:
"""Remove invisible characters from text.
Expand Down
36 changes: 36 additions & 0 deletions tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
clean_all_invisible_chars_and_whitespaces,
has_invisible_characters,
has_special_whitespaces,
has_xml_tag,
remove_invisible_characters,
replace_multiple_whitespaces,
replace_special_whitespaces,
Expand Down Expand Up @@ -229,3 +230,38 @@ def test_normalize_counter_to_defaultdict_empty_counter():

assert isinstance(normalized_counter, defaultdict)
assert len(normalized_counter) == 0


@pytest.mark.parametrize(
    "text",
    (
        "Some text<ta_g>more text",
        "Some text<ta:g>more text",
        "Some text</tag>more text",
        "Some text<tag/>more text",
        "Some text<tag />more text",
    ),
)
def test_has_xml_tag_with_tags(text: str):
    """Each sample embeds exactly one tag variant and must be detected."""
    detected = has_xml_tag(text)
    assert detected


@pytest.mark.parametrize(
    "text",
    (
        "Some text",
        "",
        "a < b but x > y",
    ),
)
def test_has_xml_tag_without_tags(text: str):
    """Plain text, the empty string and loose angle brackets are not tags."""
    detected = has_xml_tag(text)
    assert not detected


@settings(max_examples=1000)
@given(text())
def test_has_xml_tag_hypothesis(text: str):
    """Property check: a positive result implies both angle brackets are present."""
    if not has_xml_tag(text):
        # Nothing to verify for inputs that are reported as tag-free.
        return
    assert "<" in text
    assert ">" in text

0 comments on commit ffb97a5

Please sign in to comment.