-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add remove_invisible_chars * add sphinx doc for text module * improve docstring * add replace_special_whitespaces * extend SPECIAL_WHITESPACES * extend SPECIAL_WHITESPACES * fix type * add tests
- Loading branch information
Showing
3 changed files
with
121 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
.. _text_code_doc:

:mod:`~mltb2.text`
==================

.. automodule:: mltb2.text
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Copyright (c) 2023 Philip May | ||
# This software is distributed under the terms of the MIT license | ||
# which is available at https://opensource.org/licenses/MIT | ||
|
||
"""Text specific functionality.""" | ||
|
||
from typing import Dict, Final, Tuple | ||
|
||
# Characters without a visible glyph. They are deleted outright (not replaced)
# when a text is translated with ``INVISIBLE_CHARACTERS_TRANS``.
INVISIBLE_CHARACTERS: Final[Tuple[str, ...]] = (
    "\u200b",  # Zero Width Space (ZWSP) https://www.compart.com/en/unicode/U+200b
    "\u00ad",  # Soft Hyphen (SHY) https://www.compart.com/en/unicode/U+00ad
    # TODO: what about:
    # https://www.compart.com/en/unicode/U+2028
    # https://www.compart.com/en/unicode/U+2029
)

# Translation table for ``str.translate``: a code point mapped to ``None`` is
# removed from the text.
INVISIBLE_CHARACTERS_TRANS: Final[Dict[int, None]] = str.maketrans(
    {char: None for char in INVISIBLE_CHARACTERS}
)
|
||
# Whitespace variants that are each replaced by a plain space (" ") when a
# text is translated with ``SPECIAL_WHITESPACES_TRANS``.
SPECIAL_WHITESPACES: Final[Tuple[str, ...]] = (
    # unicode block "General Punctuation": https://www.compart.com/en/unicode/block/U+2000
    "\u2000",  # En Quad
    "\u2001",  # Em Quad
    "\u2002",  # En Space
    "\u2003",  # Em Space
    "\u2004",  # Three-Per-Em Space
    "\u2005",  # Four-Per-Em Space
    "\u2006",  # Six-Per-Em Space
    "\u2007",  # Figure Space https://www.compart.com/en/unicode/U+2007
    "\u2008",  # Punctuation Space
    "\u2009",  # Thin Space https://www.compart.com/en/unicode/U+2009
    "\u200a",  # Hair Space https://www.compart.com/en/unicode/U+200A
    "\u202f",  # Narrow No-Break Space (NNBSP) https://www.compart.com/en/unicode/U+202f
    # other unicode blocks
    "\u00a0",  # No-Break Space (NBSP) https://www.compart.com/en/unicode/U+00a0
)

# Translation table for ``str.translate``: every special whitespace code point
# maps to a single ordinary space.
SPECIAL_WHITESPACES_TRANS: Final[Dict[int, str]] = str.maketrans(
    {char: " " for char in SPECIAL_WHITESPACES}
)
|
||
|
||
def remove_invisible_characters(text: str) -> str:
    """Remove invisible characters from text.

    The invisible characters are defined in the constant `INVISIBLE_CHARACTERS`.

    Args:
        text: The text from which the invisible characters are to be removed.

    Returns:
        The cleaned text.
    """
    # The table maps each invisible code point to None, so translate() drops it.
    return text.translate(INVISIBLE_CHARACTERS_TRANS)
|
||
|
||
def replace_special_whitespaces(text: str) -> str:
    """Replace special whitespaces with normal whitespaces.

    The special whitespaces are defined in the constant `SPECIAL_WHITESPACES`.

    Args:
        text: The text from which the special whitespaces are to be replaced.

    Returns:
        The cleaned text.
    """
    # The table maps each special whitespace code point to a plain " ".
    return text.translate(SPECIAL_WHITESPACES_TRANS)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# Copyright (c) 2023 Philip May | ||
# This software is distributed under the terms of the MIT license | ||
# which is available at https://opensource.org/licenses/MIT | ||
|
||
import pytest | ||
|
||
from mltb2.text import ( | ||
INVISIBLE_CHARACTERS, | ||
SPECIAL_WHITESPACES, | ||
remove_invisible_characters, | ||
replace_special_whitespaces, | ||
) | ||
|
||
|
||
def test_remove_invisible_characters():
    """A zero width space and a soft hyphen inside a text are both removed."""
    text = "Hello\u200bWorld\u00ad!"
    result = remove_invisible_characters(text)
    assert result == "HelloWorld!"
|
||
|
||
def test_remove_invisible_characters_empty():
    """An empty text stays empty."""
    text = ""
    result = remove_invisible_characters(text)
    assert result == ""
|
||
|
||
@pytest.mark.parametrize("char", INVISIBLE_CHARACTERS)
def test_remove_invisible_characters_single_char(char: str):
    """Each entry of `INVISIBLE_CHARACTERS` is removed when it sits between two visible characters."""
    text = f">{char}<"
    result = remove_invisible_characters(text)
    assert result == "><"
|
||
|
||
def test_replace_special_whitespaces():
    """Several different special whitespaces in one text each become a plain space."""
    text = "a\u00a0b\u2009c\u202fd\u2007e\u200af"
    result = replace_special_whitespaces(text)
    assert result == "a b c d e f"
|
||
|
||
def test_replace_special_whitespaces_empty():
    """An empty text stays empty."""
    text = ""
    result = replace_special_whitespaces(text)
    assert result == ""
|
||
|
||
@pytest.mark.parametrize("char", SPECIAL_WHITESPACES)
def test_replace_special_whitespaces_single_char(char: str):
    """Each entry of `SPECIAL_WHITESPACES` is replaced by exactly one plain space."""
    text = f">{char}<"
    result = replace_special_whitespaces(text)
    assert result == "> <"