Skip to content

Commit

Permalink
Add text clean tool. (#105)
Browse files Browse the repository at this point in the history
* add remove_invisible_chars

* add sphinx doc for text module

* improve docstring

* add replace_special_whitespaces

* extend SPECIAL_WHITESPACES

* extend SPECIAL_WHITESPACES

* fix type

* add tests
  • Loading branch information
PhilipMay authored Dec 9, 2023
1 parent 41ae121 commit 8c09329
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 0 deletions.
6 changes: 6 additions & 0 deletions docs/source/api-reference/text.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.. _text_code_doc:

:mod:`~mltb2.text`
==================

.. automodule:: mltb2.text
65 changes: 65 additions & 0 deletions mltb2/text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Copyright (c) 2023 Philip May
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

"""Text specific functionality."""

from typing import Dict, Final, Tuple

# Characters that this module treats as invisible.
INVISIBLE_CHARACTERS: Final[Tuple[str, ...]] = (
    # https://www.compart.com/en/unicode/U+200b
    "\u200b",  # Zero Width Space (ZWSP)
    # https://www.compart.com/en/unicode/U+00ad
    "\u00ad",  # Soft Hyphen (SHY)
    # TODO: decide whether these should be listed as well:
    #   https://www.compart.com/en/unicode/U+2028
    #   https://www.compart.com/en/unicode/U+2029
)

# Table for ``str.translate``: every invisible character maps to ``None``,
# which makes ``translate`` drop it from the text.
INVISIBLE_CHARACTERS_TRANS: Final[Dict[int, None]] = str.maketrans(dict.fromkeys(INVISIBLE_CHARACTERS))

# Whitespace variants that get normalised to a plain space (" ").
SPECIAL_WHITESPACES: Final[Tuple[str, ...]] = (
    # --- unicode block "General Punctuation" ---
    # https://www.compart.com/en/unicode/block/U+2000
    "\u2000",  # En Quad
    "\u2001",  # Em Quad
    "\u2002",  # En Space
    "\u2003",  # Em Space
    "\u2004",  # Three-Per-Em Space
    "\u2005",  # Four-Per-Em Space
    "\u2006",  # Six-Per-Em Space
    "\u2007",  # Figure Space - https://www.compart.com/en/unicode/U+2007
    "\u2008",  # Punctuation Space
    "\u2009",  # Thin Space - https://www.compart.com/en/unicode/U+2009
    "\u200a",  # Hair Space - https://www.compart.com/en/unicode/U+200A
    "\u202f",  # Narrow No-Break Space (NNBSP) - https://www.compart.com/en/unicode/U+202f
    # --- other unicode blocks ---
    "\u00a0",  # No-Break Space (NBSP) - https://www.compart.com/en/unicode/U+00a0
)

# Table for ``str.translate``: every special whitespace maps to a single " ".
SPECIAL_WHITESPACES_TRANS: Final[Dict[int, str]] = str.maketrans(dict.fromkeys(SPECIAL_WHITESPACES, " "))


def remove_invisible_characters(text: str) -> str:
    """Strip all invisible characters out of a text.

    Which characters count as invisible is determined by the constant
    `INVISIBLE_CHARACTERS`.

    Args:
        text: The text that should be freed from invisible characters.

    Returns:
        The text without the invisible characters.
    """
    cleaned_text = text.translate(INVISIBLE_CHARACTERS_TRANS)
    return cleaned_text


def replace_special_whitespaces(text: str) -> str:
    """Turn every special whitespace of a text into a normal whitespace.

    Which characters count as special whitespace is determined by the
    constant `SPECIAL_WHITESPACES`.

    Args:
        text: The text whose special whitespaces should be replaced.

    Returns:
        The text with the special whitespaces replaced.
    """
    normalized_text = text.translate(SPECIAL_WHITESPACES_TRANS)
    return normalized_text
50 changes: 50 additions & 0 deletions tests/test_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Copyright (c) 2023 Philip May
# This software is distributed under the terms of the MIT license
# which is available at https://opensource.org/licenses/MIT

import pytest

from mltb2.text import (
INVISIBLE_CHARACTERS,
SPECIAL_WHITESPACES,
remove_invisible_characters,
replace_special_whitespaces,
)


def test_remove_invisible_characters():
    """A zero width space and a soft hyphen are both stripped from the text."""
    cleaned = remove_invisible_characters("Hello\u200bWorld\u00ad!")
    assert cleaned == "HelloWorld!"


def test_remove_invisible_characters_empty():
    """An empty text stays empty."""
    assert remove_invisible_characters("") == ""


@pytest.mark.parametrize("char", INVISIBLE_CHARACTERS)
def test_remove_invisible_characters_single_char(char: str):
    """Each entry of INVISIBLE_CHARACTERS vanishes from between two markers."""
    wrapped = f">{char}<"
    assert remove_invisible_characters(wrapped) == "><"


def test_replace_special_whitespaces():
    """Several different special whitespaces each become one normal space."""
    normalized = replace_special_whitespaces("a\u00a0b\u2009c\u202fd\u2007e\u200af")
    assert normalized == "a b c d e f"


def test_replace_special_whitespaces_empty():
    """An empty text stays empty."""
    assert replace_special_whitespaces("") == ""


@pytest.mark.parametrize("char", SPECIAL_WHITESPACES)
def test_replace_special_whitespaces_single_char(char: str):
    """Each entry of SPECIAL_WHITESPACES turns into a single normal space."""
    wrapped = f">{char}<"
    assert replace_special_whitespaces(wrapped) == "> <"

0 comments on commit 8c09329

Please sign in to comment.