diff --git a/src/segments/tokenizer.py b/src/segments/tokenizer.py
index e72c751..2a1a3fc 100644
--- a/src/segments/tokenizer.py
+++ b/src/segments/tokenizer.py
@@ -251,7 +251,7 @@ def pp(word):
         return separator.join(pp(word) for word in res)
 
-    def characters(self, string):
+    def characters(self, string, segment_separator=' ', separator=' # '):
         """
         Given a string as input, return a space-delimited string of Unicode characters
         (code points rendered as glyphs).
@@ -270,7 +270,7 @@ def characters(self, string):
         Input is first normalized according to Normalization Form D(ecomposition).
         String returned contains "#" to mark word boundaries.
         """
-        return ' # '.join(' '.join(word) for word in nfd(string).split())
+        return separator.join(segment_separator.join(word) for word in nfd(string).split())
 
     def grapheme_clusters(self, word):
         """
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index 45d4cfe..845b73a 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -36,7 +36,9 @@ def test_jipa(lang):
 
 
 def test_characters():
-    assert Tokenizer().characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p") == "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p"
+    t = Tokenizer()
+    assert t.characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p") == "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p"
+    assert t.characters('abc def', segment_separator='_', separator='|') == 'a_b_c|d_e_f'
 
 
 def test_missing_header():
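
Not part of the diff itself: a minimal usage sketch of the new keyword arguments, assuming the package is installed and that Tokenizer is importable from segments.tokenizer (the import path is inferred from src/segments/tokenizer.py above).

    from segments.tokenizer import Tokenizer

    t = Tokenizer()

    # Defaults are unchanged: segments are space-separated and word
    # boundaries are marked with ' # '.
    assert t.characters('abc def') == 'a b c # d e f'

    # The new keyword arguments override both delimiters, matching the
    # added test case.
    assert t.characters('abc def', segment_separator='_', separator='|') == 'a_b_c|d_e_f'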