Commit 2898775
slightly enhanced characters function to be more consistent with __call__
xrotwang committed Jun 22, 2018
1 parent dd0523a commit 2898775
Showing 2 changed files with 5 additions and 3 deletions.
4 changes: 2 additions & 2 deletions src/segments/tokenizer.py
@@ -251,7 +251,7 @@ def pp(word):

         return separator.join(pp(word) for word in res)
 
-    def characters(self, string):
+    def characters(self, string, segment_separator=' ', separator=' # ',):
         """
         Given a string as input, return a space-delimited string of Unicode characters
         (code points rendered as glyphs).

@@ -270,7 +270,7 @@ def characters(self, string):
         Input is first normalized according to Normalization Form D(ecomposition).
         String returned contains "#" to mark word boundaries.
         """
-        return ' # '.join(' '.join(word) for word in nfd(string).split())
+        return separator.join(segment_separator.join(word) for word in nfd(string).split())
 
     def grapheme_clusters(self, word):
         """
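To make the change concrete, here is a minimal usage sketch of the new keyword arguments (not part of the commit); it assumes this version of the package is installed and that Tokenizer is importable as in the tests below:

# Minimal sketch, assuming Tokenizer() works without a profile,
# as in tests/test_tokenizer.py.
from segments import Tokenizer

t = Tokenizer()

# Defaults are unchanged: segments joined by ' ', words joined by ' # '.
assert t.characters('abc def') == 'a b c # d e f'

# The new keyword arguments make both delimiters configurable.
assert t.characters('abc def', segment_separator='_', separator='|') == 'a_b_c|d_e_f'

Making the delimiters keyword arguments with the old values as defaults keeps existing output identical while, per the commit message, aligning characters with the separator parameters of __call__.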
4 changes: 3 additions & 1 deletion tests/test_tokenizer.py
@@ -36,7 +36,9 @@ def test_jipa(lang):


 def test_characters():
-    assert Tokenizer().characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p") == "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p"
+    t = Tokenizer()
+    assert t.characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p") == "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p"
+    assert t.characters('abc def', segment_separator='_', separator='|') == 'a_b_c|d_e_f'
 
 
 def test_missing_header():
