Commit 2898775
slightly enhanced characters function to be more consistent with __call__
xrotwang committed Jun 22, 2018
1 parent dd0523a commit 2898775
Showing 2 changed files with 5 additions and 3 deletions.
4 changes: 2 additions & 2 deletions src/segments/tokenizer.py
@@ -251,7 +251,7 @@ def pp(word):

         return separator.join(pp(word) for word in res)
 
-    def characters(self, string):
+    def characters(self, string, segment_separator=' ', separator=' # ',):
         """
         Given a string as input, return a space-delimited string of Unicode characters
         (code points rendered as glyphs).

@@ -270,7 +270,7 @@ def characters(self, string):
         Input is first normalized according to Normalization Form D(ecomposition).
         String returned contains "#" to mark word boundaries.
         """
-        return ' # '.join(' '.join(word) for word in nfd(string).split())
+        return separator.join(segment_separator.join(word) for word in nfd(string).split())
 
     def grapheme_clusters(self, word):
         """
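To make the change concrete, here is a minimal usage sketch of the new keyword arguments (not part of the commit); it assumes this version of the package is installed and that Tokenizer is importable as in the tests below:

# Minimal sketch, assuming Tokenizer() works without a profile,
# as in tests/test_tokenizer.py.
from segments import Tokenizer

t = Tokenizer()

# Defaults are unchanged: segments joined by ' ', words joined by ' # '.
assert t.characters('abc def') == 'a b c # d e f'

# The new keyword arguments make both delimiters configurable.
assert t.characters('abc def', segment_separator='_', separator='|') == 'a_b_c|d_e_f'

Making the delimiters keyword arguments with the old values as defaults keeps existing output identical while, per the commit message, aligning characters with the separator parameters of __call__.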
4 changes: 3 additions & 1 deletion tests/test_tokenizer.py
@@ -36,7 +36,9 @@ def test_jipa(lang):


 def test_characters():
-    assert Tokenizer().characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p") == "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p"
+    t = Tokenizer()
+    assert t.characters("ĉháɾã̌ctʼɛ↗ʐː| k͡p") == "c ̂ h a ́ ɾ a ̃ ̌ c t ʼ ɛ ↗ ʐ ː | # k ͡ p"
+    assert t.characters('abc def', segment_separator='_', separator='|') == 'a_b_c|d_e_f'
 
 
 def test_missing_header():
