-
Notifications
You must be signed in to change notification settings - Fork 50
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #510 from jaydom28/jdom/plugin/thai
Jdom/plugin/thai
- Loading branch information
Showing
10 changed files
with
264 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
The Lute Thai parser. | ||
|
||
See [the wiki](https://github.com/LuteOrg/lute-v3/wiki/Developing-language-parser-plugins) for development notes. | ||
|
||
See the [PyPI readme](./README_PyPi.md) for extra config notes. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# `lute3-thai` | ||
|
||
A Thai parser for Lute (`lute3`) using the `pythainlp` library. | ||
|
||
## Installation | ||
|
||
See the [Lute manual](https://luteorg.github.io/lute-manual/install/plugins.html). | ||
|
||
## Usage | ||
|
||
When this parser is installed, you can add "Thai" as a | ||
language to Lute, which comes with a simple story. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
name: Thai | ||
dictionaries: | ||
- for: terms | ||
type: embedded | ||
url: https://dict.com/thai-english/### | ||
- for: terms | ||
type: embedded | ||
url: https://en.wiktionary.org/wiki/### | ||
- for: terms | ||
type: popup | ||
url: https://glosbe.com/th/en/### | ||
- for: sentences | ||
type: popup | ||
url: https://www.bing.com/translator/?from=th&to=en&text=### | ||
show_romanization: true | ||
# right_to_left: | ||
|
||
parser_type: lute_thai | ||
# character_substitutions: | ||
split_sentences: ฯ!? | ||
# split_sentence_exceptions: | ||
word_chars: ก-๛ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
""" | ||
Lute Thai Parser | ||
""" | ||
|
||
__version__ = "0.0.3" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
""" | ||
Parsing using pythainlp | ||
Includes classes: | ||
- ThaiParser | ||
""" | ||
|
||
import re | ||
import os | ||
import pythainlp | ||
|
||
from typing import List | ||
|
||
from lute.parse.base import ParsedToken, AbstractParser | ||
|
||
|
||
class ThaiParser(AbstractParser):
    """
    A parser for Thai that uses the pythainlp library for text segmentation.

    NOTE(review): this docstring previously said users could add
    exceptions to a "parsing_exceptions.txt" data file, but
    uses_data_directory() returns False and no data-directory setup is
    implemented in this class -- confirm whether that feature is planned.
    """

    @classmethod
    def name(cls):
        "Return the display name of this parser."
        return "Lute Thai"

    @classmethod
    def uses_data_directory(cls):
        "Uses the data_directory (defined in the AbstractParser)."
        return False

    # @classmethod
    # def init_data_directory(cls):
    #     "Set up necessary files."
    #     pass

    def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
        """
        Segment text with pythainlp and return one ParsedToken per segment.

        text: the raw text; CRLF line endings are normalised to LF first.
        language: object providing ``word_characters`` (placed inside a
            regex character class) and ``regexp_split_sentences`` (tested
            with ``in`` against each token).

        Returns a list of ParsedToken(token, is_word, is_end_of_sentence).
        A newline token is replaced with "¶" and is always a non-word,
        end-of-sentence token.
        """
        text = text.replace("\r\n", "\n")

        # Compile once: the character class is the same for every token.
        word_start = re.compile(f"[{language.word_characters}]")
        sentence_enders = language.regexp_split_sentences

        tokens = []
        for word in pythainlp.word_tokenize(text):
            if word == "\n":
                word = "¶"
            if word == "¶":
                # Paragraph marks never count as words and always end
                # the sentence.
                is_word_char = False
                is_end_of_sentence = True
            else:
                is_end_of_sentence = word in sentence_enders
                # A sentence terminator is never a word, even if it falls
                # inside the word-character range.
                is_word_char = (
                    not is_end_of_sentence and word_start.match(word) is not None
                )
            tokens.append(ParsedToken(word, is_word_char, is_end_of_sentence))
        return tokens

    def get_reading(self, text: str):  # pylint: disable=unused-argument
        """
        Get the pronunciation for the given text.  For most
        languages, this can't be automated.

        Always returns None for this parser.
        """
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
[build-system] | ||
requires = ["flit_core >=3.2,<4"] | ||
build-backend = "flit_core.buildapi" | ||
|
||
[tool.flit.module] | ||
name = "lute_thai_parser" | ||
|
||
[project] | ||
name = "lute3-thai" | ||
dynamic = ['version'] | ||
description = "Learning Using Texts - Thai Parser" | ||
requires-python = ">=3.8" | ||
authors = [ | ||
{name = "Justin Dom"} | ||
] | ||
readme = "README_PyPi.md" | ||
|
||
dependencies = [ | ||
"lute3>=3.4.2", | ||
"pythainlp==5.0.4" | ||
] | ||
|
||
[project.entry-points."lute.plugin.parse"] | ||
lute_thai = "lute_thai_parser.parser:ThaiParser" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Required dependency for base classes. | ||
lute3>=3.4.2 | ||
|
||
# extra requirements here. | ||
pythainlp==5.0.4 |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
""" | ||
Common fixtures used by many tests. | ||
""" | ||
|
||
import os | ||
import yaml | ||
import pytest | ||
|
||
|
||
from lute.parse.registry import init_parser_plugins | ||
|
||
from lute.models.language import Language | ||
|
||
|
||
def pytest_sessionstart(session):  # pylint: disable=unused-argument
    """
    Session-start hook: populate the parser list via init_parser_plugins().

    The session argument is not used.
    """
    init_parser_plugins()
|
||
|
||
def _get_test_language():
    """
    Build the Language used to test this plugin.

    Reads definition.yaml from the directory above this file and turns
    the parsed mapping into a Language via Language.from_dict.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    yaml_path = os.path.join(here, os.pardir, "definition.yaml")
    with open(yaml_path, "r", encoding="utf-8") as handle:
        definition = yaml.safe_load(handle)
    return Language.from_dict(definition)
|
||
|
||
@pytest.fixture(name="thai")
def fixture_thai():
    """
    Fixture "thai": the language loaded by _get_test_language().
    """
    language = _get_test_language()
    return language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
""" | ||
ThaiParser tests. | ||
""" | ||
|
||
|
||
import pytest | ||
|
||
# pylint: disable=wrong-import-order | ||
from lute.models.term import Term | ||
from lute.parse.base import ParsedToken | ||
|
||
from lute_thai_parser.parser import ThaiParser | ||
|
||
|
||
def test_token_count(thai):
    """
    Terms built from sample Thai strings report the expected token_count,
    and their text_lc equals their text.
    """
    samples = [
        ("สวัสดี", 1),
        ("ลาก่อน", 1),
        ("ฉันรักคุณ", 3),
        ("ฉันกำลังเรียนภาษาไทย", 4),
    ]
    for sample, want in samples:
        term = Term(thai, sample)
        assert term.token_count == want, sample
        assert term.text_lc == term.text, "case"
|
||
|
||
def assert_tokens_equals(text, lang, expected):
    """
    Assert that ThaiParser turns text into the expected tokens.

    expected is a list of argument tuples for ParsedToken:
    [ original_text, is_word, is_end_of_sentence ]
    Actual and expected tokens are compared by their str() form.
    """
    parsed = ThaiParser().get_parsed_tokens(text, lang)
    want = [str(ParsedToken(*fields)) for fields in expected]
    got = [str(token) for token in parsed]
    assert got == want
|
||
|
||
def test_end_of_sentence_stored_in_parsed_tokens(thai):
    """
    Sentence-ending tokens ("!" and "ฯ" here) carry EOS=True in the
    parsed output.
    """
    sentence = "สวัสดีทุกคน! ฉันเรียนภาษาไทยมา2เดือนแล้วฯ"
    want = [
        ("สวัสดี", True),
        ("ทุกคน", True),
        ("!", False, True),
        (" ", False),
        ("ฉัน", True),
        ("เรียน", True),
        ("ภาษาไทย", True),
        ("มา", True),
        ("2", False),
        ("เดือน", True),
        ("แล้ว", True, False),
        ("ฯ", False, True),
    ]
    assert_tokens_equals(sentence, thai, want)
|
||
|
||
def test_carriage_returns_treated_as_reverse_p_character(thai):
    """
    A newline in the input comes back as a "¶" token that is a non-word
    and ends the sentence.
    """
    two_lines = "สวัสดีทุกคน!\nฉันเรียนภาษาไทยมา2เดือนแล้ว"
    want = [
        ("สวัสดี", True),
        ("ทุกคน", True),
        ("!", False, True),
        ("¶", False, True),
        ("ฉัน", True),
        ("เรียน", True),
        ("ภาษาไทย", True),
        ("มา", True),
        ("2", False),
        ("เดือน", True),
        ("แล้ว", True, False),
    ]
    assert_tokens_equals(two_lines, thai, want)