Skip to content

Commit

Permalink
Merge pull request #510 from jaydom28/jdom/plugin/thai
Browse files Browse the repository at this point in the history
Jdom/plugin/thai
  • Loading branch information
jzohrab authored Nov 2, 2024
2 parents 21e3520 + 52c681e commit 0d06197
Show file tree
Hide file tree
Showing 10 changed files with 264 additions and 0 deletions.
5 changes: 5 additions & 0 deletions plugins/lute-thai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
The Lute Thai parser.

See [the wiki](https://github.com/LuteOrg/lute-v3/wiki/Developing-language-parser-plugins) for development notes.

See the [PyPI README](./README_PyPi.md) for extra config notes.
12 changes: 12 additions & 0 deletions plugins/lute-thai/README_PyPi.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# `lute3-thai`

A Thai parser for Lute (`lute3`) using the `pythainlp` library.

## Installation

See the [Lute manual](https://luteorg.github.io/lute-manual/install/plugins.html).

## Usage

When this parser is installed, you can add "Thai" as a
language in Lute; the Thai language definition comes with a simple story.
22 changes: 22 additions & 0 deletions plugins/lute-thai/definition.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
name: Thai
dictionaries:
- for: terms
type: embedded
url: https://dict.com/thai-english/###
- for: terms
type: embedded
url: https://en.wiktionary.org/wiki/###
- for: terms
type: popup
url: https://glosbe.com/th/en/###
- for: sentences
type: popup
url: https://www.bing.com/translator/?from=th&to=en&text=###
show_romanization: true
# right_to_left:

parser_type: lute_thai
# character_substitutions:
split_sentences: ฯ!?
# split_sentence_exceptions:
word_chars: ก-๛
5 changes: 5 additions & 0 deletions plugins/lute-thai/lute_thai_parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""
Lute Thai Parser.

Package for the Thai parser plugin for Lute; the parser class itself
lives in the ``parser`` module of this package.
"""

# NOTE(review): pyproject.toml declares the version as "dynamic", so the
# build backend presumably reads it from this attribute -- keep it here.
__version__ = "0.0.3"
69 changes: 69 additions & 0 deletions plugins/lute-thai/lute_thai_parser/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""
Parsing using pythainlp
Includes classes:
- ThaiParser
"""

import re
import os
import pythainlp

from typing import List

from lute.parse.base import ParsedToken, AbstractParser


class ThaiParser(AbstractParser):
    """
    A parser for Thai that uses the pythainlp library for text segmentation.

    This parser does not use a data directory (see uses_data_directory),
    so it has no user-editable data files.
    """

    @classmethod
    def name(cls):
        "Display name of this parser."
        return "Lute Thai"

    @classmethod
    def uses_data_directory(cls):
        """
        Whether this parser uses the data_directory (defined in the
        AbstractParser).  It does not.
        """
        return False

    def get_parsed_tokens(self, text: str, language) -> List[ParsedToken]:
        """
        Returns ParsedToken array for given language.

        The text is segmented with pythainlp.word_tokenize, and each
        segment becomes one ParsedToken:

        - a newline (or an existing "¶") becomes the "¶" token, which is
          not a word and ends the sentence;
        - a segment made up only of the language's sentence-splitting
          characters is not a word and ends the sentence;
        - any other segment is a word if its first character matches the
          language's word characters.
        """
        text = text.replace("\r\n", "\n")

        # Loop invariants: build these once, not once per token.
        word_start = re.compile(f"[{language.word_characters}]")
        split_chars = language.regexp_split_sentences

        tokens = []
        for word in pythainlp.word_tokenize(text):
            if word in ("\n", "¶"):
                # Paragraph breaks are stored as the pilcrow for rendering.
                tokens.append(ParsedToken("¶", False, True))
                continue

            # Check every character rather than using a substring test
            # ("word in split_chars"): the substring test treats an empty
            # token as end-of-sentence, and misses punctuation runs such
            # as "!!" or "?!" that are not a contiguous slice of the
            # configured characters.
            is_end_of_sentence = bool(word) and all(
                ch in split_chars for ch in word
            )
            is_word_char = (
                not is_end_of_sentence and word_start.match(word) is not None
            )
            tokens.append(ParsedToken(word, is_word_char, is_end_of_sentence))
        return tokens

    def get_reading(self, text: str):  # pylint: disable=unused-argument
        """
        Get the pronunciation for the given text.  For most
        languages, this can't be automated.

        This parser does not generate readings and always returns None.
        """
        return None
24 changes: 24 additions & 0 deletions plugins/lute-thai/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
[build-system]
requires = ["flit_core >=3.2,<4"]
build-backend = "flit_core.buildapi"

[tool.flit.module]
name = "lute_thai_parser"

[project]
name = "lute3-thai"
dynamic = ['version']
description = "Learning Using Texts - Thai Parser"
requires-python = ">=3.8"
authors = [
{name = "Justin Dom"}
]
readme = "README_PyPi.md"

dependencies = [
"lute3>=3.4.2",
"pythainlp==5.0.4"
]

[project.entry-points."lute.plugin.parse"]
lute_thai = "lute_thai_parser.parser:ThaiParser"
5 changes: 5 additions & 0 deletions plugins/lute-thai/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Required dependency for base classes.
lute3>=3.4.2

# extra requirements here.
pythainlp==5.0.4
Empty file.
36 changes: 36 additions & 0 deletions plugins/lute-thai/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
Common fixtures used by many tests.
"""

import os
import yaml
import pytest


from lute.parse.registry import init_parser_plugins

from lute.models.language import Language


def pytest_sessionstart(session):  # pylint: disable=unused-argument
    """
    Pytest session-start hook: set up the parser plugin list once,
    before any test runs.
    """
    init_parser_plugins()


def _get_test_language():
    """
    Build the Language used for testing this plugin from the
    definition.yaml file in the directory above this one.
    """
    tests_dir = os.path.dirname(os.path.realpath(__file__))
    yaml_path = os.path.join(tests_dir, "..", "definition.yaml")
    with open(yaml_path, "r", encoding="utf-8") as handle:
        definition = yaml.safe_load(handle)
    return Language.from_dict(definition)


@pytest.fixture(name="thai")
def fixture_thai():
    "The Thai test language, exposed to tests as the ``thai`` fixture."
    language = _get_test_language()
    return language
86 changes: 86 additions & 0 deletions plugins/lute-thai/tests/test_ThaiParser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""
ThaiParser tests.
"""


import pytest

# pylint: disable=wrong-import-order
from lute.models.term import Term
from lute.parse.base import ParsedToken

from lute_thai_parser.parser import ThaiParser


def test_token_count(thai):
    """
    A Term's token_count matches the number of segments in its text,
    and its lowercase text is the same as its text.
    """
    expected_counts = {
        "สวัสดี": 1,
        "ลาก่อน": 1,
        "ฉันรักคุณ": 3,
        "ฉันกำลังเรียนภาษาไทย": 4,
    }
    for text, expected_count in expected_counts.items():
        term = Term(thai, text)
        assert term.token_count == expected_count, text
        assert term.text_lc == term.text, "case"


def assert_tokens_equals(text, lang, expected):
    """
    Assert that parsing text with lang yields the expected tokens.

    expected is a list of ParsedToken constructor arguments:
    [ original_text, is_word, is_end_of_sentence ]
    Tokens are compared by their string form.
    """
    parser = ThaiParser()
    actual_strings = [str(tok) for tok in parser.get_parsed_tokens(text, lang)]
    expected_strings = [str(ParsedToken(*args)) for args in expected]
    assert actual_strings == expected_strings


def test_end_of_sentence_stored_in_parsed_tokens(thai):
    """
    Sentence-ending tokens ("!" and "ฯ") carry EOS=True; all other
    tokens do not.
    """
    text = "สวัสดีทุกคน! ฉันเรียนภาษาไทยมา2เดือนแล้วฯ"

    first_sentence = [
        ("สวัสดี", True),
        ("ทุกคน", True),
        ("!", False, True),
    ]
    second_sentence = [
        (" ", False),
        ("ฉัน", True),
        ("เรียน", True),
        ("ภาษาไทย", True),
        ("มา", True),
        ("2", False),
        ("เดือน", True),
        ("แล้ว", True, False),
        ("ฯ", False, True),
    ]
    assert_tokens_equals(text, thai, first_sentence + second_sentence)


def test_carriage_returns_treated_as_reverse_p_character(thai):
    """
    A newline in the text is emitted as the "¶" token (not a word,
    end of sentence) so it can be rendered.
    """
    text = "สวัสดีทุกคน!\nฉันเรียนภาษาไทยมา2เดือนแล้ว"

    before_break = [
        ("สวัสดี", True),
        ("ทุกคน", True),
        ("!", False, True),
    ]
    line_break = [("¶", False, True)]
    after_break = [
        ("ฉัน", True),
        ("เรียน", True),
        ("ภาษาไทย", True),
        ("มา", True),
        ("2", False),
        ("เดือน", True),
        ("แล้ว", True, False),
    ]
    assert_tokens_equals(text, thai, before_break + line_break + after_break)

0 comments on commit 0d06197

Please sign in to comment.