Skip to content

Commit

Permalink
update invalid cluster detection capability
Browse files Browse the repository at this point in the history
  • Loading branch information
amankhandelia committed Feb 10, 2023
1 parent 08af682 commit 510d172
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 21 deletions.
56 changes: 49 additions & 7 deletions src/histr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

"Class to split Devanagari text into shabdansh"
# for more details refer to https://www.unicode.org/versions/Unicode11.0.0/ch12.pdf
# and https://learn.microsoft.com/en-us/typography/script-development/devanagari


class Shabdansh(str):
Expand All @@ -13,6 +14,29 @@ class Shabdansh(str):
TOP_MATRA = ["ँ", "ं", "ॅ", "े", "ै"]
BOTTOM_MATRA = ["़", "ु", "ू", "ृ"]
MATRA = LEFT_MATRA + RIGHT_MATRA + TOP_MATRA + BOTTOM_MATRA
INVALID_COMBO_RIGHT_TOP = [
("ॉ", "ँ"),
("ॉ", "ं"),
("ॉ", "ॅ"),
("ॉ", "े"),
("ॉ", "ै"),
("ो", "ँ"),
("ो", "ॅ"),
("ो", "े"),
("ो", "ै"),
("ौ", "ँ"),
("ौ", "ॅ"),
("ौ", "े"),
("ौ", "ै"),
("ा", "े"),
("ा", "ै"),
("ी", "ँ"),
("ी", "ॅ"),
("ी", "े"),
("ी", "ै"),
]
INVALID_COMBOS = [("ं", "़"), ("ा", "ू"), ("ै", "ि")] + INVALID_COMBO_RIGHT_TOP
HALANT_THRESHOLD_VALUE = 3

def __init__(self, devnagari_text: str):
self.str = devnagari_text
Expand Down Expand Up @@ -53,7 +77,10 @@ def is_valid_cluster(cluster: str) -> bool:
can't be more than >1 matra in top, bottom, left, right position
3. Cluster can contain top and bottom matra at the same time
4. There has to be a consonant, i.e. a code point of Unicode category Lo (Letter), after a halant
5. Cluster cannot contain bottom matra and halant at the same time
5. Cluster is invalid if a halant is immediately followed by a bottom matra
6. Number of halants must be at most N-1, where N is the number of consonants
7. If present, the anusvara has to be the last code point in the cluster
Parameters
----------
Expand All @@ -66,10 +93,23 @@ def is_valid_cluster(cluster: str) -> bool:
whether the grapheme cluster is valid or not
"""

# check if the cluster contains an invalid combination of matras
for matra_1, matra_2 in Shabdansh.INVALID_COMBOS:
if matra_1 in cluster and matra_2 in cluster:
return False

# check if the anuswara is last code point
if "ं" in cluster and cluster[-1] != "ं":
return False

# check for count based validation
left_matra_count, right_matra_count, top_matra_count, bottom_matra_count = 0, 0, 0, 0
halant_count = 0
for char in cluster:
if char in Shabdansh.LEFT_MATRA:
consonant_count = 0
for idx, char in enumerate(cluster):
if unicodedata.category(char)[0] == "L":
consonant_count += 1
elif char in Shabdansh.LEFT_MATRA:
left_matra_count += 1
elif char in Shabdansh.RIGHT_MATRA:
right_matra_count += 1
Expand All @@ -79,10 +119,11 @@ def is_valid_cluster(cluster: str) -> bool:
bottom_matra_count += 1
elif char == Shabdansh.HALANT:
halant_count += 1
# invalid if the preceding code point was not a consonant
if (idx > 0 and unicodedata.category(cluster[idx - 1])[0] != "L") or idx == 0:
return False

# invalidated because of invalid combination
if halant_count and bottom_matra_count:
return False
# invalidated because of invalid combination of matra categories
if left_matra_count and right_matra_count:
return False

Expand All @@ -95,7 +136,8 @@ def is_valid_cluster(cluster: str) -> bool:
return False
if bottom_matra_count > 1:
return False
if halant_count > 1:

if halant_count >= consonant_count or halant_count > Shabdansh.HALANT_THRESHOLD_VALUE:
return False

return True
Expand Down
50 changes: 36 additions & 14 deletions tests/test_hi_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,19 +29,41 @@ def test_cluster_splitting(id: int, text: str, splits: List[str]):
@pytest.mark.parametrize(
"id, cluster, expected_result",
[
(1, "प्ेा्र", True),
(2, "श्ा", True),
(3, "मूँ्््गा", True),
(4, "ग्रंा", True),
(5, "श्च्ा", True),
(6, "्ज्ञा", True),
(7, "्अं", True),
(8, "त्थ्ॉू", True),
(9, "र्ुि", True),
(10, "डं़ो", True),
(11, "च्द्धा", False),
(12, "स्दा", False),
# invalid cases
(1, "प्ेा्र", False),
(2, "श्ा", False),
(3, "मूँ्््गा", False),
(4, "ग्रंा", False),
(5, "श्च्ा", False),
(6, "्ज्ञा", False),
(7, "्अं", False),
(8, "त्थ्ॉू", False),
(9, "र्ुि", False),
(10, "डं़ो", False),
(11, "मू्गी", False),
(12, "तेीे", False),
(13, "त्त्", False),
(14, "ताू", False),
(15, "स्सोे", False),
(16, "न्द्र्र्र्र्र्र्र्र", False),
(17, "र्इ्श", False),
(18, "अँ्ग", False),
(19, "स्र्जा", False),
(20, "मूैि", False),
(21, "भैौ", False),
(22, "न्नँू", False),
# valid cases
(101, "च्द्धा", True),
(102, "स्दा", True),
(103, "न्द", True),
(104, "सिं", True),
(105, "स्दा", True),
(106, "मूगी", True),
(107, "ग्रां", True),
(108, "फ्पु", True),
(109, "ज्ज़", True),
(110, "स्कृ", True),
],
)
def test_contains_hanging_matra(id: int, cluster: str, expected_result: bool):
assert False
def test_is_valid_cluster(id: int, cluster: str, expected_result: bool):
assert Shabdansh.is_valid_cluster(cluster) == expected_result

0 comments on commit 510d172

Please sign in to comment.