Skip to content

Commit

Permalink
improvement in invalid cluster detection
Browse files Browse the repository at this point in the history
  • Loading branch information
amankhandelia committed Feb 9, 2023
1 parent 42cac83 commit 08af682
Showing 1 changed file with 40 additions and 1 deletion.
41 changes: 40 additions & 1 deletion src/histr.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import unicodedata


"Class to split devnagri text in shabdansh"
# for more details refer to https://www.unicode.org/versions/Unicode11.0.0/ch12.pdf


class Shabdansh(str):
VIRAMA = "\N{DEVANAGARI SIGN VIRAMA}"
HALLANK = "्"
HALANT = "्"
LEFT_MATRA = ["ि"]
RIGHT_MATRA = ["ॉ", "ो", "ौ", "ा", "ी", "ः"]
TOP_MATRA = ["ँ", "ं", "ॅ", "े", "ै"]
BOTTOM_MATRA = ["़", "ु", "ू", "ृ"]
MATRA = LEFT_MATRA + RIGHT_MATRA + TOP_MATRA + BOTTOM_MATRA

def __init__(self, devnagari_text: str):
self.str = devnagari_text
Expand Down Expand Up @@ -42,7 +45,29 @@ def splitclusters(devnagari_text: str):

@staticmethod
def is_valid_cluster(cluster: str) -> bool:
"""Identify if a grapheme cluster is valid or not
Following are the rules:
1. Cluster cannot contain both left and right matra
2. Cluster cannot have multiple matra from in the same position, i.e. there
can't be more than >1 matra in top, bottom, left, right position
3. Cluster can contain top and bottom matra at the same time
4. There has to be consonant, i.e. unicode category Mn after a halant
5. Cluster cannot contain bottom matra and halant at the same time
Parameters
----------
cluster : str
grapheme cluster for evaluation
Returns
-------
bool
whether the grapheme cluster is valid or not
"""

left_matra_count, right_matra_count, top_matra_count, bottom_matra_count = 0, 0, 0, 0
halant_count = 0
for char in cluster:
if char in Shabdansh.LEFT_MATRA:
left_matra_count += 1
Expand All @@ -52,8 +77,16 @@ def is_valid_cluster(cluster: str) -> bool:
top_matra_count += 1
elif char in Shabdansh.BOTTOM_MATRA:
bottom_matra_count += 1
elif char == Shabdansh.HALANT:
halant_count += 1

# invalidated because of invalid combination
if halant_count and bottom_matra_count:
return False
if left_matra_count and right_matra_count:
return False

# invalidated because of more than one matra from the same category
if left_matra_count > 1:
return False
if right_matra_count > 1:
Expand All @@ -62,9 +95,15 @@ def is_valid_cluster(cluster: str) -> bool:
return False
if bottom_matra_count > 1:
return False
if halant_count > 1:
return False

return True

@staticmethod
def contains_hanging_matras(cluster: str) -> bool:
return False

def __getitem__(self, index):
return self.str_ls[index]

Expand Down

0 comments on commit 08af682

Please sign in to comment.