Skip to content

Commit

Permalink
Merge pull request #108 from allo-media/master
Browse files Browse the repository at this point in the history
Bug fix release.
  • Loading branch information
rtxm authored Mar 19, 2024
2 parents a24659d + c0d7bd3 commit 2cbdb65
Show file tree
Hide file tree
Showing 10 changed files with 85 additions and 21 deletions.
16 changes: 16 additions & 0 deletions .readthedocs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py

# Build documentation with MkDocs
#mkdocs:
# configuration: mkdocs.yml

build:
os: ubuntu-22.04
tools:
python: "3.8"

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import setup, find_packages


VERSION = "2.5.0"
VERSION = "2.5.1"


def readme():
Expand Down
23 changes: 23 additions & 0 deletions tests/test_text_to_num_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ def test_text2num_zeroes(self):
self.assertRaises(ValueError, text2num, "fünfzignullzwei", "de")
self.assertRaises(ValueError, text2num, "fünfzigdreinull", "de")

def test_text2num_hundred_addition(self):
self.assertRaises(ValueError, text2num, "achtundachtzig dreihundert", "de")
self.assertRaises(ValueError, text2num, "zwanzig dreihundert", "de")
self.assertRaises(ValueError, text2num, "zwei zwölfhundert", "de")

def test_alpha2digit_integers(self):
source = "fünfundzwanzig Kühe, zwölf Hühner und einhundertfünfundzwanzig kg Kartoffeln."
expected = "25 Kühe, 12 Hühner und 125 kg Kartoffeln."
Expand Down Expand Up @@ -297,3 +302,21 @@ def test_uppercase(self):
source = "FÜNFZEHN EINS ZEHN EINS"
expected = "15 1 10 1"
self.assertEqual(alpha2digit(source, "de"), expected)

def test_ordinals_false_positives(self):
source = "In zehnten Jahrzehnten. Und einmal mit den Vereinten."
expected = "In 10. Jahrzehnten. Und einmal mit den Vereinten."
self.assertEqual(alpha2digit(source, "de"), expected)

source = "Dies ist eine Liste oder die Einkaufsliste."
expected = source
self.assertEqual(alpha2digit(source, "de"), expected)

def test_hundred_addition(self):
source = "Zahlen wie vierzig fünfhundert Tausend zweiundzwanzig hundert sind gut."
expected = "Zahlen wie 40 500022 100 sind gut."
self.assertEqual(alpha2digit(source, "de"), expected)

source = "achtundachtzig sieben hundert, acht und achtzig siebenhundert, achtundachtzig sieben hundert, acht und achtzig sieben hundert"
expected = "88 700, 88 700, 88 700, 88 700"
self.assertEqual(alpha2digit(source, "de"), expected)
6 changes: 6 additions & 0 deletions tests/test_text_to_num_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def test_text2num(self):
self.assertEqual(text2num("one hundred fifteen", "en"), 115)
self.assertEqual(text2num("seventy-five thousands", "en"), 75000)
self.assertEqual(text2num("thousand nine hundred twenty", "en"), 1920)
self.assertEqual(text2num("one billion twenty-five millions", "en"), 1_025_000_000)

def test_text2num_centuries(self):
self.assertEqual(text2num("nineteen hundred seventy-three", "en"), 1973)
Expand Down Expand Up @@ -125,6 +126,7 @@ def test_alpha2digit_zero(self):
self.assertEqual(alpha2digit(source, "en"), expected)

self.assertEqual(alpha2digit("zero", "en"), "0")
self.assertEqual(alpha2digit("zero love", "en"), "0 love")

def test_alpha2digit_ordinals(self):
source = (
Expand Down Expand Up @@ -174,6 +176,10 @@ def test_one_as_noun_or_article(self):
self.assertEqual(alpha2digit(source, "en"), source)
source = "one cannot know"
self.assertEqual(alpha2digit(source, "en"), source)
# Following an ordinal
source = "the sixth one"
expected = "the 6th one"
self.assertEqual(alpha2digit(source, "en"), expected)
# End of segment
source = "No one. Another one. One one. Twenty one"
expected = "No one. Another one. 1 1. 21"
Expand Down
6 changes: 6 additions & 0 deletions tests/test_text_to_num_fr.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ def test_text2num(self):

self.assertEqual(text2num("quinze", "fr"), 15)
self.assertEqual(text2num("soixante quinze mille", "fr"), 75000)
self.assertEqual(text2num("un milliard vingt-cinq millions", "fr"), 1_025_000_000)

def test_text2num_variants(self):
self.assertEqual(text2num("quatre-vingt dix-huit", "fr"), 98)
Expand Down Expand Up @@ -142,6 +143,8 @@ def test_alpha2digit_zero(self):
# self.assertEqual(alpha2digit(source, "fr"), source)

self.assertEqual(alpha2digit("zéro", "fr"), "0")
self.assertEqual(alpha2digit("a a un trois sept trois trois sept cinq quatre zéro c c", "fr"), "a a 1 3 7 3 3 7 5 4 0 c c")
self.assertEqual(alpha2digit("sept un zéro", "fr"), "7 1 0")

def test_alpha2digit_ordinals(self):
source = (
Expand Down Expand Up @@ -200,3 +203,6 @@ def test_article(self):
def test_un_pronoun(self):
source = "Je n'en veux qu'un. J'annonce: le un"
self.assertEqual(alpha2digit(source, "fr"), source)

def test_alpha2digit_newline(self):
self.assertEqual(alpha2digit("dix + deux\n= douze", "fr"), "10 + 2\n= 12")
2 changes: 1 addition & 1 deletion text_to_num/lang/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def normalize(self, word: str) -> str:
return NotImplemented

def not_numeric_word(self, word: Optional[str]) -> bool:
return word is None or word != self.DECIMAL_SEP and word not in self.NUMBERS
return word is None or word != self.DECIMAL_SEP and word not in self.NUMBERS and word not in self.ZERO

def split_number_word(self, word: str) -> str: # maybe use: List[str]
"""In some languages numbers are written as one word, e.g. German
Expand Down
2 changes: 1 addition & 1 deletion text_to_num/lang/english.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ class English(Language):

AND_NUMS: Set[str] = set()
AND = "and"
NEVER_IF_ALONE = {"one"}
NEVER_IF_ALONE = {"one", "o"}

# Relaxed composed numbers (two-words only)
# start => (next, target)
Expand Down
2 changes: 1 addition & 1 deletion text_to_num/lang/german.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def split_number_word(self, word: str) -> str:
if not found:
# is (large) ordinal ending?
ord_match = None
if len(result) > 3 and text.startswith("ste"):
if not invalid_word and len(result) > 3 and text.startswith("ste"):
ord_match = re.search(self.LARGE_ORDINAL_SUFFIXES_GER, text)

if ord_match:
Expand Down
30 changes: 17 additions & 13 deletions text_to_num/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,14 @@ def is_coef_appliable(self, coef: int) -> bool:
# a multiplier can be applied to anything less than itself,
# as long as it is not zero (special case for 1000 which then implies 1)
return True
if coef * coef <= self.n000_val:
if coef * 1000 <= self.n000_val:
# a multiplier can not be applied to a value bigger than itself,
# so it must be applied to the current group only.
# ex. for "mille": "deux millions cent cinquante mille"
# ex. for "millions": "trois milliard deux cent millions"
# But not twice: "dix mille cinq mille" is invalid for example. Therefore,
# we test the square of ``coef``.
# we test 1000 × ``coef`` (as the multipliers above 100
# form a geometric progression of ratio 1000)
return (
self.grp_val > 0 or coef == 1000
) # "mille" without unit is additive
Expand Down Expand Up @@ -345,6 +346,8 @@ def parse(self, text: str) -> bool:
elif (ng[hundred_index - 1] in self.lang.UNITS) or (
ng[hundred_index - 1] in self.lang.STENS
):
if hundred_index - 2 >= 0 and ng[hundred_index - 2] not in self.lang.MULTIPLIERS:
raise ValueError("invalid {} without multiplier: {}".format(STATIC_HUNDRED, repr(ng)))
multiplier = German.NUMBER_DICT_GER[ng[hundred_index - 1]]
equation += "(" + str(multiplier) + " * 100)"
equation_results.append(multiplier * 100)
Expand Down Expand Up @@ -555,6 +558,7 @@ def __init__(
relaxed: bool = False,
signed: bool = True,
ordinal_threshold: int = 3,
preceding_word: Optional[str] = None
) -> None:
"""Initialize the parser.
Expand All @@ -574,7 +578,7 @@ def __init__(
self.in_frac = False
self.closed = False # For deferred stop
self.open = False # For efficiency
self.last_word: Optional[str] = None # For context
self.last_word: Optional[str] = preceding_word # For context
self.ordinal_threshold = ordinal_threshold

@property
Expand Down Expand Up @@ -651,21 +655,21 @@ def push(self, word: str, look_ahead: Optional[str] = None) -> bool:
elif (
word in self.lang.ZERO
and self.at_start_of_seq()
and (
look_ahead is None
or look_ahead in self.lang.NUMBERS
or look_ahead in self.lang.ZERO
or look_ahead in self.lang.DECIMAL_SEP
)
and look_ahead is not None
and look_ahead in self.lang.DECIMAL_SEP
):
self._value.append("0")
pass
elif (
word in self.lang.ZERO
and self.at_start_of_seq()
and look_ahead is not None
and look_ahead in self.lang.DECIMAL_SEP
# and (
# look_ahead is None
# or look_ahead in self.lang.NUMBERS
# or look_ahead in self.lang.ZERO
# or look_ahead in self.lang.DECIMAL_SEP
# )
):
pass
self._value.append("0")
elif self._push(self.lang.ord2card(word) or "", look_ahead):
self._value.append(
self.lang.num_ord(
Expand Down
17 changes: 13 additions & 4 deletions text_to_num/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@

omg = OrdinalsMerger()
USE_PT_ORDINALS_MERGER = True
WORD_SEP = re.compile(r"\s*[\.,;\(\)…\[\]:!\?]+\s*|\n")


def look_ahead(sequence: Sequence[Any]) -> Iterator[Tuple[Any, Any]]:
Expand Down Expand Up @@ -108,10 +109,8 @@ def alpha2digit(
raise Exception("Language not supported")

language = LANG[lang]
segments = re.split(
r"\s*[\.,;\(\)…\[\]:!\?]+\s*", text
)
punct = re.findall(r"\s*[\.,;\(\)…\[\]:!\?]+\s*", text)
segments = WORD_SEP.split(text)
punct = WORD_SEP.findall(text)
if len(punct) < len(segments):
punct.append("")

Expand All @@ -138,6 +137,7 @@ def alpha2digit(
signed=signed,
ordinal_threshold=ordinal_threshold,
)
last_word = None
in_number = False
out_tokens: List[str] = []
for word, ahead in look_ahead(tokens):
Expand All @@ -150,10 +150,12 @@ def alpha2digit(
relaxed=relaxed,
signed=signed,
ordinal_threshold=ordinal_threshold,
preceding_word=last_word
)
in_number = num_builder.push(word.lower(), ahead and ahead.lower())
if not in_number:
out_tokens.append(word)
last_word = word.lower()
# End of segment
num_builder.close()
if num_builder.value:
Expand Down Expand Up @@ -258,6 +260,13 @@ def revert_if_alone(sentence_effective_len: int, current_sentence: List[str]) ->
# finish LAST group but keep token_index
token_to_add = str(combined_num_result)
token_to_add_is_num = True
elif tmp_token_ordinal_org is not None:
# revert ordinal
sentence[len(sentence) - 1] = str(tmp_token_ordinal_org)
token_index += 1
token_to_add = " ".join(sentence)
token_to_add_is_num = False
current_token_ordinal_org = None
else:
# previous text was not a valid number
# prep. for next group
Expand Down

0 comments on commit 2cbdb65

Please sign in to comment.