Merge pull request #108 from allo-media/master

Bug fix release.
allo-media · Mar 19, 2024 · 2cbdb65 · 2cbdb65
2 parents a24659d + c0d7bd3
commit 2cbdb65
Show file tree

Hide file tree

Showing 10 changed files with 85 additions and 21 deletions.
diff --git a/.readthedocs.yml b/.readthedocs.yml
@@ -0,0 +1,16 @@
+# Required
+version: 2
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+  configuration: docs/conf.py
+
+# Build documentation with MkDocs
+#mkdocs:
+#  configuration: mkdocs.yml
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.8"
+
diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 from setuptools import setup, find_packages
 
 
-VERSION = "2.5.0"
+VERSION = "2.5.1"
 
 
 def readme():

diff --git a/tests/test_text_to_num_de.py b/tests/test_text_to_num_de.py
@@ -84,6 +84,11 @@ def test_text2num_zeroes(self):
         self.assertRaises(ValueError, text2num, "fünfzignullzwei", "de")
         self.assertRaises(ValueError, text2num, "fünfzigdreinull", "de")
 
+    def test_text2num_hundred_addition(self):
+        self.assertRaises(ValueError, text2num, "achtundachtzig dreihundert", "de")
+        self.assertRaises(ValueError, text2num, "zwanzig dreihundert", "de")
+        self.assertRaises(ValueError, text2num, "zwei zwölfhundert", "de")
+
     def test_alpha2digit_integers(self):
         source = "fünfundzwanzig Kühe, zwölf Hühner und einhundertfünfundzwanzig kg Kartoffeln."
         expected = "25 Kühe, 12 Hühner und 125 kg Kartoffeln."
@@ -297,3 +302,21 @@ def test_uppercase(self):
         source = "FÜNFZEHN EINS ZEHN EINS"
         expected = "15 1 10 1"
         self.assertEqual(alpha2digit(source, "de"), expected)
+
+    def test_ordinals_false_positives(self):
+        source = "In zehnten Jahrzehnten. Und einmal mit den Vereinten."
+        expected = "In 10. Jahrzehnten. Und einmal mit den Vereinten."
+        self.assertEqual(alpha2digit(source, "de"), expected)
+
+        source = "Dies ist eine Liste oder die Einkaufsliste."
+        expected = source
+        self.assertEqual(alpha2digit(source, "de"), expected)
+
+    def test_hundred_addition(self):
+        source = "Zahlen wie vierzig fünfhundert Tausend zweiundzwanzig hundert sind gut."
+        expected = "Zahlen wie 40 500022 100 sind gut."
+        self.assertEqual(alpha2digit(source, "de"), expected)
+
+        source = "achtundachtzig sieben hundert, acht und achtzig siebenhundert, achtundachtzig sieben hundert, acht und achtzig sieben hundert"
+        expected = "88 700, 88 700, 88 700, 88 700"
+        self.assertEqual(alpha2digit(source, "de"), expected)
diff --git a/tests/test_text_to_num_en.py b/tests/test_text_to_num_en.py
@@ -49,6 +49,7 @@ def test_text2num(self):
         self.assertEqual(text2num("one hundred fifteen", "en"), 115)
         self.assertEqual(text2num("seventy-five thousands", "en"), 75000)
         self.assertEqual(text2num("thousand nine hundred twenty", "en"), 1920)
+        self.assertEqual(text2num("one billion twenty-five millions", "en"), 1_025_000_000)
 
     def test_text2num_centuries(self):
         self.assertEqual(text2num("nineteen hundred seventy-three", "en"), 1973)
@@ -125,6 +126,7 @@ def test_alpha2digit_zero(self):
         self.assertEqual(alpha2digit(source, "en"), expected)
 
         self.assertEqual(alpha2digit("zero", "en"), "0")
+        self.assertEqual(alpha2digit("zero love", "en"), "0 love")
 
     def test_alpha2digit_ordinals(self):
         source = (
@@ -174,6 +176,10 @@ def test_one_as_noun_or_article(self):
         self.assertEqual(alpha2digit(source, "en"), source)
         source = "one cannot know"
         self.assertEqual(alpha2digit(source, "en"), source)
+        # Following an ordinal
+        source = "the sixth one"
+        expected = "the 6th one"
+        self.assertEqual(alpha2digit(source, "en"), expected)
         # End of segment
         source = "No one. Another one. One one. Twenty one"
         expected = "No one. Another one. 1 1. 21"

diff --git a/tests/test_text_to_num_fr.py b/tests/test_text_to_num_fr.py
@@ -46,6 +46,7 @@ def test_text2num(self):
 
         self.assertEqual(text2num("quinze", "fr"), 15)
         self.assertEqual(text2num("soixante quinze mille", "fr"), 75000)
+        self.assertEqual(text2num("un milliard vingt-cinq millions", "fr"), 1_025_000_000)
 
     def test_text2num_variants(self):
         self.assertEqual(text2num("quatre-vingt dix-huit", "fr"), 98)
@@ -142,6 +143,8 @@ def test_alpha2digit_zero(self):
         # self.assertEqual(alpha2digit(source, "fr"), source)
 
         self.assertEqual(alpha2digit("zéro", "fr"), "0")
+        self.assertEqual(alpha2digit("a a un trois sept trois trois sept cinq quatre zéro c c", "fr"), "a a 1 3 7 3 3 7 5 4 0 c c")
+        self.assertEqual(alpha2digit("sept un zéro", "fr"), "7 1 0")
 
     def test_alpha2digit_ordinals(self):
         source = (
@@ -200,3 +203,6 @@ def test_article(self):
     def test_un_pronoun(self):
         source = "Je n'en veux qu'un. J'annonce: le un"
         self.assertEqual(alpha2digit(source, "fr"), source)
+
+    def test_alpha2digit_newline(self):
+        self.assertEqual(alpha2digit("dix + deux\n= douze", "fr"), "10 + 2\n= 12")
diff --git a/text_to_num/lang/base.py b/text_to_num/lang/base.py
@@ -70,7 +70,7 @@ def normalize(self, word: str) -> str:
         return NotImplemented
 
     def not_numeric_word(self, word: Optional[str]) -> bool:
-        return word is None or word != self.DECIMAL_SEP and word not in self.NUMBERS
+        return word is None or word != self.DECIMAL_SEP and word not in self.NUMBERS and word not in self.ZERO
 
     def split_number_word(self, word: str) -> str:  # maybe use: List[str]
         """In some languages numbers are written as one word, e.g. German

diff --git a/text_to_num/lang/english.py b/text_to_num/lang/english.py
@@ -117,7 +117,7 @@ class English(Language):
 
     AND_NUMS: Set[str] = set()
     AND = "and"
-    NEVER_IF_ALONE = {"one"}
+    NEVER_IF_ALONE = {"one", "o"}
 
     # Relaxed composed numbers (two-words only)
     # start => (next, target)

diff --git a/text_to_num/lang/german.py b/text_to_num/lang/german.py
@@ -211,7 +211,7 @@ def split_number_word(self, word: str) -> str:
             if not found:
                 # is (large) ordinal ending?
                 ord_match = None
-                if len(result) > 3 and text.startswith("ste"):
+                if not invalid_word and len(result) > 3 and text.startswith("ste"):
                     ord_match = re.search(self.LARGE_ORDINAL_SUFFIXES_GER, text)
 
                 if ord_match:

diff --git a/text_to_num/parsers.py b/text_to_num/parsers.py
@@ -124,13 +124,14 @@ def is_coef_appliable(self, coef: int) -> bool:
             # a multiplier can be applied to anything lesser than itself,
             # as long as it not zero (special case for 1000 which then implies 1)
             return True
-        if coef * coef <= self.n000_val:
+        if coef * 1000 <= self.n000_val:
             # a multiplier can not be applied to a value bigger than itself,
             # so it must be applied to the current group only.
             # ex. for "mille": "deux millions cent cinquante mille"
             # ex. for "millions": "trois milliard deux cent millions"
             # But not twice: "dix mille cinq mille" is invalid for example. Therefore,
-            # we test the square of ``coef``.
+            # we test 1000 × ``coef`` (as the multipliers above 100
+            # form a geometric progression of ratio 1000)
             return (
                 self.grp_val > 0 or coef == 1000
             )  # "mille" without unit      is additive
@@ -345,6 +346,8 @@ def parse(self, text: str) -> bool:
                 elif (ng[hundred_index - 1] in self.lang.UNITS) or (
                     ng[hundred_index - 1] in self.lang.STENS
                 ):
+                    if hundred_index - 2 >= 0 and ng[hundred_index - 2] not in self.lang.MULTIPLIERS:
+                        raise ValueError("invalid {} without multiplier: {}".format(STATIC_HUNDRED, repr(ng)))
                     multiplier = German.NUMBER_DICT_GER[ng[hundred_index - 1]]
                     equation += "(" + str(multiplier) + " * 100)"
                     equation_results.append(multiplier * 100)
@@ -555,6 +558,7 @@ def __init__(
         relaxed: bool = False,
         signed: bool = True,
         ordinal_threshold: int = 3,
+        preceding_word: Optional[str] = None
     ) -> None:
         """Initialize the parser.
 
@@ -574,7 +578,7 @@ def __init__(
         self.in_frac = False
         self.closed = False  # For deferred stop
         self.open = False  # For efficiency
-        self.last_word: Optional[str] = None  # For context
+        self.last_word: Optional[str] = preceding_word  # For context
         self.ordinal_threshold = ordinal_threshold
 
     @property
@@ -651,21 +655,21 @@ def push(self, word: str, look_ahead: Optional[str] = None) -> bool:
         elif (
             word in self.lang.ZERO
             and self.at_start_of_seq()
-            and (
-                look_ahead is None
-                or look_ahead in self.lang.NUMBERS
-                or look_ahead in self.lang.ZERO
-                or look_ahead in self.lang.DECIMAL_SEP
-            )
+            and look_ahead is not None
+            and look_ahead in self.lang.DECIMAL_SEP
         ):
-            self._value.append("0")
+            pass
         elif (
             word in self.lang.ZERO
             and self.at_start_of_seq()
-            and look_ahead is not None
-            and look_ahead in self.lang.DECIMAL_SEP
+            # and (
+            #     look_ahead is None
+            #     or look_ahead in self.lang.NUMBERS
+            #     or look_ahead in self.lang.ZERO
+            #     or look_ahead in self.lang.DECIMAL_SEP
+            # )
         ):
-            pass
+            self._value.append("0")
         elif self._push(self.lang.ord2card(word) or "", look_ahead):
             self._value.append(
                 self.lang.num_ord(

diff --git a/text_to_num/transforms.py b/text_to_num/transforms.py
@@ -36,6 +36,7 @@
 
 omg = OrdinalsMerger()
 USE_PT_ORDINALS_MERGER = True
+WORD_SEP = re.compile(r"\s*[\.,;\(\)…\[\]:!\?]+\s*|\n")
 
 
 def look_ahead(sequence: Sequence[Any]) -> Iterator[Tuple[Any, Any]]:
@@ -108,10 +109,8 @@ def alpha2digit(
         raise Exception("Language not supported")
 
     language = LANG[lang]
-    segments = re.split(
-        r"\s*[\.,;\(\)…\[\]:!\?]+\s*", text
-    )
-    punct = re.findall(r"\s*[\.,;\(\)…\[\]:!\?]+\s*", text)
+    segments = WORD_SEP.split(text)
+    punct = WORD_SEP.findall(text)
     if len(punct) < len(segments):
         punct.append("")
 
@@ -138,6 +137,7 @@ def alpha2digit(
                 signed=signed,
                 ordinal_threshold=ordinal_threshold,
             )
+            last_word = None
             in_number = False
             out_tokens: List[str] = []
             for word, ahead in look_ahead(tokens):
@@ -150,10 +150,12 @@ def alpha2digit(
                         relaxed=relaxed,
                         signed=signed,
                         ordinal_threshold=ordinal_threshold,
+                        preceding_word=last_word
                     )
                     in_number = num_builder.push(word.lower(), ahead and ahead.lower())
                 if not in_number:
                     out_tokens.append(word)
+                last_word = word.lower()
             # End of segment
             num_builder.close()
             if num_builder.value:
@@ -258,6 +260,13 @@ def revert_if_alone(sentence_effective_len: int, current_sentence: List[str]) ->
                         # finish LAST group but keep token_index
                         token_to_add = str(combined_num_result)
                         token_to_add_is_num = True
+                elif tmp_token_ordinal_org is not None:
+                    # revert ordinal
+                    sentence[len(sentence) - 1] = str(tmp_token_ordinal_org)
+                    token_index += 1
+                    token_to_add = " ".join(sentence)
+                    token_to_add_is_num = False
+                    current_token_ordinal_org = None
                 else:
                     # previous text was not a valid number
                     # prep. for next group