leolivier · leolivier · Oct 2, 2024 · Sep 23, 2024 · Sep 23, 2024 · Sep 23, 2024
diff --git a/.env.example b/.env.example
@@ -55,6 +55,8 @@ TEMPERATURE=0.2
 # FUZZY=False
 # Force translating already translated entries. Can be overriden on the command line (-f or --force). Default is False
 # FORCE=False
+# Overwrite the output po file. Can be overridden on the command line (-o or --overwrite). Default is False
+# OVERWRITE_OUTPUT=False
 # Compile the output po file to an mo file. Can be overriden on the command line (-c or --compile). Default is False
 # COMPILE=False
 

diff --git a/.github/workflows/build-package.yaml b/.github/workflows/build-package.yaml
@@ -12,20 +12,20 @@ on:
 
 env:
   LLM_CLIENT: ollama
-  LLM_MODEL: gemma2:2b  # or phi3
+  LLM_MODEL: qwen2.5:3b  # or gemma2:2b
   LOG_LEVEL: INFO
-  INPUT_PO: tests/input/input.po
+  INPUT_PO: tests/input/test-small.po
   ORIGINAL_LANGUAGE: English
   CONTEXT_LANGUAGE: French
   TARGET_LANGUAGES: Italian  # comma separated list
   OLLAMA_BASE_URL: "http://localhost:11434/v1"
   # 2 files used to cache the Ollama version and model list
   # so that they do not need to be downloaded every time
   # Touch this file to force it to update Ollama
-  OLLAMA_VERSION_FILE: '.github/workflows/ollama-version.txt'
+  OLLAMA_VERSION_FILE: 'tests/resources/ollama-version.txt'
   # Put in this file a list of all models you want to pull from Ollama, one per line. 
   # LLM_MODEL must be set to one of these
-  MODEL_LIST_FILE: '.github/workflows/model-list.txt'
+  MODEL_LIST_FILE: 'tests/resources/model-list.txt'
 
 jobs:
 
@@ -146,4 +146,4 @@ jobs:
         # EXTREMELY WEIRD: if you remove these 2 lines, the test fails because LLM_MODEL is not set.
         echo "Running pytest with environment variables:"
         env | grep -E 'LOG_LEVEL|INPUT_PO|ORIGINAL_LANGUAGE|CONTEXT_LANGUAGE|TARGET_LANGUAGES|LLM_CLIENT|LLM_MODEL|OLLAMA_BASE_URL'
-        pytest -s -v ./tests
+        pytest -m 'not gentestvalues and not asserts_llm_results' -x -s -v
diff --git a/.github/workflows/model-list.txt b/.github/workflows/model-list.txt
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -4,6 +4,23 @@
 	// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 	"version": "0.2.0",
 	"configurations": [
+		{
+			"name": "Python Debugger: pytest",
+			"type": "debugpy",
+			"request": "launch",
+			"module": "pytest",
+			"args": [
+				"-m",
+				"not gentestvalues and not asserts_llm_results",
+				"-v",
+				"-x",
+				"-s",
+				"-k",
+				"test_django"
+				// "accounts"
+			],
+			"justMyCode": false,
+		},
 		{
 			"name": "Python Debugger: Current File",
 			"type": "debugpy",
@@ -12,14 +29,20 @@
 			"console": "integratedTerminal",
 			"justMyCode": false,
 			"args": [
-				"--llm", "ollama",
-				"--model", "gemma2:2b",
-				"--input_po", "tests/input/fr/LC_MESSAGES/input.po",
-				"--target_language", "Italian",
-				"--context_language", "English",
-				"--original_language", "French",
+				"--llm",
+				"ollama",
+				"--model",
+				"gemma2:2b",
+				"--input_po",
+				"tests/input/fr/LC_MESSAGES/input.po",
+				"--target_language",
+				"Italian",
+				"--context_language",
+				"English",
+				"--original_language",
+				"French",
 				"--verbose"
 			]
-		}
+		},
 	]
 }
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,7 @@
+{
+	"python.testing.pytestArgs": [
+		"tests"
+	],
+	"python.testing.unittestEnabled": false,
+	"python.testing.pytestEnabled": true
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -54,3 +54,12 @@ auto_djangopo_lyglot = "auto_po_lyglot.po_django_main:main"
 
 [tool.hatch.build]
 include = ["src"]
+
+[tool.pytest.ini_options]
+markers = [
+  "gentestvalues: marks tests as generating test values (deselect with '-m \"not gentestvalues\"')...",
+  "asserts_llm_results: marks tests which can fail because they check LLM results (deselect with '-m \"not asserts_llm_results\"')...",
+]
+testpaths = [
+    "./tests",
+]
diff --git a/src/auto_po_lyglot/clients/client_base.py b/src/auto_po_lyglot/clients/client_base.py
@@ -170,11 +170,11 @@ def get_system_prompt(self):
       raise PoLyglotException(f"examples.py does not contain an example for these piece: {str(e)}")
 
     system_prompt = format.format(**prompt_params)
-    if self.first:
-      logger.info(f"First system prompt:\n{system_prompt}")
-      self.first = False
-    else:
-      logger.debug(f"System prompt:\n{system_prompt}")
+    # if self.first:
+    #   logger.info(f"First system prompt:\n{system_prompt}")
+    #   self.first = False
+    # else:
+    logger.debug(f"System prompt:\n{system_prompt}")
     return system_prompt
 
   def get_user_prompt(self, phrase, context_translation):
@@ -191,6 +191,13 @@ def get_user_prompt(self, phrase, context_translation):
     return format.format(**params)
 
   def process_translation(self, raw_result):
+    """
+    Process the raw translation result
+    Args:
+        raw_result (str): The raw translation result
+    Returns:
+        tuple(str,str): The translation and its explanation
+    """
     translation_result = raw_result.split('\n')
     translation = translation_result[0].strip(' "')
     explanation = None
@@ -202,24 +209,29 @@ def process_translation(self, raw_result):
     return translation, explanation
 
   def translate(self, phrase, context_translation):
+      """
+      Translate a single phrase using the given context translation
+      Args:
+          phrase (str): The phrase to translate
+          context_translation (str): The context translation
+      Returns:
+          tuple(str,str): The translation and its explanation, as returned by process_translation
+      Raises:
+          PoLyglotException: if target_language is not set
+      """
       if self.target_language is None:
         raise PoLyglotException("Error:target_language must be set before trying to translate anything")
       system_prompt = self.get_system_prompt()
       user_prompt = self.get_user_prompt(phrase, context_translation)
       raw_result = self.get_translation(system_prompt, user_prompt)
       return self.process_translation(raw_result)
 
-  def translate_pofile(self, input_file, output_file):
-    logger.info(f"Translating {input_file} to {self.target_language} in {output_file}")
+  def set_po_header_and_metadata(self, po, input_file):
     input_path = Path(input_file)
     if str(input_path.parents[1]) == 'LC_MESSAGES':
       app_name = input_path.parents[4].name.capitalize()
       wr_input_file = '/'.join(input_path.parts[-6:])  # don't keep the beginning of the file name to put in the header
     else:
       app_name = "NO NAME FOUND"
       wr_input_file = input_file
-    po = polib.pofile(input_file)
-    out_po = polib.pofile(output_file) if Path(output_file).exists() else None
     po.header = f"""{self.target_language} Translations for {app_name} app.
 Copyright (C) {datetime.now().year} {self.params.owner}
 This file is distributed under the same license as the application.
@@ -233,58 +245,110 @@ def translate_pofile(self, input_file, output_file):
     po.metadata['Language'] = get_language_code(self.target_language).upper()
     po.metadata['PO-Revision-Date'] = f"{datetime.now():%Y-%m-%d %H:%M+00:00}\n"  # "2024-08-07 20:09+0200""
 
+  def _copy_entry(self, to_entry, from_entry):
+    """
+    Copy the content of a po entry onto another po entry, attribute by attribute
+    Args:
+        to_entry: The entry receiving the values
+        from_entry: The entry the values are read from
+    Returns:
+        nothing (to_entry is updated in-place)
+    """
+    copied_attributes = (
+      "msgid", "msgstr", "msgid_plural", "fuzzy",
+      "obsolete", "comment", "msgctxt", "encoding",
+      "occurrences", "tcomment", "flags",
+      "previous_msgctxt", "previous_msgid",
+      "previous_msgid_plural", "linenum",
+    )
+    for name in copied_attributes:
+      value = getattr(from_entry, name)
+      setattr(to_entry, name, value)
+    plural_forms = from_entry.msgstr_plural
+    if plural_forms:
+      # entry with plural management: to_entry gets its own copy of the plural forms
+      to_entry.msgstr_plural = plural_forms.copy()
+
+  def translate_entry(self, entry, out_po=None):
+    """
+    Translate a single entry
+    Args:
+        entry (polib.POEntry): The entry to translate
+        out_po (polib.POFile): The output po file if already existing
+    Returns:
+        dict: {"status": <str>, "forced": <flag>}. The entry itself is updated in-place.
+        status is one of:
+          - 'Empty': the entry has no msgid, nothing was done
+          - 'Fuzzy': the entry is fuzzy and the 'fuzzy' param is not set, nothing was done
+          - 'Already': a translation was found in out_po and copied into the entry
+          - 'Singular': the entry was translated
+          - 'Plural': the entry and its plural form were translated
+        forced is False, or the string "True" when a translation already present in
+        out_po was translated again because of the 'force' param.
+    """
+    forced = False
+    if not entry.msgid:
+      return {"status": 'Empty', "forced": forced}
+    # dont translate fuzzy entries except if forced by 'fuzzy' param
+    if entry.fuzzy and not self.params.fuzzy:
+      return {"status": 'Fuzzy', "forced": forced}
+    if out_po:
+      out_entry = out_po.find(entry.msgid)
+      if out_entry:
+        # .get() instead of [0]: a plural entry without any stored plural form must not raise KeyError
+        already_translated = (out_entry.msgstr != "" or
+                              bool(out_entry.msgid_plural and out_entry.msgstr_plural.get(0, "") != ""))
+        # don't translate again the existing translations except if forced by params
+        if already_translated and not self.params.force:
+          self._copy_entry(entry, out_entry)
+          return {"status": 'Already', "forced": forced}
+        if already_translated:
+          # Only an existing translation can be "forced"; an untranslated output entry is a normal translation.
+          # The value stays the string "True" because translate_pofile compares res['forced'] to 'True'.
+          forced = "True"
+    original_phrase = entry.msgid
+    if entry.msgid_plural:  # entry with plural management. First manage the singular case
+      context_translation = entry.msgstr_plural[0] if entry.msgstr_plural else entry.msgid_plural
+    else:
+      context_translation = entry.msgstr if entry.msgstr else entry.msgid
+    translation, explanation = self.translate(original_phrase, context_translation)
+    # Add explanation to comment
+    if explanation:
+      entry.comment = explanation
+    # Update translation
+    if entry.msgid_plural:  # entry with plural management. Update the singular case
+      entry.msgstr_plural[0] = translation
+    else:
+      entry.msgstr = translation
+    logger.info(f"""==================
+{self.params.original_language}: "{original_phrase}"
+{self.params.context_language}: "{context_translation}"
+{self.target_language}: "{translation}"
+Comment:{explanation if explanation else ''}
+""")
+
+    if entry.msgid_plural:  # entry with plural management. Now manage the plural case
+      original_phrase = entry.msgid_plural
+      context_translation = entry.msgstr_plural[1] if entry.msgstr_plural else entry.msgid_plural
+      translation, explanation = self.translate(original_phrase, context_translation)
+      # Update translation
+      entry.msgstr_plural[1] = translation
+      # Note: the plural explanation is **not** stored in the out po file.
+      logger.info(f"""================== PLURAL CASE ==================
+{self.params.original_language}: "{original_phrase}"
+{self.params.context_language}: "{context_translation}"
+{self.target_language}: "{translation}"
+Comment:{explanation if explanation else ''}
+""")
+      return {"status": 'Plural', "forced": forced}
+    return {"status": 'Singular', "forced": forced}
+
+  def translate_pofile(self, input_file, output_file):
+    """
+    Translate a .po file (given by input_file) from its original language to the target language and saves it
+    to output_file. If the output_file already exists, it will be overwritten, otherwise it will be created.
+    The function returns a tuple containing:
+      - the number of translated entries,
+      - the percent of translated entries,
+      - the number of entries that were already translated and not taken into account
+        (if output_file already exists and force=False),
+      - the number of forced (ie overwritten) entries (if output_file already exists and force=True),
+      - and the number of fuzzy entries not taken into account (if fuzzy=False).
+    """
+    logger.info(f"Translating {input_file} to {self.target_language} in {output_file}")
+    po = polib.pofile(input_file)
+    out_po = polib.pofile(output_file) if Path(output_file).exists() else None
+    self.set_po_header_and_metadata(po, input_file)
     try:
       nb_translations = 0
       already_translated = 0
+      forced = 0
+      fuzzy = 0
       for entry in po:
-        if entry.msgid and (self.params.fuzzy or not entry.fuzzy):
-
-          # don't translate again the existing translations except if forced
-          if out_po:
-            out_entry = out_po.find(entry.msgid)
-            if out_entry and out_entry.msgstr != "" and not self.params.force:
-              entry.msgstr = out_entry.msgstr
-              already_translated += 1
-              continue
-
-          original_phrase = entry.msgid
-          if entry.msgid_plural:  # entry with plural management. First manage the singular case
-            context_translation = entry.msgstr_plural[0] if entry.msgstr_plural else entry.msgid_plural
-          else:
-            context_translation = entry.msgstr if entry.msgstr else entry.msgid
-          translation, explanation = self.translate(original_phrase, context_translation)
-          # Add explanation to comment
-          if explanation:
-            entry.comment = explanation
-          # Update translation
-          if entry.msgid_plural:  # entry with plural management. Update the singular case
-            entry.msgstr_plural[0] = translation
-          else:
-            entry.msgstr = translation
-          logger.info(f"""==================
-  {self.params.original_language}: "{original_phrase}"
-  {self.params.context_language}: "{context_translation}"
-  {self.target_language}: "{translation}"
-  Comment:{explanation if explanation else ''}
-  """)
-          sleep(1.0)  # Sleep for 1 second to avoid rate limiting
-
-          if entry.msgid_plural:  # entry with plural management. Now manage the plural case
-            original_phrase = entry.msgid_plural
-            context_translation = entry.msgstr_plural[1] if entry.msgstr_plural else entry.msgid_plural
-            translation, explanation = self.translate(original_phrase, context_translation)
-            # Update translation
-            entry.msgstr_plural[1] = translation
-            # Note: the plural explanation is not stored in the out po file.
-            logger.info(f"""================== PLURAL CASE ==================
-    {self.params.original_language}: "{original_phrase}"
-    {self.params.context_language}: "{context_translation}"
-    {self.target_language}: "{translation}"
-    Comment:{explanation if explanation else ''}  
-    """)
-            sleep(1.0)  # Sleep for 1 second to avoid rate limiting
-
-          nb_translations += 1
+        res = self.translate_entry(entry, out_po)
+        if res['status'] == 'Already':
+          already_translated += 1
+        elif res['status'] == 'Fuzzy':
+          fuzzy += 1
+        elif res['forced'] == 'True':
+          forced += 1
+        sleep(0.5)  # Sleep for 1/2 second to avoid rate limiting
+        nb_translations += 1
     except Exception as e:
       logger.error(f"Error: {e}")
     # Save the new .po file even if there was an error to not lose what was translated
@@ -293,7 +357,14 @@ def translate_pofile(self, input_file, output_file):
       logger.info(f"Compiling {output_file}")
       mo_output_file = Path(output_file).with_suffix('.mo')
       po.save_as_mofile(mo_output_file)
-    percent_translated = round(nb_translations / (len(po)-already_translated) * 100, 2)
-    logger.info(f"Saved {output_file}, translated {nb_translations} entries out "
-                f"of {len(po)} entries, with {already_translated} entries already translated and not taken into account "
-                f"({percent_translated}%)")
+    to_be_translated = len(po) - already_translated
+    if to_be_translated == 0:
+      logger.info(f"Nothing to translate in {output_file}")
+      percent_translated = 100
+    else:
+      percent_translated = round(nb_translations / (len(po)-already_translated) * 100, 2)
+      logger.info(f"Saved {output_file}, translated {nb_translations} entries out "
+                  f"of {len(po)} entries, with {already_translated} entries already translated and not taken into account "
+                  f"({percent_translated}%)")
+      logger.info(f"{forced} forced entries, {fuzzy} fuzzy entries")
+    return nb_translations, percent_translated, already_translated, forced, fuzzy
diff --git a/src/auto_po_lyglot/clients/openai_ollama_client.py b/src/auto_po_lyglot/clients/openai_ollama_client.py
@@ -43,8 +43,10 @@ def __init__(self, params, target_language=None):
 
 
 class OllamaClient(OpenAIAPICompatibleClient):
+    use_large_system_prompt = True  # ollama tokens are free
+
     def __init__(self, params, target_language=None):
-        params.model = params.model or "llama3.1:8b"  # default model if not provided
+        params.model = params.model or "qwen2.5:3b"  # default model if not provided, the most translation capable small model
         params.ollama_base_url = params.ollama_base_url or 'http://localhost:11434/v1'  # default Ollama local server URL
         super().__init__(params, target_language)
         self.client = OpenAI(api_key='Ollama_Key_Unused_But_Required', base_url=self.params.ollama_base_url)
diff --git a/src/auto_po_lyglot/csv_extractor.py b/src/auto_po_lyglot/csv_extractor.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 """Converts output md files from transpopenai.py to CSV files."""
+from pathlib import PurePath
 import re
 import csv
 import sys
@@ -46,6 +47,16 @@ def extract_csv(input_file, output_file, languages=["English", "French", "Italia
             writer.writerow(row)
 
 
+def extract_csv_translations(output_file, params):
+    """
+    Extract the translations found in output_file into a CSV file stored next to it
+    (same name, '.csv' suffix). Exits the program with status 1 if output_file does not exist.
+    Args:
+        output_file (str or path-like): The file to extract the translations from
+        params: object providing original_language, context_language and
+            target_languages (presumably a list of language names; verify against caller)
+    """
+    # Local import: only PurePath is imported at module level and it cannot access the filesystem
+    from pathlib import Path
+    translations_file = Path(output_file)  # accept plain strings as well as path objects
+    if not translations_file.exists():
+        print(f"Error: Input file '{output_file}' does not exist.")
+        sys.exit(1)
+    csv_file = translations_file.with_suffix('.csv')
+    languages = [params.original_language, params.context_language] + params.target_languages
+    extract_csv(output_file, csv_file, languages)
+    print("CSV extracted to file:", csv_file)
+
+
 def main():
     if len(sys.argv) != 2:
         print(f"Usage: {sys.argv[0]} <input_file>")