Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tests #16

Merged
merged 22 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ TEMPERATURE=0.2
# FUZZY=False
# Force translating already translated entries. Can be overridden on the command line (-f or --force). Default is False
# FORCE=False
# Overwrite the output po file. Can be overridden on the command line (-o or --overwrite). Default is False
# OVERWRITE_OUTPUT=False
# Compile the output po file to an mo file. Can be overridden on the command line (-c or --compile). Default is False
# COMPILE=False

Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/build-package.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,20 @@ on:

env:
LLM_CLIENT: ollama
LLM_MODEL: gemma2:2b # or phi3
LLM_MODEL: qwen2.5:3b # or gemma2:2b
LOG_LEVEL: INFO
INPUT_PO: tests/input/input.po
INPUT_PO: tests/input/test-small.po
ORIGINAL_LANGUAGE: English
CONTEXT_LANGUAGE: French
TARGET_LANGUAGES: Italian # comma separated list
OLLAMA_BASE_URL: "http://localhost:11434/v1"
# 2 files used to cache the Ollama version and model list
# so that they do not need to be downloaded every time
# Touch this file to force it to update Ollama
OLLAMA_VERSION_FILE: '.github/workflows/ollama-version.txt'
OLLAMA_VERSION_FILE: 'tests/resources/ollama-version.txt'
# Put in this file a list of all models you want to pull from Ollama, one per line.
# LLM_MODEL must be set to one of these
MODEL_LIST_FILE: '.github/workflows/model-list.txt'
MODEL_LIST_FILE: 'tests/resources/model-list.txt'

jobs:

Expand Down Expand Up @@ -146,4 +146,4 @@ jobs:
# EXTREMELY WEIRD: if you remove these 2 lines, the test fails because LLM_MODEL is not set.
echo "Running pytest with environment variables:"
env | grep -E 'LOG_LEVEL|INPUT_PO|ORIGINAL_LANGUAGE|CONTEXT_LANGUAGE|TARGET_LANGUAGES|LLM_CLIENT|LLM_MODEL|OLLAMA_BASE_URL'
pytest -s -v ./tests
pytest -m 'not gentestvalues and not asserts_llm_results' -x -s -v
2 changes: 0 additions & 2 deletions .github/workflows/model-list.txt

This file was deleted.

37 changes: 30 additions & 7 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,23 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: pytest",
"type": "debugpy",
"request": "launch",
"module": "pytest",
"args": [
"-m",
"not gentestvalues and not asserts_llm_results",
"-v",
"-x",
"-s",
"-k",
"test_django"
// "accounts"
],
"justMyCode": false,
},
{
"name": "Python Debugger: Current File",
"type": "debugpy",
Expand All @@ -12,14 +29,20 @@
"console": "integratedTerminal",
"justMyCode": false,
"args": [
"--llm", "ollama",
"--model", "gemma2:2b",
"--input_po", "tests/input/fr/LC_MESSAGES/input.po",
"--target_language", "Italian",
"--context_language", "English",
"--original_language", "French",
"--llm",
"ollama",
"--model",
"gemma2:2b",
"--input_po",
"tests/input/fr/LC_MESSAGES/input.po",
"--target_language",
"Italian",
"--context_language",
"English",
"--original_language",
"French",
"--verbose"
]
}
},
]
}
7 changes: 7 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
9 changes: 9 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,12 @@ auto_djangopo_lyglot = "auto_po_lyglot.po_django_main:main"

[tool.hatch.build]
include = ["src"]

[tool.pytest.ini_options]
markers = [
"gentestvalues: marks tests as generating test values (deselect with '-m \"not gentestvalues\"')...",
"asserts_llm_results: tests which can fail because they check LLM results. (deselect with '-m \"not asserts_llm_results\"')...",
]
testpaths = [
"./tests",
]
193 changes: 132 additions & 61 deletions src/auto_po_lyglot/clients/client_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,11 +170,11 @@ def get_system_prompt(self):
raise PoLyglotException(f"examples.py does not contain an example for these piece: {str(e)}")

system_prompt = format.format(**prompt_params)
if self.first:
logger.info(f"First system prompt:\n{system_prompt}")
self.first = False
else:
logger.debug(f"System prompt:\n{system_prompt}")
# if self.first:
# logger.info(f"First system prompt:\n{system_prompt}")
# self.first = False
# else:
logger.debug(f"System prompt:\n{system_prompt}")
return system_prompt

def get_user_prompt(self, phrase, context_translation):
Expand All @@ -191,6 +191,13 @@ def get_user_prompt(self, phrase, context_translation):
return format.format(**params)

def process_translation(self, raw_result):
"""
Process the raw translation result
Args:
raw_result (str): The raw translation result
Returns:
tuple(str,str): The translation and its explanation
"""
translation_result = raw_result.split('\n')
translation = translation_result[0].strip(' "')
explanation = None
Expand All @@ -202,24 +209,29 @@ def process_translation(self, raw_result):
return translation, explanation

def translate(self, phrase, context_translation):
    """
    Translate one phrase, using its context translation as a hint.

    Args:
        phrase (str): The phrase to translate
        context_translation (str): The translation of the phrase in the context language
    Returns:
        The value returned by process_translation() for the raw LLM answer
        (the translation and its explanation).
    Raises:
        PoLyglotException: if no target language has been set on the client
    """
    if self.target_language is None:
        raise PoLyglotException("Error:target_language must be set before trying to translate anything")
    # Build the system prompt first, then the user prompt, and send both to the model.
    prompts = (self.get_system_prompt(), self.get_user_prompt(phrase, context_translation))
    return self.process_translation(self.get_translation(*prompts))

def translate_pofile(self, input_file, output_file):
logger.info(f"Translating {input_file} to {self.target_language} in {output_file}")
def set_po_header_and_metadata(self, po, input_file):
input_path = Path(input_file)
if str(input_path.parents[1]) == 'LC_MESSAGES':
app_name = input_path.parents[4].name.capitalize()
wr_input_file = '/'.join(input_path.parts[-6:]) # don't keep the beginning of the file name to put in the header
else:
app_name = "NO NAME FOUND"
wr_input_file = input_file
po = polib.pofile(input_file)
out_po = polib.pofile(output_file) if Path(output_file).exists() else None
po.header = f"""{self.target_language} Translations for {app_name} app.
Copyright (C) {datetime.now().year} {self.params.owner}
This file is distributed under the same license as the application.
Expand All @@ -233,58 +245,110 @@ def translate_pofile(self, input_file, output_file):
po.metadata['Language'] = get_language_code(self.target_language).upper()
po.metadata['PO-Revision-Date'] = f"{datetime.now():%Y-%m-%d %H:%M+00:00}\n" # "2024-08-07 20:09+0200""

def _copy_entry(self, to_entry, from_entry):
for attr in ["msgid", "msgstr", "msgid_plural", "fuzzy",
"obsolete", "comment", "msgctxt", "encoding",
"occurrences", "tcomment", "flags",
"previous_msgctxt", "previous_msgid",
"previous_msgid_plural", "linenum"]:
setattr(to_entry, attr, getattr(from_entry, attr))
if from_entry.msgstr_plural: # entry with plural management. Deep copy the plural case
to_entry.msgstr_plural = from_entry.msgstr_plural.copy()

def translate_entry(self, entry, out_po=None):
    """
    Translate a single entry. The entry is updated in-place.

    Args:
        entry (polib.POEntry): The entry to translate
        out_po (polib.POFile): The output po file if already existing
    Returns:
        dict: a report with two keys:
            - "status": 'Empty' (no msgid, nothing done), 'Fuzzy' (fuzzy entry skipped),
              'Already' (translation copied from out_po), 'Singular' or 'Plural'
              (entry translated, without or with a plural form).
            - "forced": False, or the string "True" when the entry was found in out_po
              and translated anyway.
    """
    forced = False
    if not entry.msgid:
        return {"status": 'Empty', "forced": forced}
    # don't translate fuzzy entries except if forced by 'fuzzy' param
    if entry.fuzzy and not self.params.fuzzy:
        return {"status": 'Fuzzy', "forced": forced}
    if out_po:
        out_entry = out_po.find(entry.msgid)
        # don't translate again the existing translations except if forced by params
        if out_entry:
            # .get() instead of [0]: a plural entry without any stored form is simply "not translated yet"
            already_translated = (out_entry.msgstr != "" or
                                  (out_entry.msgid_plural and out_entry.msgstr_plural.get(0, "") != ""))
            if already_translated and not self.params.force:
                self._copy_entry(entry, out_entry)
                return {"status": 'Already', "forced": forced}
            else:
                # Kept as the string "True": translate_pofile compares this value against 'True'.
                forced = "True"
    original_phrase = entry.msgid
    if entry.msgid_plural:  # entry with plural management. First manage the singular case
        # Fall back on the plural msgid when there is no singular form yet (empty dict or missing key 0)
        context_translation = entry.msgstr_plural.get(0, entry.msgid_plural)
    else:
        context_translation = entry.msgstr if entry.msgstr else entry.msgid
    translation, explanation = self.translate(original_phrase, context_translation)
    # Add explanation to comment
    if explanation:
        entry.comment = explanation
    # Update translation
    if entry.msgid_plural:  # entry with plural management. Update the singular case
        entry.msgstr_plural[0] = translation
    else:
        entry.msgstr = translation
    logger.info(f"""==================
{self.params.original_language}: "{original_phrase}"
{self.params.context_language}: "{context_translation}"
{self.target_language}: "{translation}"
Comment:{explanation if explanation else ''}
""")

    if entry.msgid_plural:  # entry with plural management. Now manage the plural case
        original_phrase = entry.msgid_plural
        # msgstr_plural is never empty here (key 0 was just written), so key 1 must be looked up
        # safely: fall back on the plural msgid when no plural form is stored.
        context_translation = entry.msgstr_plural.get(1, entry.msgid_plural)
        translation, explanation = self.translate(original_phrase, context_translation)
        # Update translation
        entry.msgstr_plural[1] = translation
        # Note: the plural explanation is **not** stored in the out po file.
        logger.info(f"""================== PLURAL CASE ==================
{self.params.original_language}: "{original_phrase}"
{self.params.context_language}: "{context_translation}"
{self.target_language}: "{translation}"
Comment:{explanation if explanation else ''}
""")
        return {"status": 'Plural', "forced": forced}
    return {"status": 'Singular', "forced": forced}

def translate_pofile(self, input_file, output_file):
"""
Translate a .po file (given by input_file) from its original language to the target language and saves it
to output_file. If the output_file already exists, it will be overwritten, otherwise it will be created.
The function returns a tuple containing:
- the number of translated entries,
- the percent of translated entries,
- the number of entries that were already translated and not taken into account
(if output_file already exists and force=False),
- the number of forced (ie overwritten) entries (if output_file already exists and force=True),
- and the number of fuzzy entries not taken into account (if fuzzy=False).
"""
logger.info(f"Translating {input_file} to {self.target_language} in {output_file}")
po = polib.pofile(input_file)
out_po = polib.pofile(output_file) if Path(output_file).exists() else None
self.set_po_header_and_metadata(po, input_file)
try:
nb_translations = 0
already_translated = 0
forced = 0
fuzzy = 0
for entry in po:
if entry.msgid and (self.params.fuzzy or not entry.fuzzy):

# don't translate again the existing translations except if forced
if out_po:
out_entry = out_po.find(entry.msgid)
if out_entry and out_entry.msgstr != "" and not self.params.force:
entry.msgstr = out_entry.msgstr
already_translated += 1
continue

original_phrase = entry.msgid
if entry.msgid_plural: # entry with plural management. First manage the singular case
context_translation = entry.msgstr_plural[0] if entry.msgstr_plural else entry.msgid_plural
else:
context_translation = entry.msgstr if entry.msgstr else entry.msgid
translation, explanation = self.translate(original_phrase, context_translation)
# Add explanation to comment
if explanation:
entry.comment = explanation
# Update translation
if entry.msgid_plural: # entry with plural management. Update the singular case
entry.msgstr_plural[0] = translation
else:
entry.msgstr = translation
logger.info(f"""==================
{self.params.original_language}: "{original_phrase}"
{self.params.context_language}: "{context_translation}"
{self.target_language}: "{translation}"
Comment:{explanation if explanation else ''}
""")
sleep(1.0) # Sleep for 1 second to avoid rate limiting

if entry.msgid_plural: # entry with plural management. Now manage the plural case
original_phrase = entry.msgid_plural
context_translation = entry.msgstr_plural[1] if entry.msgstr_plural else entry.msgid_plural
translation, explanation = self.translate(original_phrase, context_translation)
# Update translation
entry.msgstr_plural[1] = translation
# Note: the plural explanation is not stored in the out po file.
logger.info(f"""================== PLURAL CASE ==================
{self.params.original_language}: "{original_phrase}"
{self.params.context_language}: "{context_translation}"
{self.target_language}: "{translation}"
Comment:{explanation if explanation else ''}
""")
sleep(1.0) # Sleep for 1 second to avoid rate limiting

nb_translations += 1
res = self.translate_entry(entry, out_po)
if res['status'] == 'Already':
already_translated += 1
elif res['status'] == 'Fuzzy':
fuzzy += 1
elif res['forced'] == 'True':
forced += 1
sleep(0.5) # Sleep for 1/2 second to avoid rate limiting
nb_translations += 1
except Exception as e:
logger.error(f"Error: {e}")
# Save the new .po file even if there was an error to not lose what was translated
Expand All @@ -293,7 +357,14 @@ def translate_pofile(self, input_file, output_file):
logger.info(f"Compiling {output_file}")
mo_output_file = Path(output_file).with_suffix('.mo')
po.save_as_mofile(mo_output_file)
percent_translated = round(nb_translations / (len(po)-already_translated) * 100, 2)
logger.info(f"Saved {output_file}, translated {nb_translations} entries out "
f"of {len(po)} entries, with {already_translated} entries already translated and not taken into account "
f"({percent_translated}%)")
to_be_translated = len(po) - already_translated
if to_be_translated == 0:
logger.info(f"Nothing to translate in {output_file}")
percent_translated = 100
else:
percent_translated = round(nb_translations / (len(po)-already_translated) * 100, 2)
logger.info(f"Saved {output_file}, translated {nb_translations} entries out "
f"of {len(po)} entries, with {already_translated} entries already translated and not taken into account "
f"({percent_translated}%)")
logger.info(f"{forced} forced entries, {fuzzy} fuzzy entries")
return nb_translations, percent_translated, already_translated, forced, fuzzy
4 changes: 3 additions & 1 deletion src/auto_po_lyglot/clients/openai_ollama_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,10 @@ def __init__(self, params, target_language=None):


class OllamaClient(OpenAIAPICompatibleClient):
use_large_system_prompt = True # ollama tokens are free

def __init__(self, params, target_language=None):
params.model = params.model or "llama3.1:8b" # default model if not provided
params.model = params.model or "qwen2.5:3b" # default model if not provided, the most translation capable small model
params.ollama_base_url = params.ollama_base_url or 'http://localhost:11434/v1' # default Ollama local server URL
super().__init__(params, target_language)
self.client = OpenAI(api_key='Ollama_Key_Unused_But_Required', base_url=self.params.ollama_base_url)
11 changes: 11 additions & 0 deletions src/auto_po_lyglot/csv_extractor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/bin/env python
"""Converts output md files from transpopenai.py to CSV files."""
from pathlib import PurePath
import re
import csv
import sys
Expand Down Expand Up @@ -46,6 +47,16 @@ def extract_csv(input_file, output_file, languages=["English", "French", "Italia
writer.writerow(row)


def extract_csv_translations(output_file, params):
    """
    Extract the translations of output_file into a CSV file with the same name and a '.csv' suffix.

    Args:
        output_file (str or path-like): The file to extract the translations from
        params: The parameters; original_language, context_language and target_languages
            are read to build the list of languages passed to extract_csv()
    Returns:
        None. Exits the process with status 1 if output_file does not exist.
    """
    # Local import: only PurePath is imported at module level, and pure paths cannot query the filesystem.
    from pathlib import Path

    # Wrap in Path so the existence check also works when output_file is a plain string or a PurePath.
    if not Path(output_file).exists():
        print(f"Error: Input file '{output_file}' does not exist.")
        sys.exit(1)
    csv_file = PurePath(output_file).with_suffix('.csv')
    languages = [params.original_language, params.context_language] + params.target_languages
    extract_csv(output_file, csv_file, languages)
    print("CSV extracted to file:", csv_file)


def main():
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} <input_file>")
Expand Down
Loading
Loading