feat: better evaluation scripts #116

Merged 3 commits on Apr 3, 2024
6 changes: 5 additions & 1 deletion .github/workflows/tests.yml
@@ -69,6 +69,10 @@ jobs:
        if: matrix.python-version != '3.6' && matrix.python-version != '3.7'
        run: pip install -r requirements-dev.txt

      - name: Install training dependencies
        if: matrix.python-version != '3.6' && matrix.python-version != '3.7'
        run: pip install -r training/requirements.txt

      - name: Install dependencies (legacy versions)
        if: matrix.python-version == '3.6' || matrix.python-version == '3.7'
        run: |
@@ -89,7 +93,7 @@ jobs:
        run: black --check --diff simplemma training tests

      - name: Type checking with mypy
        if: matrix.python-version != '3.6'
        if: matrix.python-version != '3.6' && matrix.python-version != '3.7'
        run: mypy -p simplemma -p training -p tests

      - name: Test with pytest
1 change: 1 addition & 0 deletions .gitignore
@@ -116,3 +116,4 @@ Makefile

# eval
UD/
training/**/data
13 changes: 0 additions & 13 deletions eval/README.rst

This file was deleted.

1 change: 0 additions & 1 deletion eval/eval-requirements.txt

This file was deleted.

134 changes: 0 additions & 134 deletions eval/udscore.py

This file was deleted.

20 changes: 8 additions & 12 deletions simplemma/language_detector.py
@@ -198,18 +198,14 @@ def proportion_in_target_languages(
        Returns:
            float: The proportion of text in the target language(s).
        """
        tokens = self._token_sampler.sample_text(text)
        if len(tokens) == 0:
            return 0

        in_target = 0
        for token in tokens:
            for lang_code in self._lang:
                candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
                if candidate is not None:
                    in_target += 1
                    break
        return in_target / len(tokens)
        return sum(
            percentage
            for (
                lang_code,
                percentage,
            ) in self.proportion_in_each_language(text).items()
            if lang_code != "unk"
        )

    def main_language(
        self,
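Note on the refactor above: ``proportion_in_target_languages`` now delegates to ``proportion_in_each_language`` and sums every share except ``"unk"``, so the two methods stay consistent by construction. A minimal usage sketch, assuming the import path matches the module shown above (the sample text and variable names are illustrative only)::

    from simplemma.language_detector import LanguageDetector

    detector = LanguageDetector(lang=("de", "en"))
    text = "Dieser Satz ist auf Deutsch."

    # Per-language shares, with "unk" covering tokens no target language lemmatizes.
    per_language = detector.proportion_in_each_language(text)

    # Equal to the sum of all non-"unk" shares above.
    in_target = detector.proportion_in_target_languages(text)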
8 changes: 0 additions & 8 deletions tests/test_language_detector.py
@@ -108,14 +108,6 @@ def test_in_target_language() -> None:
        == 1.0
    )

    langs = ("en", "de")
    text = "It was a true gift"
    assert (
        LanguageDetector(lang=langs).proportion_in_target_languages(text)
        == in_target_language(text, lang=langs)
        == 1.0
    )


def test_main_language():
    text = "Dieser Satz ist auf Deutsch."
15 changes: 15 additions & 0 deletions training/README.rst
@@ -0,0 +1,15 @@
Instructions to run the evaluation
----------------------------------

The scores are calculated on single-word tokens (including some contractions but not merged prepositions) from `Universal Dependencies <https://universaldependencies.org/>`_ treebanks. They can be reproduced with the following steps (a rough sketch of the per-dataset scoring is shown after the list):

1. Install the evaluation dependencies, Python >= 3.8 required (``pip install -r training/requirements.txt``).
2. Update ``DATA_URL`` in ``training/download-eval-data.py`` to point to the latest treebanks archive from `Universal Dependencies <https://universaldependencies.org/#download>`_ (or the version that you wish to use).
3. Run ``python3 training/download-eval-data.py``, which will:

   1. Download the archive
   2. Extract the relevant data (language and, if applicable, a specific treebank; see the notes in the results table)
   3. Concatenate the train, dev and test data into a single file (e.g. ``cat de_gsd*.conllu > de-gsd-all.conllu``)
   4. Store the files at the expected location (``training/data/UD/``)

4. Run the evaluation script from the repository root, e.g. ``python3 training/evaluate_simplema.py``.
5. Results are stored in ``training/data/results/results_summary.csv``. Errors are also written to a CSV file for each dataset under the ``training/data/results`` folder.

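For reference, a minimal sketch of what such a per-dataset evaluation amounts to: reading gold lemmas from a concatenated CoNLL-U file and comparing them with simplemma's output on single-word tokens. The helper name and file path are hypothetical, and the actual evaluation script may differ::

    from simplemma import lemmatize


    def score_conllu(conllu_path: str, lang: str) -> float:
        """Accuracy of simplemma lemmas against gold lemmas on single-word tokens."""
        total = correct = 0
        with open(conllu_path, encoding="utf-8") as conllu:
            for line in conllu:
                if not line.strip() or line.startswith("#"):
                    continue  # skip blank lines and sentence-level comments
                columns = line.rstrip("\n").split("\t")
                token_id, form, gold_lemma = columns[0], columns[1], columns[2]
                # Skip multiword token ranges (e.g. "3-4") and empty nodes (e.g. "5.1")
                if "-" in token_id or "." in token_id:
                    continue
                total += 1
                if lemmatize(form, lang=lang) == gold_lemma:
                    correct += 1
        return correct / total if total else 0.0


    # Hypothetical usage, assuming the data was prepared as described above:
    # score_conllu("training/data/UD/de_gsd-all.conllu", "de")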
71 changes: 71 additions & 0 deletions training/download-eval-data.py
@@ -0,0 +1,71 @@
from typing import Iterable, List, Tuple
from os import mkdir, path, scandir
import re
import logging
import tarfile
import requests
from glob import glob

from simplemma.strategies.dictionaries.dictionary_factory import SUPPORTED_LANGUAGES

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
DATA_URL = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5150/ud-treebanks-v2.12.tgz?sequence=1&isAllowed=y"
DATA_FOLDER = path.join(path.dirname(__file__), "data")
DATA_FILE = path.join(DATA_FOLDER, "ud-treebanks.tgz")
CLEAN_DATA_FOLDER = path.join(DATA_FOLDER, "UD")


def get_dirs(folder: str) -> List[str]:
    return [entry.name for entry in scandir(folder) if entry.is_dir()]


def get_files(folder: str) -> List[str]:
    return [entry.name for entry in scandir(folder) if entry.is_file()]


def get_relevant_language_data_folders(
    data_folder: str,
) -> Iterable[Tuple[str, str, str]]:
    for lang_folder in get_dirs(data_folder):
        lang_data_folder = path.join(data_folder, lang_folder)
        conllu_file = glob(path.join(lang_data_folder, "*.conllu"))[0]
        matches_files = re.search("^.*/(.*)-ud.*$", conllu_file)
        if matches_files is not None:
            dataset_name = matches_files.groups()[0]
            lang = dataset_name.split("_")[0]

            if lang in SUPPORTED_LANGUAGES:
                yield (lang, dataset_name, lang_data_folder)


if path.exists(DATA_FOLDER) or path.exists(CLEAN_DATA_FOLDER):
    raise Exception(
        "Data folder seems to be already present. Delete it before creating new data."
    )

mkdir(DATA_FOLDER)
mkdir(CLEAN_DATA_FOLDER)

log.info("Downloading evaluation data...")
response = requests.get(DATA_URL)
with open(DATA_FILE, "wb") as archive:
    archive.write(response.content)

log.info("Uncompressing evaluation data...")
with tarfile.open(DATA_FILE) as tar:
    tar.extractall(DATA_FOLDER)
# glob already returns paths prefixed with DATA_FOLDER, so no extra join is needed.
uncompressed_data_folder = glob(path.join(DATA_FOLDER, "ud-treebanks-*"))[0]

log.info("Filtering files...")
for lang, dataset_name, dataset__folder in get_relevant_language_data_folders(
uncompressed_data_folder
):
log.info(lang + " - " + dataset__folder)
# Concatenate the train, dev and test data into a single file (e.g. ``cat de_gsd*.conllu > de-gsd-all.conllu``)
lang_clean_data_file = path.join(CLEAN_DATA_FOLDER, f"{dataset_name}.conllu")
log.debug(f"Procressing data for {dataset_name}")
with open(lang_clean_data_file, "w") as outfile:
for file in glob(path.join(dataset__folder, "*.conllu")):
with open(file) as infile:
for line in infile:
outfile.write(line)
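A possible follow-up for ``download-eval-data.py``: ``requests.get(DATA_URL)`` buffers the whole UD archive in memory before writing it out. A hedged alternative using streaming, reusing the ``DATA_URL`` and ``DATA_FILE`` constants defined above::

    import requests

    # Stream the archive to disk in chunks instead of holding it all in memory.
    with requests.get(DATA_URL, stream=True) as response:
        response.raise_for_status()
        with open(DATA_FILE, "wb") as archive:
            for chunk in response.iter_content(chunk_size=1 << 20):
                archive.write(chunk)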