Skip to content

Commit

Permalink
feat: better evaluation scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
juanjoDiaz committed Sep 15, 2023
1 parent 75561b6 commit fb4dade
Show file tree
Hide file tree
Showing 9 changed files with 217 additions and 168 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,4 @@ Makefile

# eval
UD/
training/**/data
13 changes: 0 additions & 13 deletions eval/README.rst

This file was deleted.

134 changes: 0 additions & 134 deletions eval/udscore.py

This file was deleted.

20 changes: 8 additions & 12 deletions simplemma/language_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,18 +198,14 @@ def proportion_in_target_languages(
Returns:
float: The proportion of text in the target language(s).
"""
tokens = self._token_sampler.sample_text(text)
if len(tokens) == 0:
return 0

in_target = 0
for token in tokens:
for lang_code in self._lang:
candidate = self._lemmatization_strategy.get_lemma(token, lang_code)
if candidate is not None:
in_target += 1
break
return in_target / len(tokens)
return sum(
percentage
for (
lang_code,
percentage,
) in self.proportion_in_each_language(text).items()
if lang_code != "unk"
)

def main_language(
self,
Expand Down
9 changes: 0 additions & 9 deletions tests/test_language_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,15 +108,6 @@ def test_in_target_language() -> None:
== 1.0
)

langs = ("en", "de")
text = "It was a true gift"
assert (
LanguageDetector(lang=langs).proportion_in_target_languages(text)
== in_target_language(text, lang=langs)
== 1.0
)
in_target_language("It was a true gift", lang=("en", "de"))


def test_main_language():
text = "Dieser Satz ist auf Deutsch."
Expand Down
15 changes: 15 additions & 0 deletions training/eval/README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Instructions to run the evaluation
----------------------------------

The scores are calculated on `Universal Dependencies <https://universaldependencies.org/>`_ treebanks on single word tokens (including some contractions but not merged prepositions). They can be reproduced by the following steps:

1. Update ``DATA_URL`` in ``download-eval-data.py`` to point to the latest treebanks archive from `Universal Dependencies <https://universaldependencies.org/#download>`_ (or the version that you wish to use).
2. Run ``python3 download-eval-data.py`` which will
1. Download the archive
2. Extract relevant data (language and if applicable specific treebank, see notes in the results table)
3. Concatenate the train, dev and test data into a single file (e.g. ``cat de_gsd*.conllu > de-gsd-all.conllu``)
4. Store the files at the expected location (``data/UD/``)
5. Install the evaluation dependencies (``pip install -r eval-requirements.txt``)
6. Run the script, e.g. from the home directory ``python3 evaluate_simplema.py``
7. Results are stored at ``data/results/results_summary.csv``. Also, errors are written in a CSV file for each dataset under the ``data/results`` folder.
58 changes: 58 additions & 0 deletions training/eval/download-eval-data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from typing import Iterable, List, Tuple
from os import mkdir, path, scandir
import re
import logging
import tarfile
import requests
from glob import glob

from simplemma.strategies.dictionaries.dictionary_factory import SUPPORTED_LANGUAGES

# Module-level logger; the root logger is configured at INFO so the
# progress messages emitted by this script are displayed.
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
# Location of the Universal Dependencies treebanks archive to download.
DATA_URL = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5150/ud-treebanks-v2.12.tgz?sequence=1&isAllowed=y"
# Working folder: a "data" directory next to the folder containing this script.
DATA_FOLDER = path.join(path.dirname(__file__), "..", "data")
# Local path where the downloaded archive is saved.
DATA_FILE = path.join(DATA_FOLDER, "ud-treeebanks.tgz")
# Folder that receives one concatenated .conllu file per selected dataset.
CLEAN_DATA_FOLDER = path.join(DATA_FOLDER, "UD")

def get_dirs(file_name: str) -> List[str]:
    """Return the names of the directories directly inside ``file_name``.

    Args:
        file_name: Path of the directory to scan (despite the parameter name).

    Returns:
        The bare entry names (not full paths) of every sub-directory, in the
        order reported by ``os.scandir``.
    """
    folder_names: List[str] = []
    for entry in scandir(file_name):
        if entry.is_dir():
            folder_names.append(entry.name)
    return folder_names

def get_files(file_name: str) -> List[str]:
    """Return the names of the regular files directly inside ``file_name``.

    Args:
        file_name: Path of the directory to scan (despite the parameter name).

    Returns:
        The bare entry names (not full paths) of every file, in the order
        reported by ``os.scandir``.
    """
    file_entries = (entry for entry in scandir(file_name) if entry.is_file())
    return [entry.name for entry in file_entries]

def get_relevant_language_data_folders(data_folder) -> Iterable[Tuple[str, str, str]]:
    """Yield the treebank folders whose language is supported.

    Every sub-directory of ``data_folder`` is inspected. The dataset name is
    the part of a ``.conllu`` file name that precedes ``-ud`` and the language
    code is the part of the dataset name that precedes the first underscore.

    Args:
        data_folder: Directory containing one sub-directory per treebank.

    Yields:
        ``(lang, dataset_name, lang_data_folder)`` tuples for each
        sub-directory whose language code is in ``SUPPORTED_LANGUAGES``.
    """
    for lang_folder in get_dirs(data_folder):
        # Build the path from the argument, not from a module-level global,
        # so the function works for any folder it is given.
        lang_data_folder = path.join(data_folder, lang_folder)
        conllu_files = sorted(glob(path.join(lang_data_folder, "*.conllu")))
        if not conllu_files:
            # Folder without any treebank file: nothing to evaluate on.
            continue

        # Match on the file name only so the path separator is irrelevant.
        match = re.search(r"^(.*)-ud.*$", path.basename(conllu_files[0]))
        if match is None:
            continue
        dataset_name = match.group(1)
        lang = dataset_name.split("_")[0]

        if lang in SUPPORTED_LANGUAGES:
            yield (lang, dataset_name, lang_data_folder)

# Refuse to run on top of a previous download so stale and fresh data never mix.
if path.exists(DATA_FOLDER) or path.exists(CLEAN_DATA_FOLDER):
    raise Exception("Data folder seems to be already present. Delete it before creating new data.")

mkdir(DATA_FOLDER)
mkdir(CLEAN_DATA_FOLDER)

log.info("Downloading evaluation data...")
# Stream the archive to disk in chunks instead of buffering it all in memory,
# and fail early on an HTTP error rather than saving an error page as the archive.
with requests.get(DATA_URL, stream=True) as response:
    response.raise_for_status()
    with open(DATA_FILE, "wb") as archive_file:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            archive_file.write(chunk)

log.info("Uncompressing evaluation data...")
# ``extractall`` returns None, so the archive is closed by the context manager.
with tarfile.open(DATA_FILE) as archive:
    archive.extractall(DATA_FOLDER)
# glob already returns paths that include DATA_FOLDER; do not join it again.
extracted_folders = glob(path.join(DATA_FOLDER, "ud-treebanks-*"))
if not extracted_folders:
    raise Exception("No 'ud-treebanks-*' folder found after extracting the archive.")
uncompressed_data_folder = extracted_folders[0]

log.info("Filtering files...")
for (lang, dataset_name, dataset__folder) in get_relevant_language_data_folders(uncompressed_data_folder):
    log.info(lang + " - " + dataset__folder)
    # Concatenate the train, dev and test data into a single file (e.g. ``cat de_gsd*.conllu > de-gsd-all.conllu``)
    lang_clean_data_file = path.join(CLEAN_DATA_FOLDER, f"{dataset_name}.conllu")
    log.debug(f"Procressing data for {dataset_name}")
    # The files are read and written explicitly as UTF-8 so the result does
    # not depend on the platform default encoding.
    with open(lang_clean_data_file, "w", encoding="utf-8") as outfile:
        # Sorted so the concatenation order is reproducible across runs.
        for file in sorted(glob(path.join(dataset__folder, "*.conllu"))):
            with open(file, encoding="utf-8") as infile:
                outfile.writelines(infile)
File renamed without changes.
Loading

0 comments on commit fb4dade

Please sign in to comment.