-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbenchmark_langid.py
68 lines (58 loc) · 2.37 KB
/
benchmark_langid.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import py3langid as langid
import psutil
import os
import time
import math
import numpy as np
import sys
import pandas as pd
from language_dictionary import lang_dict
from tqdm.auto import tqdm
# Module-level setup; runs once at import time.
# Passing None as the pandas max-column-width option removes the width limit,
# so long text cells are shown in full when frames are printed.
pd.set_option("max_colwidth", None)
# Registers tqdm's progress_apply on pandas objects; BenchmarkLangid.__call__
# uses DataFrame.progress_apply below.
tqdm.pandas()
from typing import List
import logging
# getLogger() without a name returns the root logger, so the INFO level set
# here applies process-wide, not only to this module.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# NOTE(review): this formatter is not attached to any handler in the code
# visible here -- confirm whether a handler is expected to pick it up elsewhere.
formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')
class BenchmarkLangid():
    """Benchmark py3langid language detection on a labelled CSV dataset.

    Constructing the object performs one throw-away classification and
    records how much the process' resident memory grew across that call.
    Calling the instance classifies every row of the dataset and reports
    latency statistics, the recorded memory growth and the accuracy.
    """

    def __init__(self):
        """Dummy run to load the model and record the memory it takes."""
        process = psutil.Process(os.getpid())
        mem_before = process.memory_info().rss
        langid.classify("Hello World")
        mem_after = process.memory_info().rss
        # RSS delta in bytes around the first classify() call.
        self.mem_usage = mem_after - mem_before
        logger.info('Default model for Langid loaded ...')

    def _detect_language(self, row):
        """Classify the text of one dataset row and time the call.

        Args:
            row: Row exposing ``'Text'`` (the input string) and
                ``'language'`` (the expected label).

        Returns:
            A ``pd.Series`` holding, in order: the predicted label, the
            elapsed time in seconds, and whether the prediction equals the
            expected label.
        """
        text = row['Text']
        expected = row['language']
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring short durations; time.time() is wall-clock time and can
        # be too coarse (or jump) for sub-millisecond calls.
        started = time.perf_counter()
        pred, _ = langid.classify(text)
        elapsed = time.perf_counter() - started
        return pd.Series([pred, elapsed, expected == pred])

    def _summarize(self, time_taken, correct_predictions, total_predictions):
        """Build the one-row benchmark summary.

        Args:
            time_taken: Non-empty sequence of per-text latencies in seconds.
            correct_predictions: Number of rows predicted correctly.
            total_predictions: Number of rows classified (must be > 0).

        Returns:
            Dict with the algorithm name, mean/max/min/median latency, the
            memory growth recorded in ``__init__`` formatted in MiB, and
            the accuracy as a fraction.
        """
        MB = 1024 * 1024
        return {
            "algorithm": "Langid",
            "mean": np.mean(time_taken),
            "max": np.max(time_taken),
            "min": np.min(time_taken),
            "median": np.median(time_taken),
            "mem": str(round(self.mem_usage / MB, 2)) + " mb",
            "accuracy": correct_predictions / total_predictions,
        }

    def __call__(self, dataset_path: str = "data/dataset.csv",
                 output_path: str = "data/predictions_langid.csv") -> List[pd.DataFrame]:
        """Detect the language of every text and compute the benchmark.

        Args:
            dataset_path: CSV with a ``Text`` column and a ``language``
                column whose values are keys of ``lang_dict``.
            output_path: Where the per-row predictions CSV is written.

        Returns:
            A single-element list containing the one-row summary frame.

        Raises:
            ValueError: If the dataset contains no rows, since latency
                statistics and accuracy are undefined in that case.
            KeyError: If a ``language`` value is missing from ``lang_dict``.
        """
        logger.info('Benchmark for Langid started ...')
        df = pd.read_csv(dataset_path)
        if df.empty:
            # Fail early with a clear message instead of an obscure error
            # from the column assignment or the NumPy reductions below.
            raise ValueError("No rows to benchmark in %r" % (dataset_path,))

        # Translate dataset labels into the codes compared against predictions.
        df['language'] = df['language'].apply(lambda label: lang_dict[label])
        df[['pred_lang', 'time_taken', 'ismatch']] = df.progress_apply(
            self._detect_language, axis=1)

        correct_predictions = int(df['ismatch'].astype(bool).sum())
        summary = self._summarize(
            df['time_taken'].to_list(), correct_predictions, len(df))

        df.to_csv(output_path, index=False)
        summary_df = pd.DataFrame([summary])
        logger.info('Benchmark for Langid ended ...')
        # With the default output path this logs the same text as before.
        logger.info('See %s files...', os.path.basename(output_path))
        return [summary_df]