From c4b25cb4d1fc1a535c0f99cef37443cbe5aa7e4c Mon Sep 17 00:00:00 2001 From: Peter Stanchev Date: Tue, 4 Feb 2020 10:05:24 +0100 Subject: [PATCH] Speed improvement and addition of C++ module --- CharacTER.py | 229 ++++++++++++++++----------------------------------- ed.cpp | 39 +++++++++ libED.so | Bin 0 -> 16792 bytes 3 files changed, 111 insertions(+), 157 deletions(-) create mode 100755 ed.cpp create mode 100755 libED.so diff --git a/CharacTER.py b/CharacTER.py index 0ac4265..b509853 100755 --- a/CharacTER.py +++ b/CharacTER.py @@ -1,9 +1,9 @@ -#! /usr/bin/env python2 +#! /usr/bin/env python3 # -*- coding:utf-8 -*- """ This program is free software: you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free Software +the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. @@ -15,13 +15,12 @@ this program. If not, see . """ -from __future__ import division, print_function -import re -import codecs +# from math import sqrt #needed for aditional statistics +import os import sys +import ctypes +import argparse import itertools -import math -import time import Levenshtein try: from itertools import izip as zip @@ -29,34 +28,74 @@ pass +""" +Class which allows a more efficient way of computing the edit distance on a changing hypothesis. +Stores the C++ wrapper for the actual edit distance computation in self.ed_wrapper. +The reference is stored and converted to a sequence of integers on initialisation in self.ref via _word_to_num(). +Since the hypothesis changes after each shift it is added on call __call__(). +""" +class EditDistance(): + def __init__(self, ref, ed): + self.dic = {} + self.i = 0 + self.ed_wrapper = ed + self.ref = self._word_to_num(ref) + + def __call__(self, hyp): + return self._edit_distance(hyp) + + # Calls the C++ implementation of the edit distance + def _edit_distance(self, hyp): + hyp_c = (ctypes.c_ulonglong * len(hyp))() + ref_c = (ctypes.c_ulonglong * len(self.ref))() + hyp_c[:] = self._word_to_num(hyp) + ref_c[:] = self.ref + norm = len(ref_c) + result = self.ed_wrapper.wrapper(hyp_c, ref_c, len(hyp_c), len(ref_c), norm) + return result + + # Converts a sequence of words into a sequence of numbers. + # Each (unique) word is allocated a unique integer. + def _word_to_num(self, words): + res = [] + for word in words: + if word in self.dic: + res.append(self.dic[word]) + else: + self.dic[word] = self.i + res.append(self.dic[word]) + self.i = self.i + 1 + return res + + # Character error rate calculator, both hyp and ref are word lists -def cer(hyp, ref): - hyp_words, ref_words = list(hyp), list(ref) - ed_calc = CachedEditDistance(ref_words) +def cer(hyp_words, ref_words, ed_wrapper): hyp_backup = hyp_words - + edit_distance = EditDistance(ref_words, ed_wrapper) + pre_score = edit_distance(hyp_words) + if pre_score == 0: + return 0 """ Shifting phrases of the hypothesis sentence until the edit distance from the reference sentence is minimized """ while True: - diff, new_words = shifter(hyp_words, ref_words, ed_calc) - + diff, new_words = shifter(hyp_words, ref_words, pre_score, edit_distance) if diff <= 0: break hyp_words = new_words - + pre_score = pre_score - diff + shift_cost = _shift_cost(hyp_words, hyp_backup) shifted_chars = " ".join(hyp_words) ref_chars = " ".join(ref_words) if len(shifted_chars) == 0: return 1.0 - else: - edit_cost = Levenshtein.distance(shifted_chars, ref_chars) + shift_cost - cer = min(1.0, edit_cost / len(shifted_chars)) - return cer + + edit_cost = Levenshtein.distance(shifted_chars, ref_chars) + shift_cost + return min(1.0, edit_cost / len(shifted_chars)) """ @@ -66,18 +105,14 @@ def cer(hyp, ref): return the difference of edit distances between before and after shifting, and the shifted version of the hypothesis sentence. """ +def shifter(hyp_words, ref_words, pre_score, edit_distance): - -def shifter(hyp_words, ref_words, ed_calc): - pre_score = ed_calc(hyp_words) scores = [] - # Changing the phrase order of the hypothesis sentence for hyp_start, ref_start, length in couple_discoverer(hyp_words, ref_words): shifted_words = hyp_words[:hyp_start] + hyp_words[hyp_start+length:] shifted_words[ref_start:ref_start] = hyp_words[hyp_start:hyp_start+length] - scores.append((pre_score - ed_calc(shifted_words), shifted_words)) - + scores.append((pre_score - edit_distance(shifted_words), shifted_words)) # The case that the phrase order has not to be changed if not scores: return 0, hyp_words @@ -91,8 +126,6 @@ def shifter(hyp_words, ref_words, ed_calc): and yield the corresponding begin positions in both sentences as well as the maximal phrase length. Both sentences are represented as word lists. """ - - def couple_discoverer(sentence_1, sentence_2): # Applying the cartesian product to traversing both sentences for start_1, start_2 in \ @@ -119,45 +152,11 @@ def couple_discoverer(sentence_1, sentence_2): yield (start_1, start_2, length) -# Identical to Levenshtein distance -def edit_distance(sentence_1, sentence_2): - - # Keep sentence_2 as the shorter sentence - if len(sentence_1) < len(sentence_2): - return edit_distance(sentence_2, sentence_1) - - """ - If one sentence does not contain any words, the edit distance should be the - length of the other sentence - """ - if len(sentence_2) == 0: - return len(sentence_1) - - previous_row = range(len(sentence_2) + 1) - - # Go through the first sentence - for i, character_1 in enumerate(sentence_1): - current_row = [i+1] - - # Go through the second sentence and check the Levenshtein distance - for j, character_2 in enumerate(sentence_2): - insertions = previous_row[j + 1] + 1 - deletions = current_row[j] + 1 - substitutions = previous_row[j] + (character_1 != character_2) - current_row.append(min(insertions, deletions, substitutions)) - - previous_row = current_row - - return previous_row[-1] - - """ Shift cost: the average word length of the shifted phrase shifted_words: list of words in the shifted hypothesis sequence original_words: list of words in the original hypothesis sequence """ - - def _shift_cost(shifted_words, original_words): shift_cost = 0 original_start = 0 @@ -212,95 +211,12 @@ def _shift_cost(shifted_words, original_words): shift_cost += avg_shifted_charaters original_start += 1 - - return shift_cost - -""" -Function to calculate the number of edits (The same as TER): -1. Dynamic programming for calcualting edit distance -2. Greedy search to find the shift which most reduces minimum edit distance -Python code copyright (c) 2011 Hiroyuki Tanaka -""" - - -class CachedEditDistance(object): - - def __init__(self, rwords): - self.rwds = rwords - self._cache = {} - self.list_for_copy = [0 for _ in range(len(self.rwds) + 1)] - - def __call__(self, iwords): - start_position, cached_score = self._find_cache(iwords) - score, newly_created_matrix = \ - self._edit_distance(iwords, start_position, cached_score) - self._add_cache(iwords, newly_created_matrix) - return score - - def _edit_distance(self, iwords, spos, cache): - - if cache is None: - cache = [tuple(range(len(self.rwds) + 1))] - else: - cache = [cache] - - l = cache + [list(self.list_for_copy) - for _ in range(len(iwords) - spos)] - assert len(l) - 1 == len(iwords) - spos - - for i, j in itertools.product(range(1, len(iwords) - spos + 1), - range(len(self.rwds) + 1)): - - if j == 0: - l[i][j] = l[i - 1][j] + 1 - else: - l[i][j] = min(l[i - 1][j] + 1, - l[i][j - 1] + 1, - l[i - 1][j - 1] + (0 if iwords[spos + i - 1] == - self.rwds[j - 1] else 1)) - - return l[-1][-1], l[1:] - - def _add_cache(self, iwords, mat): - node = self._cache - skipnum = len(iwords) - len(mat) - - for i in range(skipnum): - node = node[iwords[i]][0] - - assert len(iwords[skipnum:]) == len(mat) - - for word, row in zip(iwords[skipnum:], mat): - - if word not in node: - node[word] = [{}, None] - - value = node[word] - - if value[1] is None: - value[1] = tuple(row) - - node = value[0] - - def _find_cache(self, iwords): - node = self._cache - start_position, row = 0, None - - for idx, word in enumerate(iwords): - - if word in node: - start_position = idx + 1 - node, row = node[word] - else: - break - - return start_position, row + return shift_cost # Parsing arguments def parse_args(): - import argparse parser = argparse.ArgumentParser( description='CharacTER: Character Level Translation Edit Rate', epilog="Please apply 'PYTHONIOENCODING' in environment variables, " @@ -315,9 +231,8 @@ def parse_args(): def main(): args = parse_args() - hyp_lines = [x for x in codecs.open(args.hyp, 'r', 'utf-8').readlines()] - ref_lines = [x for x in codecs.open(args.ref, 'r', 'utf-8').readlines()] - + hyp_lines = [x for x in open(args.hyp, 'r')] + ref_lines = [x for x in open(args.ref, 'r')] """ Check whether the hypothesis and reference files have the same number of sentences @@ -327,27 +242,27 @@ def main(): " reference file.".format(len(hyp_lines), len(ref_lines))) sys.exit(1) + # Initialise the connection to C++ + ed_wrapper = ctypes.CDLL(os.path.dirname(os.path.abspath(__file__)) + '/libED.so') + ed_wrapper.wrapper.restype = ctypes.c_float + scores = [] # Split the hypothesis and reference sentences into word lists - for index, (hyp, ref) in \ - enumerate(zip(hyp_lines, ref_lines), start=1): + for index, (hyp, ref) in enumerate(zip(hyp_lines, ref_lines), start=1): ref, hyp = ref.split(), hyp.split() - score = cer(hyp, ref) + score = cer(hyp, ref, ed_wrapper) scores.append(score) - # Print out scores of every sentence if args.verbose: print("CharacTER of sentence {0} is {1:.4f}".format(index, score)) average = sum(scores) / len(scores) - variance = sum((s - average) ** 2 for s in scores) / len(scores) - standard_deviation = math.sqrt(variance) + # variance = sum((s - average) ** 2 for s in scores) / len(scores) + # standard_deviation = sqrt(variance) print(average) if __name__ == '__main__': - # start_time = time.time() main() - # end_time = time.time() - # print(end_time-start_time) - + sys.exit(0) + diff --git a/ed.cpp b/ed.cpp new file mode 100755 index 0000000..79d776f --- /dev/null +++ b/ed.cpp @@ -0,0 +1,39 @@ +#include +#include +#include + +using namespace std; + +float ED(const std::vector hyp, const std::vector ref, const int norm){ + + std::vector row(hyp.size() + 1, 1); + for(float i = 0; i < row.size(); ++i){ + row[i] = i; + } + std::vector nextRow(hyp.size() + 1, std::numeric_limits::max()); + + + for(int w = 1; w < ref.size() + 1; ++w){ + for(int i = 0; i < hyp.size() + 1; ++i){ + if(i > 0){ + nextRow[i] = std::min({nextRow[i-1] + 1, row[i-1] + (ref[w-1] != hyp[i-1]), row[i]+ 1}); + } + else{ + nextRow[i] = row[i]+ 1.0; + } + } + row = nextRow; + nextRow.assign(nextRow.size() ,std::numeric_limits::max()); + } + + float errors = row[row.size()-1]; + return (errors)/(norm); +} + +//C wrapper for the C++ implementation. Communication channel with Python. +extern "C" float wrapper(const unsigned long long* hyp, const unsigned long long* ref, const int len_h, const int len_r, const int norm){ + std::vector hyp_vec(hyp,hyp+len_h); + std::vector ref_vec(ref,ref+len_r); + + return ED(hyp_vec, ref_vec, norm); +} \ No newline at end of file diff --git a/libED.so b/libED.so new file mode 100755 index 0000000000000000000000000000000000000000..2cc08543954efd558597f71ffd4559567b2fa4af GIT binary patch literal 16792 zcmeHOdw5gFl^?ypG{#p>ozzXBC|!|C4X8HQ1gD``wgvZ+D{!!BToy+WmJROuLG5ev zSlYOijiYtdx=A1VZMR#x*>t~czi-QykKH!iv^Ebb*^&(*O-tS;FC+-DQzt-zvG#Z7 z-mzra?w5Uh`4_ zHJgw46YOSAYfBYP%YE$=I!dIF%n?+_JUqk8If&H3Ga}t%*703ND)UE<5-o+wH?$DthZk{hS}N#8D){aOAGPzt2D?GHS=2X+ zUL1x#L~3-0_bm@HEO^#$+`^vgd$K$Gqa*kKWZC(>-`n!JM;0r)a;XX`Ux-KO|A_&| zSDAHS0pX|(*W)Vr1y`r`wAQ!FCsg~=d+p&{KpLXOuRmJ)AfHN1YVC3KWNym zke>xP*RyQKZqhM75c4b$*wNhD5@^?gZCW6}0*x(=8VfWaVF9Ju9|+aA)$eF**XrA< z{pC%qE%nvG?M?N1*~}$@x;;S>5NvAP2f_`Vfz5`5@}^*Wdwn}=YTVwgh3am*&C}lM zSw-NEy1GC+L4^d>>G;6b6~2lpZB=J|oz~i>?5zqadwssD!T_4KwPklR;!sm%Cu^>6 zuG_U2P?c7Admy0gY-`;e*d7c8f=x}Wb)77*rDb;@@O=AQ--SDw$6S2;8SA*I5td|SD*;m= z{tl!>@4%zEAB5|(u(gxrX4Y+#BVm=d?!Ato=DQ4e!N9GMXJ%Cfz3{`re~ucNxZ!UF z{*aN84LcQh0iXGdgR0{`TkkUm*>DW(mN z!si=!l99e{Inwat3q^3H;f<-wiwe^49}&bzSEu2j;K@*$hR3X%408CbEV<9-cR85c zGoabxe~BVFd_3!zPLeF%j-uq!5MpUbDRKnwq{75-5@G39f~lK{Qykt*Fm*XGz~K!9 zlj;+PI9x_BsXTFz!*>u&sz~&6_~@q`tf4=-^m(4j(KUr2kfSzAw#e=)a^wv;d?lyz z1~mJ}prK|js7($xNJ}aht35`oZ16}+$lB1A=$dzkC|Z0T3HT?7e{jMgM@Hqr_*&UC zBtJc&t>kr$Qlag)@&NGwh>56%!jGKkrO~>Yb8KhV?Nj&M_vQct~r6U z#e8rW961hTo+d%iRpjzxKI|sC*7{3MYO>#bQg#2uQiS9G0Q7f&= zOzM@;NI5)V^2T6iU2C-)0hqn9a^5s83pw5%f8??A4u zjg?8cO+R2`Qq2Xhb-Eng=16ccAxS0T#~x`(GumC`j92j z;s2C+UxIQsM)QtPWoqOPDh4Mj{`2u(0Te}lj zYpgt5V6Opd*KP&Y7Av<4>^QJ_nw^AQMK(WHp2v$yhNSM>;TaV7km5eBL^qDhFg4hf zS&e#48>87h)IsS0?a|bzGc37^AA>oB&(2rSZ11ngsdy!Kk&}krE*UgO?f>0;8eiL}zvN73 zF)&$5_6wehH*atA3oM9~06rzjexh?sW&R&_t+FL`G}wCEd}Lk=fRv3aB^Ef0Bf zY#kpHZGLPW9|juie43aprO%h8+~#BQ;1!D;9x&0UN(a7ggWCG`JMAibfx}mrB+ti9 znao(}%TgmDXO`kVN#+c#C#CLx#{3woknFZ%cm*0SQ{idAH9qDIzixq>xCS*rwMi3Z zL-N-&04UD2NCz}<`g>lNx=%op5C&&M!$E_oZT0v0QE@IwUiboRw?cBgLSnBY%oet( zB(G3gK>5Xwp>)hA)jUJc(w`9ox_YY|9zw5pPpF4Z_eHQvcJU%KXq5>?fBrvRnqA;LF!_33hr-^#^DdNDbXe-9&A-2|7OGNBS=6M37va@pmBxUd1OfHnc_Lr>Yl1);dIU9m-t6l+P3?b^pMos2r|BlN$p?MJ`GNwPjxyJ z_sg<7;g7s4558~4lr}3RC#BwHFe(`|cG>;59JwrGeN#%trQYAsP+`9I+yyVOF}kf7 ztHCxhZW@)V7G(4&|i@|#umpc+%a}GWx^eo>8qmQg#FPS zs4!*{&R8uOmU??WNG8?3ChVcy7nQ+lW`AU)!6%J8LB^^+HZJ$PfZ_kN>bw)Cc<&RK zPO39Y!%F^2{HO0?S?{?>z)1Y>0Kn2Ju*ipx%YR<$-R7c~i{Aq*P}I!p(M&ah2FCw5MqNhkuIMssC7YZMim(`+NPPDi z*>M*Q1-Gr+slC4u6>|F=rNs*sz6dE0t(bh3Sm^l~5fyN}W;3L_kWO?&_&+yotwdM^hGTvX10fx`b_m@1bL^seDtL6G=;J?63TT&a(h!) z_uf?AwR}BiYD_~dF5*mACHEInSTLdK?J8u$J`-Iu!LmVzIpqc5_V!bBkfK8r4N!E7 zqG5`Nxedpm3hcZm&u6OpWq-+Qn1O#oGjJkp22Mhp(t;oFG-o*oMw5-e?r3zrSg%jZ}ARY;9I zuSU*tZfXg!97q{X zf{4vpC1Q)V7O_?HAhv0X5zo`I;W)wQc$5r|^5LVfxd90J@Yxhk=_uOKzMQsvayZ^4 zxl8Hk$8!+RAv^Qk(c3n5+U+nk#mVEq@-Yacf|@XVkcer0>LhVQ=0FD z7X)WwC&o0MH$w8T+oBh=WVllIH}#xZ(*Rqvt%$AKdc-!(jTqAwv!K+D)yLuGTEsa7 zXA!8-we$o$B>cgBm}~-9{4c;^iVXq3YZC7xb)<+EeqCpjCvo6dxEn|iPC?$VT}M`8 z8a^W(Fu$Knf^@M8EX3g?6Yl~`4UhUMcaWk(6b(>xilSkPbOD2yR2LqLXTME)ctMH8 z^yTcui=@-5lPlUuz_iR+K^K381II3$jX#SV_XKj?Y4M6Lt5cO0LEO)8OVw1R#eg82eAq~*ww^J)aUqFDU9RPo?z zcv12aR%eU9=c3g8AjuKuZpe93>P-*|ja|=$Ttug&o(nXp!Z`SPxXtN64b-Z9Dkz{P zQ-)so^Gqpa^nJJ@UdAk;Wuv#1S1=NP^DVR|y2!=lKm<9n)im`>jaFZxRu6*;`!%bK zeU4qmIs>|G&eib;h#O0M0o)t30+)sNc6A2Qu7lM8X=>vTn2fLIz$F4M$L|2Z_ZT>I z9j{5-W9Z|7y(FAh4DK&}ABhyp_L7tm&{yt@l#*+NbWnG9H=fY$ zt2rG~_v5rN(On}vD_+CJ;>iot5%P{dMIo;rlP?kFfbOq-g_ja#X>j2{;uygAw}~}! zVzP@Nz<8aackv!xI1>LR()bs3O1>u|-JFcC2Ij(v_}zNJg~#LTDMt#0L?U@6egl!5 zM8Ej^(eUK!P++>&A`0L!0>P<=b_nT@>1NAkouI@tV*}@t$x@`H(9#1)9dP@@NGZ^N zXB15PHZ#*6yXofa`B`*oBwqUKWDj_E5kZc9T~7W6>AKxnU2N@5pZ)Yo=O+m*{$4Td zt5ms!`QLx*4LHZ}KyvrvS&ik5{=>1-`(xh>av|fu zJcZec5ZIdl&W$&46Z(7-5vn^s{Pg+|4-O66HqVTMh&>vWdq}v9>T$t0qoCfAJ zFsFez4a{j^P6KlqnA55r$iu4*9_X7v3If*O`lMj` z29W~(dgVcU--~~$c@W=~e%ye?cY`+@Fuon&!T8LSiSGx+`w~(9-bAvM2uF>Eh!+EP ztIm&q40tG@*WU=@o4ZznXr$k z_$(&uXexgC_t{i#fi)jA^f{IOI%XG!msViGY!vr1($J8zaN2j?RBmB8?2wpnX$%%7 zP2OinLxbMVvKFUf^t^@TvPH&yj`UIsz9bj(ek%T>lkKN2$9^epgnKP1y`&jL^PH!|;& zj2rZs_eo4x0B6#}r+~+(Pm}MNh2A?0{tL6Ze`CgYxo#_==w!@4%DUF&FQ@YLbb z#fI|D0adB0#zmCWy_0~}9H^t4C+(mJwFY)HwQdhK1wyzJ(jEwQ>|u4S&AXcFwfd0f z_TrMFnThy~mO!wrEw~r=IkdLDtf4K~TptK^G&kewNFWX3S74@)1_J9gd;PwEZ({{r zk>Sl0d0f5<_+*1XuGq{1>(xzVUNx|3-MT7Yb)ec?ruv8p*JmaLgxXsJJA*ADT%@VE z$J18d;FxOEB|-bdjqzky~UWJKQxC-$i#73UaHzu*`BZ!!4Gh#5ibXGJQ`QN&MYr%e7mz>!YG zxlNoKMVh(eqPC`ob^z)8Vt*`B{$G3=1O8IyL8M;+l+G{q(UYk`FA~vy0rzI`I}8Gm zepwtK4F*xK;HOUz)9V-KWRZ&VqoB|1|3e197=N*U7iocDG}6razn{TBZ0!F<>dfHJ ztpA@f_{DiZq*)lKl&m0Uj;AOzTg+<8gXxZ*yym(yJ)`{$FF77 zFU~(_K7#@Tm!3?#oQA( lO8Z5F497;tWLm34e}w;}`7Qd6>}2R)rDykL2+wE^`!Ai5or(Ye literal 0 HcmV?d00001