Skip to content

Commit

Permalink
Merge pull request #43 from Helsinki-NLP/fix-opusfilter-interface
Browse files Browse the repository at this point in the history
Fix opusfilter interface
  • Loading branch information
miau1 authored Aug 8, 2024
2 parents 8eef88c + b6180fb commit 9355ba7
Show file tree
Hide file tree
Showing 6 changed files with 1,393 additions and 1,249 deletions.
5 changes: 2 additions & 3 deletions opustools_pkg/opustools/opus_file_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def download_files(self):
og = OpusGet(**arguments)
og.get_files()

def open_moses_files(self):
def open_moses_files(self, outpath=None):
moses_zip_name = os.path.join(self.download_dir, f'{self.directory}_{self.release}_moses_'
f'{self.fromto[0]}-{self.fromto[1]}.txt.zip')
if not os.path.isfile(moses_zip_name):
Expand All @@ -47,7 +47,7 @@ def open_moses_files(self):
ret_file_names = []
for fn in moses_zip.filelist:
if fn.filename.split('.')[-1] in self.fromto:
moses_zip.extract(fn.filename)
moses_zip.extract(fn.filename, path=outpath)
ret_file_names.append(fn.filename)
moses_zip.close()
return sorted(ret_file_names)
Expand Down Expand Up @@ -150,4 +150,3 @@ def close_zipfiles(self):
if self.zip_opened:
self.src_zip.close()
self.trg_zip.close()

209 changes: 115 additions & 94 deletions opustools_pkg/opustools/opus_read.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,28 @@
import logging
import os
import shutil
import re
import sys
import tempfile

from .parse.alignment_parser import AlignmentParser
from .parse.sentence_parser import SentenceParser, SentenceParserError
from .util import file_open
from .formatting import *
from .opus_file_handler import OpusFileHandler


logger = logging.getLogger(__name__)


def skip_regex_type(n, N):
"Select function to skip document names"

def get_re(doc_name):
return not re.search(n, doc_name)

def skip_re(doc_name):
return re.search(N, doc_name)

def nothing(doc_name):
return False

Expand All @@ -27,7 +35,8 @@ def nothing(doc_name):

class OpusRead:

def __init__(self, directory=None, source=None, target=None,
def __init__(
self, directory=None, source=None, target=None,
release='latest', preprocess='xml', maximum=-1, src_range='all',
tgt_range='all', attribute=None, threshold=None,
leave_non_alignments_out=False, write=None, write_mode='normal',
Expand All @@ -39,7 +48,8 @@ def __init__(self, directory=None, source=None, target=None,
change_annotation_delimiter='|',
src_cld2=None, trg_cld2=None, src_langid=None, trg_langid=None,
write_ids=None, suppress_prompts=False, download_dir='.',
preserve_inline_tags=False, n=None, N=None, chunk_size=1000000, verbose=False):
preserve_inline_tags=False, n=None, N=None, chunk_size=1000000,
verbose=False):
"""Read xces alignment files and xml sentence files and output in
desired format.
Expand Down Expand Up @@ -83,7 +93,8 @@ def __init__(self, directory=None, source=None, target=None,
preserve_inline_tags -- Preserve inline tags within sentences
n -- Get only documents that match the regex
N -- Skip all documents that match the regex
chunk_size -- Number of sentence pairs in chunks to be processed (default 1000000)
chunk_size -- Number of sentence pairs in chunks to be processed
(default 1000000)
verbose -- Print progress messages
"""

Expand All @@ -99,64 +110,51 @@ def __init__(self, directory=None, source=None, target=None,
self.verbose = True

if self.switch_langs:
temp = src_range
src_range = tgt_range
tgt_range = temp
temp = src_cld2
src_cld2 = trg_cld2
trg_cld2 = temp
temp = src_langid
src_langid = trg_langid
trg_langid = temp
temp = source_zip
source_zip = target_zip
target_zip = temp
temp = source_annotations.copy()
source_annotations = target_annotations.copy()
target_annotations = temp.copy()
src_range, tgt_range = tgt_range, src_range
src_cld2, trg_cld2 = trg_cld2, src_cld2
src_langid, trg_langid = trg_langid, src_langid
source_zip, target_zip = target_zip, source_zip
source_annotations, target_annotations = \
target_annotations.copy(), source_annotations.copy()

lang_filters = [src_cld2, src_langid, trg_cld2, trg_langid]

default_alignment = os.path.join(root_directory, directory, release,
'xml', self.fromto[0]+'-'+self.fromto[1]+'.xml.gz')
default_alignment = os.path.join(
root_directory, directory, release, 'xml',
self.fromto[0]+'-'+self.fromto[1]+'.xml.gz')
if alignment_file == -1:
self.alignment = default_alignment
else:
self.alignment = alignment_file

dl_prefix = directory + '_' + release + '_' + preprocess + '_'
if not source_zip:
dl_src_zip = os.path.join(download_dir, directory+'_'+release+'_'+
preprocess+'_'+self.fromto[0]+'.zip')
dl_src_zip = os.path.join(
download_dir, dl_prefix + self.fromto[0] + '.zip')
if os.path.isfile(dl_src_zip):
source_zip = dl_src_zip
else:
source_zip = os.path.join(root_directory, directory, release,
source_zip = os.path.join(
root_directory, directory, release,
preprocess, self.fromto[0]+'.zip')
if not target_zip:
dl_trg_zip = os.path.join(download_dir, directory+'_'+release+'_'+
preprocess+'_'+self.fromto[1]+'.zip')
dl_trg_zip = os.path.join(
download_dir, dl_prefix + self.fromto[1] + '.zip')
if os.path.isfile(dl_trg_zip):
target_zip = dl_trg_zip
else:
target_zip = os.path.join(root_directory, directory, release,
target_zip = os.path.join(
root_directory, directory, release,
preprocess, self.fromto[1]+'.zip')

self.resultfile = None
self.mosessrc = None
self.mosestrg = None

self.id_file = None
if write_ids:
self.id_file = file_open(write_ids, 'w', encoding='utf-8')

self.write_mode = write_mode
self.write = write
self.maximum = maximum
self.preprocess = preprocess
if print_annotations:
self.preprocess = 'parsed'

self.write_ids=write_ids
self.write_ids = write_ids

self.preserve = preserve_inline_tags

Expand Down Expand Up @@ -192,44 +190,67 @@ def __init__(self, directory=None, source=None, target=None,
preprocess, self.fromto, suppress_prompts)

if preprocess == 'moses':
# If preprocessing is moses, download
moses_names = self.of_handler.open_moses_files()
if self.write:
if len(self.write) == 2:
if not self.switch_langs:
shutil.move(moses_names[0], os.path.join(download_dir, self.write[0]))
shutil.move(moses_names[1], os.path.join(download_dir, self.write[1]))
else:
shutil.move(moses_names[0], os.path.join(download_dir, self.write[1]))
shutil.move(moses_names[1], os.path.join(download_dir, self.write[0]))
moses_names = self.write
else:
print('"moses" preprocessing requires two output file names. Using default names.')
else:
shutil.move(moses_names[0], os.path.join(download_dir, moses_names[0]))
shutil.move(moses_names[1], os.path.join(download_dir, moses_names[1]))
print(f'Moses files written to {", ".join([download_dir+"/"+n for n in moses_names])}')
exit()

if write:
if write_mode == 'moses' and len(write) == 2:
self.mosessrc = file_open(write[0], mode='w', encoding='utf-8')
self.mosestrg = file_open(write[1], mode='w', encoding='utf-8')
else:
self.resultfile = file_open(write[0], mode='w', encoding='utf-8')
if self.write_mode != 'moses':
logger.warning("Only moses write_mode is supported for moses preprocessing. "
"Ignoring write_mode %s.", self.write_mode)
self.write_mode = 'moses'
return

store_attrs = False
if write_mode == "links" or write_ids != None:
if write_mode == "links" or write_ids is not None:
store_attrs = True

self.alignment = self.of_handler.open_alignment_file(self.alignment)
self.alignmentParser = AlignmentParser(self.alignment,
(src_range, tgt_range), attribute, threshold, store_attrs,
leave_non_alignments_out)
self.alignmentParser = AlignmentParser(
self.alignment, (src_range, tgt_range), attribute, threshold,
store_attrs, leave_non_alignments_out)

def printPairs(self):
logger.debug("printPairs called!")
resultfile = None
mosessrc = None
mosestrg = None
id_file = None

self.add_file_header(self.resultfile)
if self.write_ids:
id_file = file_open(self.write_ids, 'w', encoding='utf-8')

if self.write:
if self.write_mode == 'moses' and len(self.write) == 2:
mosessrc = file_open(self.write[0], mode='w', encoding='utf-8')
mosestrg = file_open(self.write[1], mode='w', encoding='utf-8')
else:
resultfile = file_open(self.write[0], mode='w', encoding='utf-8')

if self.preprocess == 'moses':
# If preprocessing is moses, download
if not self.write or len(self.write) != 2:
# Write to current path and return
if self.write and len(self.write) != 2:
resultfile.close()
logger.warning('"moses" preprocessing requires two output '
'file names. Using default names.')
moses_names = self.of_handler.open_moses_files(
outpath=self.of_handler.download_dir)
logger.info('Moses files written to %s', ', '.join(moses_names))
return
with tempfile.TemporaryDirectory() as tmpdir:
# Write to specified files
logger.info('Extracting data...')
moses_names = self.of_handler.open_moses_files(outpath=tmpdir)
with file_open(os.path.join(tmpdir, moses_names[0])) as in1, \
file_open(os.path.join(tmpdir, moses_names[1])) as in2:
if self.switch_langs:
in1, in2 = in2, in1
for fin, fout in [(in1, mosessrc), (in2, mosestrg)]:
for line in fin:
fout.write(line)
mosessrc.close()
mosestrg.close()
logger.info('Moses files written to %s', ', '.join(self.write))
return

self.add_file_header(resultfile)

src_parser = None
trg_parser = None
Expand All @@ -248,13 +269,14 @@ def printPairs(self):
self.alignmentParser.collect_links(cur_pos, self.chunk_size, self.verbose)

if src_doc_name != prev_src_doc_name:
src_doc_size = -1
src_doc_size = -1
prev_src_doc_name = src_doc_name
if trg_doc_name != prev_trg_doc_name:
trg_doc_size = -1
trg_doc_size = -1
prev_trg_doc_name = trg_doc_name

if self.verbose: print("")
if self.verbose:
print("", file=sys.stderr)

if not src_doc_name:
break
Expand All @@ -268,32 +290,32 @@ def printPairs(self):
src_doc = self.of_handler.open_sentence_file(src_doc_name, 'src')
trg_doc = self.of_handler.open_sentence_file(trg_doc_name, 'trg')
except KeyError as e:
print('\n'+e.args[0]+'\nContinuing from next sentence file pair.')
print('\n'+e.args[0]+'\nContinuing from next sentence file pair.', file=sys.stderr)
continue

try:
src_parser = SentenceParser(src_doc,
preprocessing=self.preprocess, anno_attrs=self.src_annot,
preserve=self.preserve, delimiter=self.annot_delimiter)
src_parser = SentenceParser(
src_doc, preprocessing=self.preprocess, anno_attrs=self.src_annot,
preserve=self.preserve, delimiter=self.annot_delimiter)
src_doc_size = src_parser.store_sentences(src_set, src_doc_size, self.verbose)
trg_parser = SentenceParser(trg_doc,
preprocessing=self.preprocess, anno_attrs=self.trg_annot,
preserve=self.preserve, delimiter=self.annot_delimiter)
trg_parser = SentenceParser(
trg_doc, preprocessing=self.preprocess, anno_attrs=self.trg_annot,
preserve=self.preserve, delimiter=self.annot_delimiter)
trg_doc_size = trg_parser.store_sentences(trg_set, trg_doc_size, self.verbose)
except SentenceParserError as e:
print('\n'+e.message+'\nContinuing from next sentence file pair.')
print('\n'+e.message+'\nContinuing from next sentence file pair.', file=sys.stderr)
continue

self.add_doc_names(src_doc_name, trg_doc_name,
self.resultfile, self.mosessrc, self.mosestrg)
self.add_doc_names(
src_doc_name, trg_doc_name, resultfile, mosessrc, mosestrg)

len_link_list = len(link_list)

for i, link_a in enumerate(link_list):
if self.verbose:
if i%1000==0 or i+1==len_link_list:
if i % 1000 == 0 or i + 1 == len_link_list:
progress = str(round((i+1)/len_link_list*100, 2))
print("\x1b[2KWriting chunk ... {}%".format(progress), end="\r")
print("\x1b[2KWriting chunk ... {}%".format(progress), end="\r", file=sys.stderr)

src_result, trg_result = self.format_pair(
link_a, src_parser, trg_parser, self.fromto)
Expand All @@ -303,39 +325,38 @@ def printPairs(self):

link_attr = attrs_list[i] if i < len(attrs_list) else None

self.out_put_pair(src_result, trg_result, self.resultfile,
self.mosessrc, self.mosestrg, link_attr, self.id_file,
src_doc_name, trg_doc_name)
self.out_put_pair(
src_result, trg_result, resultfile, mosessrc, mosestrg,
link_attr, id_file, src_doc_name, trg_doc_name)

total += 1
if total == self.maximum:
stop = True
break

self.add_doc_ending(self.resultfile)
self.add_doc_ending(resultfile)

if self.verbose and self.write:
print("\033[F\033[F\033[F", end="")
print("\033[F\033[F\033[F", end="", file=sys.stderr)

if stop:
break

if self.verbose and self.write:
print("\n\n")
print("\n\n", file=sys.stderr)

self.add_file_ending(self.resultfile)
self.add_file_ending(resultfile)

self.alignmentParser.bp.close_document()

if self.write:
if self.write_mode == 'moses' and self.mosessrc:
self.mosessrc.close()
self.mosestrg.close()
if self.write_mode == 'moses' and mosessrc:
mosessrc.close()
mosestrg.close()
else:
self.resultfile.close()
resultfile.close()

if self.write_ids:
self.id_file.close()
id_file.close()

self.of_handler.close_zipfiles()

Loading

0 comments on commit 9355ba7

Please sign in to comment.