Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix opusfilter interface #43

Merged
merged 4 commits into from
Aug 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions opustools_pkg/opustools/opus_file_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def download_files(self):
og = OpusGet(**arguments)
og.get_files()

def open_moses_files(self):
def open_moses_files(self, outpath=None):
moses_zip_name = os.path.join(self.download_dir, f'{self.directory}_{self.release}_moses_'
f'{self.fromto[0]}-{self.fromto[1]}.txt.zip')
if not os.path.isfile(moses_zip_name):
Expand All @@ -47,7 +47,7 @@ def open_moses_files(self):
ret_file_names = []
for fn in moses_zip.filelist:
if fn.filename.split('.')[-1] in self.fromto:
moses_zip.extract(fn.filename)
moses_zip.extract(fn.filename, path=outpath)
ret_file_names.append(fn.filename)
moses_zip.close()
return sorted(ret_file_names)
Expand Down Expand Up @@ -150,4 +150,3 @@ def close_zipfiles(self):
if self.zip_opened:
self.src_zip.close()
self.trg_zip.close()

209 changes: 115 additions & 94 deletions opustools_pkg/opustools/opus_read.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,28 @@
import logging
import os
import shutil
import re
import sys
import tempfile

from .parse.alignment_parser import AlignmentParser
from .parse.sentence_parser import SentenceParser, SentenceParserError
from .util import file_open
from .formatting import *
from .opus_file_handler import OpusFileHandler


logger = logging.getLogger(__name__)


def skip_regex_type(n, N):
"Select function to skip document names"

def get_re(doc_name):
return not re.search(n, doc_name)

def skip_re(doc_name):
return re.search(N, doc_name)

def nothing(doc_name):
return False

Expand All @@ -27,7 +35,8 @@ def nothing(doc_name):

class OpusRead:

def __init__(self, directory=None, source=None, target=None,
def __init__(
self, directory=None, source=None, target=None,
release='latest', preprocess='xml', maximum=-1, src_range='all',
tgt_range='all', attribute=None, threshold=None,
leave_non_alignments_out=False, write=None, write_mode='normal',
Expand All @@ -39,7 +48,8 @@ def __init__(self, directory=None, source=None, target=None,
change_annotation_delimiter='|',
src_cld2=None, trg_cld2=None, src_langid=None, trg_langid=None,
write_ids=None, suppress_prompts=False, download_dir='.',
preserve_inline_tags=False, n=None, N=None, chunk_size=1000000, verbose=False):
preserve_inline_tags=False, n=None, N=None, chunk_size=1000000,
verbose=False):
"""Read xces alignment files and xml sentence files and output in
desired format.

Expand Down Expand Up @@ -83,7 +93,8 @@ def __init__(self, directory=None, source=None, target=None,
preserve_inline_tags -- Preserve inline tags within sentences
n -- Get only documents that match the regex
N -- Skip all documents that match the regex
chunk_size -- Number of sentence pairs in chunks to be processed (default 1000000)
chunk_size -- Number of sentence pairs in chunks to be processed
(default 1000000)
verbose -- Print progress messages
"""

Expand All @@ -99,64 +110,51 @@ def __init__(self, directory=None, source=None, target=None,
self.verbose = True

if self.switch_langs:
temp = src_range
src_range = tgt_range
tgt_range = temp
temp = src_cld2
src_cld2 = trg_cld2
trg_cld2 = temp
temp = src_langid
src_langid = trg_langid
trg_langid = temp
temp = source_zip
source_zip = target_zip
target_zip = temp
temp = source_annotations.copy()
source_annotations = target_annotations.copy()
target_annotations = temp.copy()
src_range, tgt_range = tgt_range, src_range
src_cld2, trg_cld2 = trg_cld2, src_cld2
src_langid, trg_langid = trg_langid, src_langid
source_zip, target_zip = target_zip, source_zip
source_annotations, target_annotations = \
target_annotations.copy(), source_annotations.copy()

lang_filters = [src_cld2, src_langid, trg_cld2, trg_langid]

default_alignment = os.path.join(root_directory, directory, release,
'xml', self.fromto[0]+'-'+self.fromto[1]+'.xml.gz')
default_alignment = os.path.join(
root_directory, directory, release, 'xml',
self.fromto[0]+'-'+self.fromto[1]+'.xml.gz')
if alignment_file == -1:
self.alignment = default_alignment
else:
self.alignment = alignment_file

dl_prefix = directory + '_' + release + '_' + preprocess + '_'
if not source_zip:
dl_src_zip = os.path.join(download_dir, directory+'_'+release+'_'+
preprocess+'_'+self.fromto[0]+'.zip')
dl_src_zip = os.path.join(
download_dir, dl_prefix + self.fromto[0] + '.zip')
if os.path.isfile(dl_src_zip):
source_zip = dl_src_zip
else:
source_zip = os.path.join(root_directory, directory, release,
source_zip = os.path.join(
root_directory, directory, release,
preprocess, self.fromto[0]+'.zip')
if not target_zip:
dl_trg_zip = os.path.join(download_dir, directory+'_'+release+'_'+
preprocess+'_'+self.fromto[1]+'.zip')
dl_trg_zip = os.path.join(
download_dir, dl_prefix + self.fromto[1] + '.zip')
if os.path.isfile(dl_trg_zip):
target_zip = dl_trg_zip
else:
target_zip = os.path.join(root_directory, directory, release,
target_zip = os.path.join(
root_directory, directory, release,
preprocess, self.fromto[1]+'.zip')

self.resultfile = None
self.mosessrc = None
self.mosestrg = None

self.id_file = None
if write_ids:
self.id_file = file_open(write_ids, 'w', encoding='utf-8')

self.write_mode = write_mode
self.write = write
self.maximum = maximum
self.preprocess = preprocess
if print_annotations:
self.preprocess = 'parsed'

self.write_ids=write_ids
self.write_ids = write_ids

self.preserve = preserve_inline_tags

Expand Down Expand Up @@ -192,44 +190,67 @@ def __init__(self, directory=None, source=None, target=None,
preprocess, self.fromto, suppress_prompts)

if preprocess == 'moses':
# If preprocessing is moses, download
moses_names = self.of_handler.open_moses_files()
if self.write:
if len(self.write) == 2:
if not self.switch_langs:
shutil.move(moses_names[0], os.path.join(download_dir, self.write[0]))
shutil.move(moses_names[1], os.path.join(download_dir, self.write[1]))
else:
shutil.move(moses_names[0], os.path.join(download_dir, self.write[1]))
shutil.move(moses_names[1], os.path.join(download_dir, self.write[0]))
moses_names = self.write
else:
print('"moses" preprocessing requires two output file names. Using default names.')
else:
shutil.move(moses_names[0], os.path.join(download_dir, moses_names[0]))
shutil.move(moses_names[1], os.path.join(download_dir, moses_names[1]))
print(f'Moses files written to {", ".join([download_dir+"/"+n for n in moses_names])}')
exit()

if write:
if write_mode == 'moses' and len(write) == 2:
self.mosessrc = file_open(write[0], mode='w', encoding='utf-8')
self.mosestrg = file_open(write[1], mode='w', encoding='utf-8')
else:
self.resultfile = file_open(write[0], mode='w', encoding='utf-8')
if self.write_mode != 'moses':
logger.warning("Only moses write_mode is supported for moses preprocessing. "
"Ignoring write_mode %s.", self.write_mode)
self.write_mode = 'moses'
return

store_attrs = False
if write_mode == "links" or write_ids != None:
if write_mode == "links" or write_ids is not None:
store_attrs = True

self.alignment = self.of_handler.open_alignment_file(self.alignment)
self.alignmentParser = AlignmentParser(self.alignment,
(src_range, tgt_range), attribute, threshold, store_attrs,
leave_non_alignments_out)
self.alignmentParser = AlignmentParser(
self.alignment, (src_range, tgt_range), attribute, threshold,
store_attrs, leave_non_alignments_out)

def printPairs(self):
logger.debug("printPairs called!")
resultfile = None
mosessrc = None
mosestrg = None
id_file = None

self.add_file_header(self.resultfile)
if self.write_ids:
id_file = file_open(self.write_ids, 'w', encoding='utf-8')

if self.write:
if self.write_mode == 'moses' and len(self.write) == 2:
mosessrc = file_open(self.write[0], mode='w', encoding='utf-8')
mosestrg = file_open(self.write[1], mode='w', encoding='utf-8')
else:
resultfile = file_open(self.write[0], mode='w', encoding='utf-8')

if self.preprocess == 'moses':
# If preprocessing is moses, download
if not self.write or len(self.write) != 2:
# Write to current path and return
if self.write and len(self.write) != 2:
resultfile.close()
logger.warning('"moses" preprocessing requires two output '
'file names. Using default names.')
moses_names = self.of_handler.open_moses_files(
outpath=self.of_handler.download_dir)
logger.info('Moses files written to %s', ', '.join(moses_names))
return
with tempfile.TemporaryDirectory() as tmpdir:
# Write to specified files
logger.info('Extracting data...')
moses_names = self.of_handler.open_moses_files(outpath=tmpdir)
with file_open(os.path.join(tmpdir, moses_names[0])) as in1, \
file_open(os.path.join(tmpdir, moses_names[1])) as in2:
if self.switch_langs:
in1, in2 = in2, in1
for fin, fout in [(in1, mosessrc), (in2, mosestrg)]:
for line in fin:
fout.write(line)
mosessrc.close()
mosestrg.close()
logger.info('Moses files written to %s', ', '.join(self.write))
return

self.add_file_header(resultfile)

src_parser = None
trg_parser = None
Expand All @@ -248,13 +269,14 @@ def printPairs(self):
self.alignmentParser.collect_links(cur_pos, self.chunk_size, self.verbose)

if src_doc_name != prev_src_doc_name:
src_doc_size = -1
src_doc_size = -1
prev_src_doc_name = src_doc_name
if trg_doc_name != prev_trg_doc_name:
trg_doc_size = -1
trg_doc_size = -1
prev_trg_doc_name = trg_doc_name

if self.verbose: print("")
if self.verbose:
print("", file=sys.stderr)

if not src_doc_name:
break
Expand All @@ -268,32 +290,32 @@ def printPairs(self):
src_doc = self.of_handler.open_sentence_file(src_doc_name, 'src')
trg_doc = self.of_handler.open_sentence_file(trg_doc_name, 'trg')
except KeyError as e:
print('\n'+e.args[0]+'\nContinuing from next sentence file pair.')
print('\n'+e.args[0]+'\nContinuing from next sentence file pair.', file=sys.stderr)
continue

try:
src_parser = SentenceParser(src_doc,
preprocessing=self.preprocess, anno_attrs=self.src_annot,
preserve=self.preserve, delimiter=self.annot_delimiter)
src_parser = SentenceParser(
src_doc, preprocessing=self.preprocess, anno_attrs=self.src_annot,
preserve=self.preserve, delimiter=self.annot_delimiter)
src_doc_size = src_parser.store_sentences(src_set, src_doc_size, self.verbose)
trg_parser = SentenceParser(trg_doc,
preprocessing=self.preprocess, anno_attrs=self.trg_annot,
preserve=self.preserve, delimiter=self.annot_delimiter)
trg_parser = SentenceParser(
trg_doc, preprocessing=self.preprocess, anno_attrs=self.trg_annot,
preserve=self.preserve, delimiter=self.annot_delimiter)
trg_doc_size = trg_parser.store_sentences(trg_set, trg_doc_size, self.verbose)
except SentenceParserError as e:
print('\n'+e.message+'\nContinuing from next sentence file pair.')
print('\n'+e.message+'\nContinuing from next sentence file pair.', file=sys.stderr)
continue

self.add_doc_names(src_doc_name, trg_doc_name,
self.resultfile, self.mosessrc, self.mosestrg)
self.add_doc_names(
src_doc_name, trg_doc_name, resultfile, mosessrc, mosestrg)

len_link_list = len(link_list)

for i, link_a in enumerate(link_list):
if self.verbose:
if i%1000==0 or i+1==len_link_list:
if i % 1000 == 0 or i + 1 == len_link_list:
progress = str(round((i+1)/len_link_list*100, 2))
print("\x1b[2KWriting chunk ... {}%".format(progress), end="\r")
print("\x1b[2KWriting chunk ... {}%".format(progress), end="\r", file=sys.stderr)

src_result, trg_result = self.format_pair(
link_a, src_parser, trg_parser, self.fromto)
Expand All @@ -303,39 +325,38 @@ def printPairs(self):

link_attr = attrs_list[i] if i < len(attrs_list) else None

self.out_put_pair(src_result, trg_result, self.resultfile,
self.mosessrc, self.mosestrg, link_attr, self.id_file,
src_doc_name, trg_doc_name)
self.out_put_pair(
src_result, trg_result, resultfile, mosessrc, mosestrg,
link_attr, id_file, src_doc_name, trg_doc_name)

total += 1
if total == self.maximum:
stop = True
break

self.add_doc_ending(self.resultfile)
self.add_doc_ending(resultfile)

if self.verbose and self.write:
print("\033[F\033[F\033[F", end="")
print("\033[F\033[F\033[F", end="", file=sys.stderr)

if stop:
break

if self.verbose and self.write:
print("\n\n")
print("\n\n", file=sys.stderr)

self.add_file_ending(self.resultfile)
self.add_file_ending(resultfile)

self.alignmentParser.bp.close_document()

if self.write:
if self.write_mode == 'moses' and self.mosessrc:
self.mosessrc.close()
self.mosestrg.close()
if self.write_mode == 'moses' and mosessrc:
mosessrc.close()
mosestrg.close()
else:
self.resultfile.close()
resultfile.close()

if self.write_ids:
self.id_file.close()
id_file.close()

self.of_handler.close_zipfiles()

Loading