From 0a73a9e2cb6249ccfcd271915dad15c7e1a4cc48 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 31 Jul 2024 17:20:17 +0300 Subject: [PATCH 1/4] move extracting moses zip files to printPairs --- opustools_pkg/opustools/opus_read.py | 88 ++++++++++++++-------------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/opustools_pkg/opustools/opus_read.py b/opustools_pkg/opustools/opus_read.py index 8a8822f..8f703aa 100644 --- a/opustools_pkg/opustools/opus_read.py +++ b/opustools_pkg/opustools/opus_read.py @@ -141,11 +141,6 @@ def __init__(self, directory=None, source=None, target=None, target_zip = os.path.join(root_directory, directory, release, preprocess, self.fromto[1]+'.zip') - self.resultfile = None - self.mosessrc = None - self.mosestrg = None - - self.id_file = None if write_ids: self.id_file = file_open(write_ids, 'w', encoding='utf-8') @@ -156,7 +151,7 @@ def __init__(self, directory=None, source=None, target=None, if print_annotations: self.preprocess = 'parsed' - self.write_ids=write_ids + self.write_ids = write_ids self.preserve = preserve_inline_tags @@ -192,31 +187,7 @@ def __init__(self, directory=None, source=None, target=None, preprocess, self.fromto, suppress_prompts) if preprocess == 'moses': - # If preprocessing is moses, download - moses_names = self.of_handler.open_moses_files() - if self.write: - if len(self.write) == 2: - if not self.switch_langs: - shutil.move(moses_names[0], os.path.join(download_dir, self.write[0])) - shutil.move(moses_names[1], os.path.join(download_dir, self.write[1])) - else: - shutil.move(moses_names[0], os.path.join(download_dir, self.write[1])) - shutil.move(moses_names[1], os.path.join(download_dir, self.write[0])) - moses_names = self.write - else: - print('"moses" preprocessing requires two output file names. Using default names.') - else: - shutil.move(moses_names[0], os.path.join(download_dir, moses_names[0])) - shutil.move(moses_names[1], os.path.join(download_dir, moses_names[1])) - print(f'Moses files written to {", ".join([download_dir+"/"+n for n in moses_names])}') - exit() - - if write: - if write_mode == 'moses' and len(write) == 2: - self.mosessrc = file_open(write[0], mode='w', encoding='utf-8') - self.mosestrg = file_open(write[1], mode='w', encoding='utf-8') - else: - self.resultfile = file_open(write[0], mode='w', encoding='utf-8') + return store_attrs = False if write_mode == "links" or write_ids != None: @@ -228,8 +199,40 @@ def __init__(self, directory=None, source=None, target=None, leave_non_alignments_out) def printPairs(self): + print("printPairs called!") + resultfile = None + mosessrc = None + mosestrg = None + id_file = None + + if self.write_ids: + id_file = file_open(self.write_ids, 'w', encoding='utf-8') - self.add_file_header(self.resultfile) + if self.write: + if self.write_mode == 'moses' and len(self.write) == 2: + mosessrc = file_open(self.write[0], mode='w', encoding='utf-8') + mosestrg = file_open(self.write[1], mode='w', encoding='utf-8') + else: + resultfile = file_open(self.write[0], mode='w', encoding='utf-8') + + if self.preprocess == 'moses': + # If preprocessing is moses, download + moses_names = self.of_handler.open_moses_files() + if self.write: + if len(self.write) == 2: + if not self.switch_langs: + shutil.move(moses_names[0], self.write[0]) + shutil.move(moses_names[1], self.write[1]) + else: + shutil.move(moses_names[0], self.write[1]) + shutil.move(moses_names[1], self.write[0]) + moses_names = self.write + else: + print('"moses" preprocessing requires two output file names. Using default names.') + print(f'Moses files written to {", ".join(moses_names)}') + return + + self.add_file_header(resultfile) src_parser = None trg_parser = None @@ -285,7 +288,7 @@ def printPairs(self): continue self.add_doc_names(src_doc_name, trg_doc_name, - self.resultfile, self.mosessrc, self.mosestrg) + resultfile, mosessrc, mosestrg) len_link_list = len(link_list) @@ -303,8 +306,8 @@ def printPairs(self): link_attr = attrs_list[i] if i < len(attrs_list) else None - self.out_put_pair(src_result, trg_result, self.resultfile, - self.mosessrc, self.mosestrg, link_attr, self.id_file, + self.out_put_pair(src_result, trg_result, resultfile, + mosessrc, mosestrg, link_attr, id_file, src_doc_name, trg_doc_name) total += 1 @@ -312,7 +315,7 @@ def printPairs(self): stop = True break - self.add_doc_ending(self.resultfile) + self.add_doc_ending(resultfile) if self.verbose and self.write: print("\033[F\033[F\033[F", end="") @@ -323,19 +326,18 @@ def printPairs(self): if self.verbose and self.write: print("\n\n") - self.add_file_ending(self.resultfile) + self.add_file_ending(resultfile) self.alignmentParser.bp.close_document() if self.write: - if self.write_mode == 'moses' and self.mosessrc: - self.mosessrc.close() - self.mosestrg.close() + if self.write_mode == 'moses' and mosessrc: + mosessrc.close() + mosestrg.close() else: - self.resultfile.close() + resultfile.close() if self.write_ids: - self.id_file.close() + id_file.close() self.of_handler.close_zipfiles() - From 5ed5acc82e93c3943e0e67fb0481f42e7df77656 Mon Sep 17 00:00:00 2001 From: Sami Virpioja Date: Wed, 7 Aug 2024 16:46:25 +0300 Subject: [PATCH 2/4] fix moses output formats, error printing etc. --- opustools_pkg/opustools/opus_file_handler.py | 5 +- opustools_pkg/opustools/opus_read.py | 130 +- opustools_pkg/opustools/parse/block_parser.py | 7 +- .../opustools/parse/sentence_parser.py | 6 +- opustools_pkg/opustools/readopusdata.py | 54 +- opustools_pkg/tests/test_opus_read.py | 2361 +++++++++-------- 6 files changed, 1358 insertions(+), 1205 deletions(-) diff --git a/opustools_pkg/opustools/opus_file_handler.py b/opustools_pkg/opustools/opus_file_handler.py index b4be1d3..9a3debc 100644 --- a/opustools_pkg/opustools/opus_file_handler.py +++ b/opustools_pkg/opustools/opus_file_handler.py @@ -38,7 +38,7 @@ def download_files(self): og = OpusGet(**arguments) og.get_files() - def open_moses_files(self): + def open_moses_files(self, outpath=None): moses_zip_name = os.path.join(self.download_dir, f'{self.directory}_{self.release}_moses_' f'{self.fromto[0]}-{self.fromto[1]}.txt.zip') if not os.path.isfile(moses_zip_name): @@ -47,7 +47,7 @@ def open_moses_files(self): ret_file_names = [] for fn in moses_zip.filelist: if fn.filename.split('.')[-1] in self.fromto: - moses_zip.extract(fn.filename) + moses_zip.extract(fn.filename, path=outpath) ret_file_names.append(fn.filename) moses_zip.close() return sorted(ret_file_names) @@ -150,4 +150,3 @@ def close_zipfiles(self): if self.zip_opened: self.src_zip.close() self.trg_zip.close() - diff --git a/opustools_pkg/opustools/opus_read.py b/opustools_pkg/opustools/opus_read.py index 8f703aa..48c876d 100644 --- a/opustools_pkg/opustools/opus_read.py +++ b/opustools_pkg/opustools/opus_read.py @@ -1,6 +1,8 @@ +import logging import os -import shutil import re +import sys +import tempfile from .parse.alignment_parser import AlignmentParser from .parse.sentence_parser import SentenceParser, SentenceParserError @@ -8,13 +10,19 @@ from .formatting import * from .opus_file_handler import OpusFileHandler + +logger = logging.getLogger(__name__) + + def skip_regex_type(n, N): "Select function to skip document names" def get_re(doc_name): return not re.search(n, doc_name) + def skip_re(doc_name): return re.search(N, doc_name) + def nothing(doc_name): return False @@ -27,7 +35,8 @@ def nothing(doc_name): class OpusRead: - def __init__(self, directory=None, source=None, target=None, + def __init__( + self, directory=None, source=None, target=None, release='latest', preprocess='xml', maximum=-1, src_range='all', tgt_range='all', attribute=None, threshold=None, leave_non_alignments_out=False, write=None, write_mode='normal', @@ -39,7 +48,8 @@ def __init__(self, directory=None, source=None, target=None, change_annotation_delimiter='|', src_cld2=None, trg_cld2=None, src_langid=None, trg_langid=None, write_ids=None, suppress_prompts=False, download_dir='.', - preserve_inline_tags=False, n=None, N=None, chunk_size=1000000, verbose=False): + preserve_inline_tags=False, n=None, N=None, chunk_size=1000000, + verbose=False): """Read xces alignment files and xml sentence files and output in desired format. @@ -83,7 +93,8 @@ def __init__(self, directory=None, source=None, target=None, preserve_inline_tags -- Preserve inline tags within sentences n -- Get only documents that match the regex N -- Skip all doucments that match the regex - chunk_size -- Number of sentence pairs in chunks to be processed (default 1000000) + chunk_size -- Number of sentence pairs in chunks to be processed + (default 1000000) verbose -- Print progress messages """ @@ -117,28 +128,32 @@ def __init__(self, directory=None, source=None, target=None, lang_filters = [src_cld2, src_langid, trg_cld2, trg_langid] - default_alignment = os.path.join(root_directory, directory, release, - 'xml', self.fromto[0]+'-'+self.fromto[1]+'.xml.gz') + default_alignment = os.path.join( + root_directory, directory, release, 'xml', + self.fromto[0]+'-'+self.fromto[1]+'.xml.gz') if alignment_file == -1: self.alignment = default_alignment else: self.alignment = alignment_file + dl_prefix = directory + '_' + release + '_' + preprocess + '_' if not source_zip: - dl_src_zip = os.path.join(download_dir, directory+'_'+release+'_'+ - preprocess+'_'+self.fromto[0]+'.zip') + dl_src_zip = os.path.join( + download_dir, dl_prefix + self.fromto[0] + '.zip') if os.path.isfile(dl_src_zip): source_zip = dl_src_zip else: - source_zip = os.path.join(root_directory, directory, release, + source_zip = os.path.join( + root_directory, directory, release, preprocess, self.fromto[0]+'.zip') if not target_zip: - dl_trg_zip = os.path.join(download_dir, directory+'_'+release+'_'+ - preprocess+'_'+self.fromto[1]+'.zip') + dl_trg_zip = os.path.join( + download_dir, dl_prefix + self.fromto[1] + '.zip') if os.path.isfile(dl_trg_zip): target_zip = dl_trg_zip else: - target_zip = os.path.join(root_directory, directory, release, + target_zip = os.path.join( + root_directory, directory, release, preprocess, self.fromto[1]+'.zip') if write_ids: @@ -187,19 +202,23 @@ def __init__(self, directory=None, source=None, target=None, preprocess, self.fromto, suppress_prompts) if preprocess == 'moses': + if self.write_mode != 'moses': + logger.warning("Only moses write_mode is supported for moses preprocessing. " + "Ignoring write_mode %s.", self.write_mode) + self.write_mode = 'moses' return store_attrs = False - if write_mode == "links" or write_ids != None: + if write_mode == "links" or write_ids is not None: store_attrs = True self.alignment = self.of_handler.open_alignment_file(self.alignment) - self.alignmentParser = AlignmentParser(self.alignment, - (src_range, tgt_range), attribute, threshold, store_attrs, - leave_non_alignments_out) + self.alignmentParser = AlignmentParser( + self.alignment, (src_range, tgt_range), attribute, threshold, + store_attrs, leave_non_alignments_out) def printPairs(self): - print("printPairs called!") + logger.debug("printPairs called!") resultfile = None mosessrc = None mosestrg = None @@ -217,19 +236,29 @@ def printPairs(self): if self.preprocess == 'moses': # If preprocessing is moses, download - moses_names = self.of_handler.open_moses_files() - if self.write: - if len(self.write) == 2: - if not self.switch_langs: - shutil.move(moses_names[0], self.write[0]) - shutil.move(moses_names[1], self.write[1]) - else: - shutil.move(moses_names[0], self.write[1]) - shutil.move(moses_names[1], self.write[0]) - moses_names = self.write - else: - print('"moses" preprocessing requires two output file names. Using default names.') - print(f'Moses files written to {", ".join(moses_names)}') + if not self.write or len(self.write) != 2: + # Write to current path and return + if self.write and len(self.write) != 2: + logger.warning('"moses" preprocessing requires two output ' + 'file names. Using default names.') + moses_names = self.of_handler.open_moses_files( + outpath=self.of_handler.download_dir) + logger.info('Moses files written to %s', ', '.join(moses_names)) + return + with tempfile.TemporaryDirectory() as tmpdir: + # Write to specified files + logger.info('Extracting data...') + moses_names = self.of_handler.open_moses_files(outpath=tmpdir) + with file_open(os.path.join(tmpdir, moses_names[0])) as in1, \ + file_open(os.path.join(tmpdir, moses_names[1])) as in2: + if self.switch_langs: + in1, in2 = in2, in1 + for fin, fout in [(in1, mosessrc), (in2, mosestrg)]: + for line in fin: + fout.write(line) + mosessrc.close() + mosestrg.close() + logger.info('Moses files written to %s', ', '.join(self.write)) return self.add_file_header(resultfile) @@ -251,13 +280,14 @@ def printPairs(self): self.alignmentParser.collect_links(cur_pos, self.chunk_size, self.verbose) if src_doc_name != prev_src_doc_name: - src_doc_size = -1 + src_doc_size = -1 prev_src_doc_name = src_doc_name if trg_doc_name != prev_trg_doc_name: - trg_doc_size = -1 + trg_doc_size = -1 prev_trg_doc_name = trg_doc_name - if self.verbose: print("") + if self.verbose: + print("", file=sys.stderr) if not src_doc_name: break @@ -271,32 +301,32 @@ def printPairs(self): src_doc = self.of_handler.open_sentence_file(src_doc_name, 'src') trg_doc = self.of_handler.open_sentence_file(trg_doc_name, 'trg') except KeyError as e: - print('\n'+e.args[0]+'\nContinuing from next sentence file pair.') + print('\n'+e.args[0]+'\nContinuing from next sentence file pair.', file=sys.stderr) continue try: - src_parser = SentenceParser(src_doc, - preprocessing=self.preprocess, anno_attrs=self.src_annot, - preserve=self.preserve, delimiter=self.annot_delimiter) + src_parser = SentenceParser( + src_doc, preprocessing=self.preprocess, anno_attrs=self.src_annot, + preserve=self.preserve, delimiter=self.annot_delimiter) src_doc_size = src_parser.store_sentences(src_set, src_doc_size, self.verbose) - trg_parser = SentenceParser(trg_doc, - preprocessing=self.preprocess, anno_attrs=self.trg_annot, - preserve=self.preserve, delimiter=self.annot_delimiter) + trg_parser = SentenceParser( + trg_doc, preprocessing=self.preprocess, anno_attrs=self.trg_annot, + preserve=self.preserve, delimiter=self.annot_delimiter) trg_doc_size = trg_parser.store_sentences(trg_set, trg_doc_size, self.verbose) except SentenceParserError as e: - print('\n'+e.message+'\nContinuing from next sentence file pair.') + print('\n'+e.message+'\nContinuing from next sentence file pair.', file=sys.stderr) continue - self.add_doc_names(src_doc_name, trg_doc_name, - resultfile, mosessrc, mosestrg) + self.add_doc_names( + src_doc_name, trg_doc_name, resultfile, mosessrc, mosestrg) len_link_list = len(link_list) for i, link_a in enumerate(link_list): if self.verbose: - if i%1000==0 or i+1==len_link_list: + if i % 1000 == 0 or i + 1 == len_link_list: progress = str(round((i+1)/len_link_list*100, 2)) - print("\x1b[2KWriting chunk ... {}%".format(progress), end="\r") + print("\x1b[2KWriting chunk ... {}%".format(progress), end="\r", file=sys.stderr) src_result, trg_result = self.format_pair( link_a, src_parser, trg_parser, self.fromto) @@ -306,9 +336,9 @@ def printPairs(self): link_attr = attrs_list[i] if i < len(attrs_list) else None - self.out_put_pair(src_result, trg_result, resultfile, - mosessrc, mosestrg, link_attr, id_file, - src_doc_name, trg_doc_name) + self.out_put_pair( + src_result, trg_result, resultfile, mosessrc, mosestrg, + link_attr, id_file, src_doc_name, trg_doc_name) total += 1 if total == self.maximum: @@ -318,13 +348,13 @@ def printPairs(self): self.add_doc_ending(resultfile) if self.verbose and self.write: - print("\033[F\033[F\033[F", end="") + print("\033[F\033[F\033[F", end="", file=sys.stderr) if stop: break if self.verbose and self.write: - print("\n\n") + print("\n\n", file=sys.stderr) self.add_file_ending(resultfile) diff --git a/opustools_pkg/opustools/parse/block_parser.py b/opustools_pkg/opustools/parse/block_parser.py index 2981033..6867fa0 100644 --- a/opustools_pkg/opustools/parse/block_parser.py +++ b/opustools_pkg/opustools/parse/block_parser.py @@ -1,6 +1,9 @@ +import sys + import xml.parsers.expat from ..util import file_open + class BlockParserError(Exception): def __init__(self, message): @@ -56,7 +59,7 @@ def __init__(self, document, data_tag=None, doc_size=-1): self.completeBlocks = [] if doc_size == -1: - print(f'Measuring file "{document.name}" ...', end="\r") + print(f'Measuring file "{document.name}" ...', end="\r", file=sys.stderr) self.document.seek(0, 2) self.doc_size = self.document.tell() self.document.seek(0) @@ -99,7 +102,7 @@ def close_document(self): def report_progress(self, cur_pos): progress = str(round(cur_pos/self.doc_size*100, 2) if self.doc_size > 0 else 0) - print("\x1b[2KParsing file \"{}\" ... {}%".format(self.document.name, progress), end="\r") + print("\x1b[2KParsing file \"{}\" ... {}%".format(self.document.name, progress), end="\r", file=sys.stderr) def get_complete_blocks(self, cur_pos, verbose=False): """ diff --git a/opustools_pkg/opustools/parse/sentence_parser.py b/opustools_pkg/opustools/parse/sentence_parser.py index 26eb7b6..36b53ab 100644 --- a/opustools_pkg/opustools/parse/sentence_parser.py +++ b/opustools_pkg/opustools/parse/sentence_parser.py @@ -1,5 +1,8 @@ +import sys + from .block_parser import BlockParser, BlockParserError + class SentenceParserError(Exception): def __init__(self, message): @@ -160,7 +163,7 @@ def store_sentences(self, id_set, doc_size, verbose=False): bp.close_document() if verbose: bp.report_progress(cur_pos) - print("") + print("", file=sys.stderr) except BlockParserError as e: raise SentenceParserError( 'Error while parsing sentence file: {error}'.format(error=e.args[0])) @@ -199,4 +202,3 @@ def read_sentence(self, ids): attrsList.append(attrs) return sentence, attrsList - diff --git a/opustools_pkg/opustools/readopusdata.py b/opustools_pkg/opustools/readopusdata.py index ad60e42..8461295 100644 --- a/opustools_pkg/opustools/readopusdata.py +++ b/opustools_pkg/opustools/readopusdata.py @@ -5,14 +5,20 @@ from ruamel.yaml import YAML, scanner, reader + +logger = logging.getLogger(__name__) + + def read_url(url): return urllib.request.urlopen(url).read().decode('utf-8').split('\n') + def read_url_yaml(url, yaml): raw = urllib.request.urlopen(url).read().decode('utf-8') data = yaml.load(raw) return data + def create_table(cur): create_opusfile_table = '''CREATE TABLE IF NOT EXISTS opusfile ( id integer PRIMARY KEY, @@ -34,6 +40,7 @@ def create_table(cur): create_url_index = 'CREATE INDEX IF NOT EXISTS idx_url ON opusfile(url)' cur.execute(create_url_index) + def execute_sql(cur, opusfile): columns = ['source', 'target', 'corpus', 'preprocessing', 'version', 'url', 'size', 'documents', 'alignment_pairs', 'source_tokens', 'target_tokens', 'latest'] #wheres = [f'{columns[i]}="{opusfile[i]}"' for i in range(6)] @@ -48,71 +55,73 @@ def execute_sql(cur, opusfile): sql = f'INSERT INTO opusfile({", ".join(columns)}, updated) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,1)' cur.execute(sql, opusfile) + def get_lang_info(name, data, data_type, info): source, target, documents, alignment_pairs, source_tokens, target_tokens = '', '', '', '', '', '' source = name if data_type in ['bitexts', 'moses', 'tmx']: names = name.split('-') if len(names) != 2: - logging.warning(f'{info} {data_type} {name}: cannot split name "{name}" into two language codes') + logger.warning(f'{info} {data_type} {name}: cannot split name "{name}" into two language codes') else: source, target = names documents = '' if data_type in ['bitexts', 'monolingual']: documents = data.get('files', '') if documents == '': - logging.warning(f'{info} {data_type} {name} is missing "files"') + logger.warning(f'{info} {data_type} {name} is missing "files"') if data_type in ['bitexts', 'moses']: alignment_pairs = data.get('alignments', '') if alignment_pairs == '': - logging.warning(f'{info} {data_type} {name} is missing "alignments"') + logger.warning(f'{info} {data_type} {name} is missing "alignments"') elif data_type == 'tmx': alignment_pairs = data.get('translation units', '') if alignment_pairs == '': - logging.warning(f'{info} {data_type} {name} is missing "translation units"') + logger.warning(f'{info} {data_type} {name} is missing "translation units"') elif data_type == 'monolingual': alignment_pairs = data.get('sentences', '') if alignment_pairs == '': - logging.warning(f'{info} {data_type} {name} is missing "sentences"') + logger.warning(f'{info} {data_type} {name} is missing "sentences"') if data_type == 'monolingual': source_tokens = data.get('tokens', '') if source_tokens == '': - logging.warning(f'{info} {data_type} {name} is missing "tokens"') + logger.warning(f'{info} {data_type} {name} is missing "tokens"') target_tokens = '' else: source_tokens = data.get('source language tokens', '') if source_tokens == '': - logging.warning(f'{info} {data_type} {name} is missing "source language tokens"') + logger.warning(f'{info} {data_type} {name} is missing "source language tokens"') target_tokens = data.get('target language tokens', '') if target_tokens == '': - logging.warning(f'{info} {data_type} {name} is missing "target language tokens"') + logger.warning(f'{info} {data_type} {name} is missing "target language tokens"') return source, target, documents, alignment_pairs, source_tokens, target_tokens + def get_size_url_prep(name, data, data_type, info): size, url, preprocessing = '','','' if data_type in ['tmx', 'moses']: size = data.get('download size', '') if size == '': - logging.warning(f'{info} {data_type} {name} is missing "download size"') + logger.warning(f'{info} {data_type} {name} is missing "download size"') else: size = int(int(size)/1024) url = data.get('download url', '') if url == '': - logging.warning(f'{info} {data_type} {name} is missing "download url"') + logger.warning(f'{info} {data_type} {name} is missing "download url"') elif data_type in ['bitexts', 'monolingual']: size = data.get('size', '') if size == '': - logging.warning(f'{info} {data_type} {name} is missing "size"') + logger.warning(f'{info} {data_type} {name} is missing "size"') else: size = int(int(size)/1024) url = data.get('url', '') if url == '': - logging.warning(f'{info} {data_type} {name} is missing "url"') + logger.warning(f'{info} {data_type} {name} is missing "url"') pre_step = url.split('/') if len(pre_step) < 2: - logging.warning(f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"') + logger.warning(f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"') else: preprocessing = pre_step[-2] @@ -125,6 +134,7 @@ def get_tmx_entries(corpus, version, latest, tmx, cur, info): opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) + def get_moses_entries(corpus, version, latest, moses, cur, info): for item in moses: source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, moses[item], 'moses', info) @@ -132,6 +142,7 @@ def get_moses_entries(corpus, version, latest, moses, cur, info): opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) + def get_monolingual_entries(corpus, version, latest, monolingual, cur, info): for item in monolingual: source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, monolingual[item], 'monolingual', info) @@ -140,6 +151,7 @@ def get_monolingual_entries(corpus, version, latest, monolingual, cur, info): opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) + def get_bitext_entries(corpus, version, latest, bitexts, cur, info): for item in bitexts: source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, bitexts[item], 'bitexts', info) @@ -150,12 +162,14 @@ def get_bitext_entries(corpus, version, latest, bitexts, cur, info): opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest) execute_sql(cur, opusfile) + def remove_missing_items(cur): sql = 'DELETE FROM opusfile WHERE updated=0' cur.execute(sql) sql = 'UPDATE opusfile SET updated=0' cur.execute(sql) + def update_db(db_file=None, log_type='errors'): yaml = YAML() @@ -183,15 +197,15 @@ def update_db(db_file=None, log_type='errors'): try: gen_info = read_url_yaml(URL_BASE + info, yaml) except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e: - logging.error(f'{info}, {type(e).__name__}: {e}') + logger.error(f'{info}, {type(e).__name__}: {e}') gen_info = {} corpus = gen_info.get('name') if not corpus: - logging.warning(f'{info}, corpus name missing') + logger.warning(f'{info}, corpus name missing') print(f'Processing corpus {corpus}') latest_v = gen_info.get('latest_release') if not latest_v: - logging.error(f'{info}, latest_release missing') + logger.error(f'{info}, latest_release missing') elif len(info_s) == 3: version = info_s[1] if not corpus: @@ -203,7 +217,7 @@ def update_db(db_file=None, log_type='errors'): try: corpus_data = read_url_yaml(URL_BASE + stats, yaml) except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e: - logging.error(f'{stats}, {type(e).__name__}: {e}') + logger.error(f'{stats}, {type(e).__name__}: {e}') continue get_entries = {'bitexts': get_bitext_entries, @@ -212,7 +226,7 @@ def update_db(db_file=None, log_type='errors'): 'tmx': get_tmx_entries} if not corpus_data: - logging.error(f'{info}, corpus_data is empty') + logger.error(f'{info}, corpus_data is empty') continue for item in get_entries.keys(): @@ -220,15 +234,17 @@ def update_db(db_file=None, log_type='errors'): if sub_data: get_entries[item](corpus, version, latest, sub_data, cur, info) else: - logging.warning(f'{info}, {item} data missing') + logger.warning(f'{info}, {item} data missing') remove_missing_items(cur) con.commit() con.close() + def main(): update_db() + if __name__ == "__main__": main() diff --git a/opustools_pkg/tests/test_opus_read.py b/opustools_pkg/tests/test_opus_read.py index 695d045..76e5ec8 100644 --- a/opustools_pkg/tests/test_opus_read.py +++ b/opustools_pkg/tests/test_opus_read.py @@ -1,20 +1,18 @@ +import logging import os import unittest from unittest import mock import io import sys -import xml.parsers.expat import gzip import shutil import zipfile import tempfile -import bz2 from opustools import OpusRead, OpusGet -from opustools.parse.block_parser import BlockParserError -from opustools.parse.sentence_parser import SentenceParserError from opustools.parse.alignment_parser import AlignmentParserError + def pairPrinterToVariable(**kwargs): old_stdout = sys.stdout printout = io.StringIO() @@ -24,245 +22,237 @@ def pairPrinterToVariable(**kwargs): sys.stdout = old_stdout return printout.getvalue() -def preMosesToVariable(**kwargs): - old_stdout = sys.stdout - printout = io.StringIO() - sys.stdout = printout - try: - OpusRead(**kwargs) - except SystemExit as e: - pass - sys.stdout = old_stdout - return printout.getvalue() def add_to_root_dir(corpus=None, source=None, target=None, - version='latest', preprocess=None, root_dir=None): + version='latest', preprocess=None, root_dir=None): OpusGet(directory=corpus, source=source, target=target, release=version, - preprocess=preprocess, download_dir=root_dir, suppress_prompts=True, - database='tests/testdata.db').get_files() + preprocess=preprocess, download_dir=root_dir, suppress_prompts=True, + database='tests/testdata.db').get_files() source_zip = '{corpus}_{version}_{preprocess}_{source}.zip'.format( corpus=corpus, version=version, preprocess=preprocess, source=source) os.rename(os.path.join(root_dir, source_zip), - os.path.join(root_dir, corpus, version, preprocess, source+'.zip')) + os.path.join(root_dir, corpus, version, preprocess, source+'.zip')) target_zip = '{corpus}_{version}_{preprocess}_{target}.zip'.format( corpus=corpus, version=version, preprocess=preprocess, target=target) - os.rename(os.path.join(root_dir,target_zip), - os.path.join(root_dir, corpus, version, preprocess, target+'.zip')) + os.rename(os.path.join(root_dir, target_zip), + os.path.join(root_dir, corpus, version, preprocess, target+'.zip')) alignment_xml = ('{corpus}_{version}_{preprocess}_{source}-' - '{target}.xml.gz').format(corpus=corpus, version=version, - preprocess='xml', source=source, target=target) + '{target}.xml.gz').format(corpus=corpus, version=version, + preprocess='xml', source=source, target=target) os.rename(os.path.join(root_dir, alignment_xml), - os.path.join(root_dir, corpus, version, 'xml', - source+'-'+target+'.xml.gz')) + os.path.join(root_dir, corpus, version, 'xml', + source+'-'+target+'.xml.gz')) + OPUS_TEMP = 'tmp_opus_read_temp' OPUS_ROOT = 'tmp_opus_read_root' + class TestOpusRead(unittest.TestCase): @classmethod def setUpClass(self): if ('OPUS_TEST_SAVE' in os.environ.keys() and os.path.exists(os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_TEMP))): - self.tempdir1 = os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_TEMP) + self.tempdir1 = os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_TEMP) else: self.tempdir1 = tempfile.mkdtemp() os.mkdir(os.path.join(self.tempdir1, 'test_files')) os.makedirs(os.path.join(self.tempdir1, 'RF', 'xml', 'en')) with open(os.path.join(self.tempdir1, 'RF', 'xml', 'en', - '1996.xml'), 'w') as f: + '1996.xml'), 'w') as f: f.write('\n' - '\n \n The \n Hound \n of \n the' - ' \n Baskervilles \n by' - ' \n Sir \n Arthur \n ' - 'Conan \n Doyle \n Aligned \n by\n : \n András \n ' - 'Farkas \n (\n fully ' - '\n reviewed\n ) \n\n\n \n Source&<>"\'\n \n :\n \n manybooks.' - 'netAudiobook\n available\n \n \n here\n \n\n\n\n\n\n \n Chapter\n ' - '1\n ' - 'Mr.\n Sherlock\n Holmes\n \n

\n\n \n Mr.\n ' - '' - 'Sherlock\n Holmes\n\n\n\n\n\n ' - '\n I\n \n \n believe\n \n

\n\n

\n\n \n ' - '"\n Excellent\n \n !\n\n \n\n\n

\n ' - '\n
\n') + '\n \n The \n Hound \n of \n the' + ' \n Baskervilles \n by' + ' \n Sir \n Arthur \n ' + 'Conan \n Doyle \n Aligned \n by\n : \n András \n ' + 'Farkas \n (\n fully ' + '\n reviewed\n ) \n\n\n \n Source&<>"\'\n \n :\n \n manybooks.' + 'netAudiobook\n available\n \n \n here\n \n\n\n\n\n\n \n Chapter\n ' + '1\n ' + 'Mr.\n Sherlock\n Holmes\n \n

\n\n \n Mr.\n ' + '' + 'Sherlock\n Holmes\n\n\n\n\n\n ' + '\n I\n \n \n believe\n \n

\n\n

\n\n \n ' + '"\n Excellent\n \n !\n\n \n\n\n

\n ' + '\n\n') with zipfile.ZipFile(os.path.join(self.tempdir1, - 'RF_v1_xml_en.zip'), 'w') as zf: + 'RF_v1_xml_en.zip'), 'w') as zf: zf.write(os.path.join(self.tempdir1, 'RF', 'xml', 'en', - '1996.xml'), arcname='RF/xml/en/1996.xml') + '1996.xml'), arcname='RF/xml/en/1996.xml') os.mkdir(os.path.join(self.tempdir1, 'RF', 'xml', 'sv')) with open(os.path.join(self.tempdir1, 'RF', 'xml', 'sv', - '1996.xml'), 'w') as f: + '1996.xml'), 'w') as f: f.write('\n\n \n The Hound of the Baskervilles \n by ' - 'Sir Arthur Conan Doyle \n Aligned by: András Farkas (fully ' - 'reviewed) \n \n \n \n\n ' - 'Source\n : \n Project \n Gutenberg\n\n\n\n Herra \n Sherlock' - ' \n Holmes\n .\n

\n\n Herra \n Sherlock \n Holmes\n\n \n\n Luulenpa \n että \n sinulla \n

\n' - '

\n\n "\n ' - 'Erinomaista\n .\n' - '

\n \n\n') + '>\n \n The Hound of the Baskervilles \n by ' + 'Sir Arthur Conan Doyle \n Aligned by: András Farkas (fully ' + 'reviewed) \n \n \n \n\n ' + 'Source\n : \n Project \n Gutenberg\n\n\n\n Herra \n Sherlock' + ' \n Holmes\n .\n

\n\n Herra \n Sherlock \n Holmes\n\n \n\n Luulenpa \n että \n sinulla \n

\n' + '

\n\n "\n ' + 'Erinomaista\n .\n' + '

\n \n\n') with zipfile.ZipFile(os.path.join(self.tempdir1, - 'RF_v1_xml_sv.zip'), 'w') as zf: + 'RF_v1_xml_sv.zip'), 'w') as zf: zf.write(os.path.join(self.tempdir1, 'RF', 'xml', 'sv', - '1996.xml'), arcname='RF/xml/sv/1996.xml') + '1996.xml'), arcname='RF/xml/sv/1996.xml') shutil.copyfile(os.path.join(self.tempdir1, 'RF_v1_xml_en.zip'), - os.path.join(self.tempdir1, 'en.zip')) + os.path.join(self.tempdir1, 'en.zip')) shutil.copyfile(os.path.join(self.tempdir1, 'RF_v1_xml_sv.zip'), - os.path.join(self.tempdir1, 'sv.zip')) + os.path.join(self.tempdir1, 'sv.zip')) with open(os.path.join(self.tempdir1, 'books_alignment.xml'), - 'w') as f: + 'w') as f: f.write('\n\n' - '\n\n\n\n\n\n\n \n\n') + 'cesAlign PUBLIC "-//CES//DTD XML cesAlign//EN" "">\n' + '\n\n\n\n\n\n\n \n\n') with gzip.open(os.path.join(self.tempdir1, - 'RF_v1_xml_en-sv.xml.gz'), 'wb') as f: + 'RF_v1_xml_en-sv.xml.gz'), 'wb') as f: with open(os.path.join(self.tempdir1, 'books_alignment.xml'), - 'rb') as b: + 'rb') as b: f.write(b.read()) with open(os.path.join(self.tempdir1, 'non_alignment.xml'), - 'w') as f: + 'w') as f: f.write('\n\n' - '\n\n\n\n\n ' - '\n\n') + 'cesAlign PUBLIC "-//CES//DTD XML cesAlign//EN" "">\n' + '\n\n\n\n\n ' + '\n\n') if ('OPUS_TEST_SAVE' in os.environ.keys() and os.path.exists(os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_ROOT))): - self.root_directory = os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_ROOT) + self.root_directory = os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_ROOT) else: self.root_directory = tempfile.mkdtemp() os.makedirs(os.path.join(self.root_directory, 'RF', 'latest', - 'xml')) + 'xml')) with gzip.open(os.path.join(self.root_directory, 'RF', 'latest', - 'xml', 'fi-sv.xml.gz'), 'wb') as f: + 'xml', 'fi-sv.xml.gz'), 'wb') as f: with open(os.path.join(self.tempdir1, 'books_alignment.xml'), - 'rb') as b: + 'rb') as b: f.write(b.read()) with gzip.open(os.path.join(self.root_directory, 'RF', 'latest', - 'xml', 'ab-cd.xml.gz'), 'wb') as f: + 'xml', 'ab-cd.xml.gz'), 'wb') as f: with open(os.path.join(self.tempdir1, 'books_alignment.xml'), - 'rb') as b: + 'rb') as b: f.write(b.read()) add_to_root_dir(corpus='RF', source='en', target='sv', - preprocess='xml', root_dir=self.root_directory) + preprocess='xml', root_dir=self.root_directory) add_to_root_dir(corpus='RF', source='en', target='es', - preprocess='xml', root_dir=self.root_directory) + preprocess='xml', root_dir=self.root_directory) os.mkdir(os.path.join(self.root_directory, 'RF', 'latest', 'raw')) add_to_root_dir(corpus='RF', source='en', target='sv', - preprocess='raw', root_dir=self.root_directory) + preprocess='raw', root_dir=self.root_directory) os.mkdir(os.path.join(self.root_directory, 'RF', 'latest', - 'parsed')) + 'parsed')) add_to_root_dir(corpus='RF', source='en', target='sv', - preprocess='parsed', root_dir=self.root_directory) + preprocess='parsed', root_dir=self.root_directory) os.makedirs(os.path.join(self.root_directory, 'RF', 'v1', 'xml')) add_to_root_dir(corpus='RF', source='en', target='sv', - version='v1', preprocess='xml', root_dir=self.root_directory) + version='v1', preprocess='xml', root_dir=self.root_directory) add_to_root_dir(corpus='RF', source='en', target='es', - version='v1', preprocess='xml', root_dir=self.root_directory) + version='v1', preprocess='xml', root_dir=self.root_directory) os.makedirs(os.path.join(self.root_directory, 'OpenSubtitles', - 'latest', 'raw')) + 'latest', 'raw')) os.makedirs(os.path.join(self.root_directory, 'OpenSubtitles', - 'latest', 'xml')) + 'latest', 'xml')) add_to_root_dir(corpus='OpenSubtitles', source='eo', target='tl', - preprocess='raw', root_dir=self.root_directory) + preprocess='raw', root_dir=self.root_directory) add_to_root_dir(corpus='OpenSubtitles', source='eo', target='tl', - preprocess='xml', root_dir=self.root_directory) + preprocess='xml', root_dir=self.root_directory) os.makedirs(os.path.join(self.root_directory, 'Books', - 'latest', 'xml')) + 'latest', 'xml')) add_to_root_dir(corpus='Books', source='eo', target='pt', - preprocess='xml', root_dir=self.root_directory) + preprocess='xml', root_dir=self.root_directory) add_to_root_dir(corpus='RF', source='fr', target='sv', - preprocess='xml', root_dir=self.root_directory) + preprocess='xml', root_dir=self.root_directory) os.remove(os.path.join(self.root_directory, 'RF', 'latest', 'xml', - 'fr.zip')) + 'fr.zip')) self.opr = OpusRead(directory='RF', source='en', target='sv', - root_directory=self.root_directory) + root_directory=self.root_directory) self.maxDiff= None @@ -283,23 +273,23 @@ def tearDown(self): def test_normal_xml_write(self): OpusRead(directory='RF', source='en', target='sv', maximum=2, - write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], - root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test_result'), - 'r') as f: + 'r') as f: self.assertEqual(f.read(), - '\n# en/1988.xml.gz\n' - '# sv/1988.xml.gz\n\n' - '================================\n(src)="s1.1">State' - 'ment of Government Policy by the Prime Minister , Mr' - ' Ingvar Carlsson , at the Opening of the Swedish Parl' - 'iament on Tuesday , 4 October , 1988 .\n(trg)="s1.1"' - '>REGERINGSFÖRKLARING .\n============================' - '====\n(src)="s2.1">Your Majesties , Your Royal Highn' - 'esses , Mr Speaker , Members of the Swedish Parliame' - 'nt .\n(trg)="s2.1">Eders Majestäter , Eders Kungliga' - ' Högheter , herr talman , ledamöter av Sveriges riks' - 'dag !\n================================\n') + '\n# en/1988.xml.gz\n' + '# sv/1988.xml.gz\n\n' + '================================\n(src)="s1.1">State' + 'ment of Government Policy by the Prime Minister , Mr' + ' Ingvar Carlsson , at the Opening of the Swedish Parl' + 'iament on Tuesday , 4 October , 1988 .\n(trg)="s1.1"' + '>REGERINGSFÖRKLARING .\n============================' + '====\n(src)="s2.1">Your Majesties , Your Royal Highn' + 'esses , Mr Speaker , Members of the Swedish Parliame' + 'nt .\n(trg)="s2.1">Eders Majestäter , Eders Kungliga' + ' Högheter , herr talman , ledamöter av Sveriges riks' + 'dag !\n================================\n') def test_normal_xml_write_link_end_and_linkGrp_end_on_same_line(self): same_line = os.path.join(self.tempdir1, 'test_files', 'sameline') @@ -315,12 +305,13 @@ def test_normal_xml_write_link_end_and_linkGrp_end_on_same_line(self): '\n') OpusRead(directory='RF', source='en', target='sv', - write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], - root_directory=self.root_directory, - alignment_file=same_line).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], + root_directory=self.root_directory, + alignment_file=same_line).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test_result'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\n# en/1988.xml.gz\n' '# sv/1988.xml.gz\n' '\n================================\n' @@ -340,43 +331,49 @@ def test_normal_xml_write_link_end_and_linkGrp_end_on_same_line(self): '================================\n') def test_normal_xml_write_verbose(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, write=[os.path.join( - self.tempdir1, 'test_files', 'test_result')], - root_directory=self.root_directory, verbose=True) - alignment=os.path.join(self.root_directory, 'RF', 'latest', 'xml', 'en-sv.xml.gz') - self.assertTrue('Parsing file "{}'.format(alignment) in var) - self.assertTrue('Parsing file "RF/xml/en/1988.xml"' in var) - self.assertTrue('Parsing file "RF/xml/sv/1988.xml"' in var) + with mock.patch('sys.stderr', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, write=[os.path.join( + self.tempdir1, 'test_files', 'test_result')], + root_directory=self.root_directory, verbose=True).printPairs() + alignment=os.path.join(self.root_directory, 'RF', 'latest', 'xml', 'en-sv.xml.gz') + var = output.getvalue() + self.assertIn('Parsing file "{}'.format(alignment), var) + self.assertIn('Parsing file "RF/xml/en/1988.xml"', var) + self.assertIn('Parsing file "RF/xml/sv/1988.xml"', var) def test_normal_xml_print(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, root_directory=self.root_directory) - self.assertTrue( - '\n# en/1988.xml.gz\n' - '# sv/1988.xml.gz\n\n' - '================================\n(src)="s1.1">State' - 'ment of Government Policy by the Prime Minister , Mr' - ' Ingvar Carlsson , at the Opening of the Swedish Parl' - 'iament on Tuesday , 4 October , 1988 .\n(trg)="s1.1"' - '>REGERINGSFÖRKLARING .\n============================' - '====\n' in var) - + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1988.xml.gz\n' + '# sv/1988.xml.gz\n\n' + '================================\n(src)="s1.1">State' + 'ment of Government Policy by the Prime Minister , Mr' + ' Ingvar Carlsson , at the Opening of the Swedish Parl' + 'iament on Tuesday , 4 October , 1988 .\n(trg)="s1.1"' + '>REGERINGSFÖRKLARING .\n============================' + '====\n', var) def test_normal_xml_print_verbose(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, root_directory=self.root_directory, verbose=True) - self.assertTrue('Parsing file "'+self.root_directory+'/RF/latest/xml/en-sv.xml.gz"' in var) - self.assertTrue('Parsing file "RF/xml/en/1988.xml"' in var) - self.assertTrue('Parsing file "RF/xml/sv/1988.xml"' in var) + with mock.patch('sys.stderr', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, root_directory=self.root_directory, verbose=True).printPairs() + var = output.getvalue() + self.assertIn('Parsing file "'+self.root_directory+'/RF/latest/xml/en-sv.xml.gz"', var) + self.assertIn('Parsing file "RF/xml/en/1988.xml"', var) + self.assertIn('Parsing file "RF/xml/sv/1988.xml"', var) def test_normal_raw_write(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], - preprocess='raw', root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], + preprocess='raw', root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test_result'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\n# en/1988.xml.gz\n' '# sv/1988.xml.gz\n\n' '================================\n(src)="s1.1">State' @@ -386,42 +383,46 @@ def test_normal_raw_write(self): '>REGERINGSFÖRKLARING.\n============================' '====\n') - def test_normal_raw_print(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, preprocess='raw', root_directory=self.root_directory) - self.assertTrue( - '\n# en/1988.xml.gz\n' - '# sv/1988.xml.gz\n\n' - '================================\n(src)="s1.1">State' - 'ment of Government Policy by the Prime Minister, Mr' - ' Ingvar Carlsson, at the Opening of the Swedish Parl' - 'iament on Tuesday, 4 October, 1988.\n(trg)="s1.1"' - '>REGERINGSFÖRKLARING.\n============================' - '====\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, preprocess='raw', root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1988.xml.gz\n' + '# sv/1988.xml.gz\n\n' + '================================\n(src)="s1.1">State' + 'ment of Government Policy by the Prime Minister, Mr' + ' Ingvar Carlsson, at the Opening of the Swedish Parl' + 'iament on Tuesday, 4 October, 1988.\n(trg)="s1.1"' + '>REGERINGSFÖRKLARING.\n============================' + '====\n', var) def test_normal_raw_print_OpenSubtitles(self): - var = pairPrinterToVariable(directory='OpenSubtitles', source='eo', - target='tl', maximum=1, preprocess='raw', - root_directory=self.root_directory) - self.assertTrue( - '\n# eo/2009/1187043/6483790.xml.gz\n' - '# tl/2009/1187043/6934998.xml.gz\n\n' - '================================\n' - '(src)="1">Ĉiuj nomoj, roluloj kaj eventoj reprezentitaj en ĉi ' - 'tiu filmo estas fikciaj.\n' - '================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='OpenSubtitles', source='eo', + target='tl', maximum=1, preprocess='raw', + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n# eo/2009/1187043/6483790.xml.gz\n' + '# tl/2009/1187043/6934998.xml.gz\n\n' + '================================\n' + '(src)="1">Ĉiuj nomoj, roluloj kaj eventoj reprezentitaj en ĉi ' + 'tiu filmo estas fikciaj.\n' + '================================\n', var) def test_normal_parsed_write(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - preprocess='parsed', print_annotations=True, - source_annotations=['upos', 'feats', 'lemma'], - target_annotations=['upos', 'feats', 'lemma'], - write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], - root_directory=self.root_directory).printPairs() + preprocess='parsed', print_annotations=True, + source_annotations=['upos', 'feats', 'lemma'], + target_annotations=['upos', 'feats', 'lemma'], + write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test_result'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n' '================================' '\n(src)="s1.1">Statement|NOUN|Number=Sing|statement ' @@ -442,110 +443,115 @@ def test_normal_parsed_write(self): '=Neut|Number=Sing|Regeringsförklaring .|PUNCT|.' '\n================================\n') - def test_normal_parsed_print(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, preprocess='parsed', print_annotations=True, - source_annotations=['upos', 'feats', 'lemma'], - target_annotations=['upos', 'feats', 'lemma'], - root_directory=self.root_directory) - self.assertTrue( - '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n' - '================================' - '\n(src)="s1.1">Statement|NOUN|Number=Sing|statement ' - 'of|ADP|of Government|NOUN|Number=Sing|government Pol' - 'icy|NOUN|Number=Sing|policy by|ADP|by the|DET|Defini' - 'te=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prim' - 'e Minister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|P' - 'ROPN|Number=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar ' - 'Carlsson|PROPN|Number=Sing|Carlsson ,|PUNCT|, at|ADP' - '|at the|DET|Definite=Def|PronType=Art|the Opening|NO' - 'UN|Number=Sing|opening of|ADP|of the|DET|Definite=De' - 'f|PronType=Art|the Swedish|ADJ|Degree=Pos|swedish Pa' - 'rliament|NOUN|Number=Sing|parliament on|ADP|on Tuesd' - 'ay|PROPN|Number=Sing|Tuesday ,|PUNCT|, 4|NUM|NumType' - '=Card|4 October|PROPN|Number=Sing|October ,|PUNCT|, ' - '1988|NUM|NumType=Card|1988 .|PUNCT|.\n(trg)="s1.1">R' - 'EGERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender' - '=Neut|Number=Sing|Regeringsförklaring .|PUNCT|.' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, preprocess='parsed', print_annotations=True, + source_annotations=['upos', 'feats', 'lemma'], + target_annotations=['upos', 'feats', 'lemma'], + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n' + '================================' + '\n(src)="s1.1">Statement|NOUN|Number=Sing|statement ' + 'of|ADP|of Government|NOUN|Number=Sing|government Pol' + 'icy|NOUN|Number=Sing|policy by|ADP|by the|DET|Defini' + 'te=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prim' + 'e Minister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|P' + 'ROPN|Number=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar ' + 'Carlsson|PROPN|Number=Sing|Carlsson ,|PUNCT|, at|ADP' + '|at the|DET|Definite=Def|PronType=Art|the Opening|NO' + 'UN|Number=Sing|opening of|ADP|of the|DET|Definite=De' + 'f|PronType=Art|the Swedish|ADJ|Degree=Pos|swedish Pa' + 'rliament|NOUN|Number=Sing|parliament on|ADP|on Tuesd' + 'ay|PROPN|Number=Sing|Tuesday ,|PUNCT|, 4|NUM|NumType' + '=Card|4 October|PROPN|Number=Sing|October ,|PUNCT|, ' + '1988|NUM|NumType=Card|1988 .|PUNCT|.\n(trg)="s1.1">R' + 'EGERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender' + '=Neut|Number=Sing|Regeringsförklaring .|PUNCT|.' + '\n================================\n', var) def test_normal_parsed_print_unalphabetical(self): - var = pairPrinterToVariable(directory='RF', source='sv', target='en', - maximum=1, preprocess='parsed', print_annotations=True, - source_annotations=['upos', 'feats', 'lemma'], - target_annotations=['upos', 'feats', 'lemma'], - root_directory=self.root_directory) - self.assertTrue( - '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n' - '================================' - '\n(src)="s1.1">REGERINGSFÖRKLARING|NOUN|Case=Nom|Definit' - 'e=Ind|Gender=Neut|Number=Sing|Regeringsförklaring .|PUNC' - 'T|.\n(trg)="s1.1">Statement|NOUN|Number=Sing|statement ' - 'of|ADP|of Government|NOUN|Number=Sing|government Pol' - 'icy|NOUN|Number=Sing|policy by|ADP|by the|DET|Defini' - 'te=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prim' - 'e Minister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|P' - 'ROPN|Number=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar ' - 'Carlsson|PROPN|Number=Sing|Carlsson ,|PUNCT|, at|ADP' - '|at the|DET|Definite=Def|PronType=Art|the Opening|NO' - 'UN|Number=Sing|opening of|ADP|of the|DET|Definite=De' - 'f|PronType=Art|the Swedish|ADJ|Degree=Pos|swedish Pa' - 'rliament|NOUN|Number=Sing|parliament on|ADP|on Tuesd' - 'ay|PROPN|Number=Sing|Tuesday ,|PUNCT|, 4|NUM|NumType' - '=Card|4 October|PROPN|Number=Sing|October ,|PUNCT|, ' - '1988|NUM|NumType=Card|1988 .|PUNCT|.' - '\n================================\n' in var) - + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='sv', target='en', + maximum=1, preprocess='parsed', print_annotations=True, + source_annotations=['upos', 'feats', 'lemma'], + target_annotations=['upos', 'feats', 'lemma'], + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n' + '================================' + '\n(src)="s1.1">REGERINGSFÖRKLARING|NOUN|Case=Nom|Definit' + 'e=Ind|Gender=Neut|Number=Sing|Regeringsförklaring .|PUNC' + 'T|.\n(trg)="s1.1">Statement|NOUN|Number=Sing|statement ' + 'of|ADP|of Government|NOUN|Number=Sing|government Pol' + 'icy|NOUN|Number=Sing|policy by|ADP|by the|DET|Defini' + 'te=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prim' + 'e Minister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|P' + 'ROPN|Number=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar ' + 'Carlsson|PROPN|Number=Sing|Carlsson ,|PUNCT|, at|ADP' + '|at the|DET|Definite=Def|PronType=Art|the Opening|NO' + 'UN|Number=Sing|opening of|ADP|of the|DET|Definite=De' + 'f|PronType=Art|the Swedish|ADJ|Degree=Pos|swedish Pa' + 'rliament|NOUN|Number=Sing|parliament on|ADP|on Tuesd' + 'ay|PROPN|Number=Sing|Tuesday ,|PUNCT|, 4|NUM|NumType' + '=Card|4 October|PROPN|Number=Sing|October ,|PUNCT|, ' + '1988|NUM|NumType=Card|1988 .|PUNCT|.' + '\n================================\n', var) def test_normal_parsed_print_all_attributes(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, preprocess='parsed', print_annotations=True, - source_annotations=['all_attrs'], target_annotations=['all_attrs'], - root_directory=self.root_directory) - self.assertTrue( - '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n' - '================================' - '\n(src)="s1.1">Statement|root|Number=Sing|0|w1.1.1|state' - 'ment|NOUN|NOUN of|case|w1.1.4|w1.1.2|of|ADP|ADP Governme' - 'nt|compound|Number=Sing|w1.1.4|w1.1.3|government|NOUN|NO' - 'UN Policy|nmod|Number=Sing|w1.1.1|w1.1.4|policy|NOUN|NOU' - 'N by|case|w1.1.8|w1.1.5|by|ADP|ADP the|det|Definite=Def|' - 'PronType=Art|w1.1.8|w1.1.6|the|DET|DET Prime|compound|Nu' - 'mber=Sing|w1.1.8|w1.1.7|Prime|PROPN|PROPN Minister|nmod|' - 'Number=Sing|w1.1.1|w1.1.8|Minister|SpaceAfter=No|PROPN|P' - 'ROPN ,|punct|w1.1.8|w1.1.9|,|PUNCT|PUNCT Mr|compound|Num' - 'ber=Sing|w1.1.12|w1.1.10|Mr|PROPN|PROPN Ingvar|flat|Numb' - 'er=Sing|w1.1.10|w1.1.11|Ingvar|PROPN|PROPN Carlsson|flat' - '|Number=Sing|w1.1.8|w1.1.12|Carlsson|SpaceAfter=No|PROPN' - '|PROPN ,|punct|w1.1.1|w1.1.13|,|PUNCT|PUNCT at|case|w1.1' - '.16|w1.1.14|at|ADP|ADP the|det|Definite=Def|PronType=Art' - '|w1.1.16|w1.1.15|the|DET|DET Opening|nmod|Number=Sing|w1' - '.1.1|w1.1.16|opening|NOUN|NOUN of|case|w1.1.20|w1.1.17|o' - 'f|ADP|ADP the|det|Definite=Def|PronType=Art|w1.1.20|w1.1' - '.18|the|DET|DET Swedish|amod|Degree=Pos|w1.1.20|w1.1.19|' - 'swedish|ADJ|ADJ Parliament|nmod|Number=Sing|w1.1.16|w1.1' - '.20|parliament|NOUN|NOUN on|case|w1.1.22|w1.1.21|on|ADP|' - 'ADP Tuesday|nmod|Number=Sing|w1.1.16|w1.1.22|Tuesday|Spa' - 'ceAfter=No|PROPN|PROPN ,|punct|w1.1.1|w1.1.23|,|PUNCT|PU' - 'NCT 4|nummod|NumType=Card|w1.1.25|w1.1.24|4|NUM|NUM Octo' - 'ber|appos|Number=Sing|w1.1.1|w1.1.25|October|SpaceAfter=' - 'No|PROPN|PROPN ,|punct|w1.1.25|w1.1.26|,|PUNCT|PUNCT 198' - '8|nummod|NumType=Card|w1.1.25|w1.1.27|1988|SpaceAfter=No' - '|NUM|NUM .|punct|w1.1.1|w1.1.28|.|SpaceAfter=No|PUNCT|PU' - 'NCT\n(trg)="s1.1">REGERINGSFÖRKLARING|root|Case=Nom|Defini' - 'te=Ind|Gender=Neut|Number=Sing|0|w1.1.1|Regeringsförklar' - 'ing|SpaceAfter=No|NOUN|NOUN .|punct|w1.1.1|w1.1.2|.|Spac' - 'eAfter=No|PUNCT|PUNCT' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, preprocess='parsed', print_annotations=True, + source_annotations=['all_attrs'], target_annotations=['all_attrs'], + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n' + '================================' + '\n(src)="s1.1">Statement|root|Number=Sing|0|w1.1.1|state' + 'ment|NOUN|NOUN of|case|w1.1.4|w1.1.2|of|ADP|ADP Governme' + 'nt|compound|Number=Sing|w1.1.4|w1.1.3|government|NOUN|NO' + 'UN Policy|nmod|Number=Sing|w1.1.1|w1.1.4|policy|NOUN|NOU' + 'N by|case|w1.1.8|w1.1.5|by|ADP|ADP the|det|Definite=Def|' + 'PronType=Art|w1.1.8|w1.1.6|the|DET|DET Prime|compound|Nu' + 'mber=Sing|w1.1.8|w1.1.7|Prime|PROPN|PROPN Minister|nmod|' + 'Number=Sing|w1.1.1|w1.1.8|Minister|SpaceAfter=No|PROPN|P' + 'ROPN ,|punct|w1.1.8|w1.1.9|,|PUNCT|PUNCT Mr|compound|Num' + 'ber=Sing|w1.1.12|w1.1.10|Mr|PROPN|PROPN Ingvar|flat|Numb' + 'er=Sing|w1.1.10|w1.1.11|Ingvar|PROPN|PROPN Carlsson|flat' + '|Number=Sing|w1.1.8|w1.1.12|Carlsson|SpaceAfter=No|PROPN' + '|PROPN ,|punct|w1.1.1|w1.1.13|,|PUNCT|PUNCT at|case|w1.1' + '.16|w1.1.14|at|ADP|ADP the|det|Definite=Def|PronType=Art' + '|w1.1.16|w1.1.15|the|DET|DET Opening|nmod|Number=Sing|w1' + '.1.1|w1.1.16|opening|NOUN|NOUN of|case|w1.1.20|w1.1.17|o' + 'f|ADP|ADP the|det|Definite=Def|PronType=Art|w1.1.20|w1.1' + '.18|the|DET|DET Swedish|amod|Degree=Pos|w1.1.20|w1.1.19|' + 'swedish|ADJ|ADJ Parliament|nmod|Number=Sing|w1.1.16|w1.1' + '.20|parliament|NOUN|NOUN on|case|w1.1.22|w1.1.21|on|ADP|' + 'ADP Tuesday|nmod|Number=Sing|w1.1.16|w1.1.22|Tuesday|Spa' + 'ceAfter=No|PROPN|PROPN ,|punct|w1.1.1|w1.1.23|,|PUNCT|PU' + 'NCT 4|nummod|NumType=Card|w1.1.25|w1.1.24|4|NUM|NUM Octo' + 'ber|appos|Number=Sing|w1.1.1|w1.1.25|October|SpaceAfter=' + 'No|PROPN|PROPN ,|punct|w1.1.25|w1.1.26|,|PUNCT|PUNCT 198' + '8|nummod|NumType=Card|w1.1.25|w1.1.27|1988|SpaceAfter=No' + '|NUM|NUM .|punct|w1.1.1|w1.1.28|.|SpaceAfter=No|PUNCT|PU' + 'NCT\n(trg)="s1.1">REGERINGSFÖRKLARING|root|Case=Nom|Defini' + 'te=Ind|Gender=Neut|Number=Sing|0|w1.1.1|Regeringsförklar' + 'ing|SpaceAfter=No|NOUN|NOUN .|punct|w1.1.1|w1.1.2|.|Spac' + 'eAfter=No|PUNCT|PUNCT' + '\n================================\n', var) def test_tmx_xml_write(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], - write_mode='tmx', root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], + write_mode='tmx', root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test_result'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\n' '\n
\n\t\n\t\t' @@ -558,11 +564,12 @@ def test_tmx_xml_write(self): def test_tmx_xml_write_unalphabetical(self): OpusRead(directory='RF', source='sv', target='en', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], - write_mode='tmx', root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], + write_mode='tmx', root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test_result'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\n' '\n
\n\t\n\t\t' @@ -574,54 +581,61 @@ def test_tmx_xml_write_unalphabetical(self): 'day , 4 October , 1988 .' '\n\t\t\n\t\n\n') - def test_tmx_xml_print(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, write_mode='tmx', root_directory=self.root_directory) - self.assertTrue( - '\n' - '\n
\n\t' in var) - self.assertTrue('' - '\n\t\t\tStatement of Governm' - 'ent Policy by the Prime Minister , Mr Ingvar Carlsso' - 'n , at the Opening of the Swedish Parliament on Tues' - 'day , 4 October , 1988 .' - '\n\t\t\tREGERING' - 'SFÖRKLARING .\n\t\t\n\t\n\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, write_mode='tmx', root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n' + '\n
\n\t', var) + self.assertIn( + '' + '\n\t\t\tStatement of Governm' + 'ent Policy by the Prime Minister , Mr Ingvar Carlsso' + 'n , at the Opening of the Swedish Parliament on Tues' + 'day , 4 October , 1988 .' + '\n\t\t\tREGERING' + 'SFÖRKLARING .\n\t\t\n\t\n\n', var) def test_tmx_xml_print_verbose(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, write_mode='tmx', root_directory=self.root_directory, - verbose=True) - self.assertTrue('Parsing file "'+self.root_directory+'/RF/latest/xml/en-sv.xml.gz"' in var) - self.assertTrue('Parsing file "RF/xml/en/1988.xml"' in var) - self.assertTrue('Parsing file "RF/xml/sv/1988.xml"' in var) + with mock.patch('sys.stderr', new=io.StringIO()) as output: + var = pairPrinterToVariable( + directory='RF', source='en', target='sv', + maximum=1, write_mode='tmx', root_directory=self.root_directory, + verbose=True) + var = output.getvalue() + self.assertIn('Parsing file "'+self.root_directory+'/RF/latest/xml/en-sv.xml.gz"', var) + self.assertIn('Parsing file "RF/xml/en/1988.xml"', var) + self.assertIn('Parsing file "RF/xml/sv/1988.xml"', var) def test_tmx_xml_print_unalphabetical(self): - var = pairPrinterToVariable(directory='RF', source='sv', target='en', - maximum=1, write_mode='tmx', root_directory=self.root_directory) - self.assertTrue( - '\n' - '\n
\n\t\n\t\t' - '\n\t\t\tREGERING' - 'SFÖRKLARING .\n\t\t\tStatement of Governm' - 'ent Policy by the Prime Minister , Mr Ingvar Carlsso' - 'n , at the Opening of the Swedish Parliament on Tues' - 'day , 4 October , 1988 .' - '\n\t\t\n\t\n\n') - + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='sv', target='en', + maximum=1, write_mode='tmx', root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n' + '\n
\n\t\n\t\t' + '\n\t\t\tREGERING' + 'SFÖRKLARING .\n\t\t\tStatement of Governm' + 'ent Policy by the Prime Minister , Mr Ingvar Carlsso' + 'n , at the Opening of the Swedish Parliament on Tues' + 'day , 4 October , 1988 .' + '\n\t\t\n\t\n\n', var) def test_tmx_raw_write(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write_mode='tmx', - write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], - preprocess='raw', root_directory=self.root_directory).printPairs() + write_mode='tmx', + write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], + preprocess='raw', root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test_result'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\n' '\n
\n\t\n\t\t' @@ -633,32 +647,34 @@ def test_tmx_raw_write(self): 'SFÖRKLARING.\n\t\t\n\t\n\n') def test_tmx_raw_print(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, write_mode='tmx', preprocess='raw', - root_directory=self.root_directory) - self.assertTrue( - '\n' - '\n
\n\t' in var) - self.assertTrue( - 'Statement of Governm' - 'ent Policy by the Prime Minister, Mr Ingvar Carlsso' - 'n, at the Opening of the Swedish Parliament on Tues' - 'day, 4 October, 1988.' - '\n\t\t\tREGERING' - 'SFÖRKLARING.\n\t\t\n\t\n\n' in var) - + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, write_mode='tmx', preprocess='raw', + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n' + '\n
\n\t', var) + self.assertIn( + 'Statement of Governm' + 'ent Policy by the Prime Minister, Mr Ingvar Carlsso' + 'n, at the Opening of the Swedish Parliament on Tues' + 'day, 4 October, 1988.' + '\n\t\t\tREGERING' + 'SFÖRKLARING.\n\t\t\n\t\n\n', var) def test_tmx_parsed_write(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], - write_mode='tmx', preprocess='parsed', print_annotations=True, - source_annotations=['upos', 'feats', 'lemma'], - target_annotations=['upos', 'feats', 'lemma'], - root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], + write_mode='tmx', preprocess='parsed', print_annotations=True, + source_annotations=['upos', 'feats', 'lemma'], + target_annotations=['upos', 'feats', 'lemma'], + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test_result'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\n' '\n
\n\t\n\t\t' @@ -682,118 +698,122 @@ def test_tmx_parsed_write(self): '\n\t\t\n\t\n\n') def test_tmx_parsed_print(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, write_mode='tmx', preprocess='parsed', - print_annotations=True, - source_annotations=['upos', 'feats', 'lemma'], - target_annotations=['upos', 'feats', 'lemma'], - root_directory=self.root_directory) - self.assertTrue( - '\n' - '\n
\n\t' in var) - self.assertTrue('' - '\n\t\t\tStatement|NOUN|Numbe' - 'r=Sing|statement of|ADP|of Government|NOUN|Number=Si' - 'ng|government Policy|NOUN|Number=Sing|policy by|ADP|' - 'by the|DET|Definite=Def|PronType=Art|the Prime|PROPN' - '|Number=Sing|Prime Minister|PROPN|Number=Sing|Minist' - 'er ,|PUNCT|, Mr|PROPN|Number=Sing|Mr Ingvar|PROPN|Nu' - 'mber=Sing|Ingvar Carlsson|PROPN|Number=Sing|Carlsson ' - ',|PUNCT|, at|ADP|at the|DET|Definite=Def|PronType=Ar' - 't|the Opening|NOUN|Number=Sing|opening of|ADP|of the' - '|DET|Definite=Def|PronType=Art|the Swedish|ADJ|Degre' - 'e=Pos|swedish Parliament|NOUN|Number=Sing|parliament ' - 'on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PUNCT|' - ', 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct' - 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.<' - '/seg>\n\t\t\tREGERINGS' - 'FÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Neut|Nu' - 'mber=Sing|Regeringsförklaring .|PUNCT|.' - '\n\t\t\n\t\n\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, write_mode='tmx', preprocess='parsed', + print_annotations=True, + source_annotations=['upos', 'feats', 'lemma'], + target_annotations=['upos', 'feats', 'lemma'], + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n' + '\n
\n\t', var) + self.assertIn( + '' + '\n\t\t\tStatement|NOUN|Numbe' + 'r=Sing|statement of|ADP|of Government|NOUN|Number=Si' + 'ng|government Policy|NOUN|Number=Sing|policy by|ADP|' + 'by the|DET|Definite=Def|PronType=Art|the Prime|PROPN' + '|Number=Sing|Prime Minister|PROPN|Number=Sing|Minist' + 'er ,|PUNCT|, Mr|PROPN|Number=Sing|Mr Ingvar|PROPN|Nu' + 'mber=Sing|Ingvar Carlsson|PROPN|Number=Sing|Carlsson ' + ',|PUNCT|, at|ADP|at the|DET|Definite=Def|PronType=Ar' + 't|the Opening|NOUN|Number=Sing|opening of|ADP|of the' + '|DET|Definite=Def|PronType=Art|the Swedish|ADJ|Degre' + 'e=Pos|swedish Parliament|NOUN|Number=Sing|parliament ' + 'on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PUNCT|' + ', 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct' + 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.<' + '/seg>\n\t\t\tREGERINGS' + 'FÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Neut|Nu' + 'mber=Sing|Regeringsförklaring .|PUNCT|.' + '\n\t\t\n\t\n\n', var) def test_moses_xml_write(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test.src'), - os.path.join(self.tempdir1, 'test_files', 'test.trg')], - write_mode='moses', root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test.src'), + os.path.join(self.tempdir1, 'test_files', 'test.trg')], + write_mode='moses', root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test.src'), - 'r') as f: + 'r') as f: self.assertEqual(f.read(), - 'Statement of Government Policy by the Prime Minister , ' - 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli' - 'ament on Tuesday , 4 October , 1988 .\n') + 'Statement of Government Policy by the Prime Minister , ' + 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli' + 'ament on Tuesday , 4 October , 1988 .\n') with open(os.path.join(self.tempdir1, 'test_files', 'test.trg'), - 'r') as f: + 'r') as f: self.assertEqual(f.read(), 'REGERINGSFÖRKLARING .\n') def test_moses_xml_write_unalphabetical(self): OpusRead(directory='RF', source='sv', target='en', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test.src'), - os.path.join(self.tempdir1, 'test_files', 'test.trg')], - write_mode='moses', root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test.src'), + os.path.join(self.tempdir1, 'test_files', 'test.trg')], + write_mode='moses', root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test.trg'), - 'r') as f: + 'r') as f: self.assertEqual(f.read(), - 'Statement of Government Policy by the Prime Minister , ' - 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli' - 'ament on Tuesday , 4 October , 1988 .\n') + 'Statement of Government Policy by the Prime Minister , ' + 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli' + 'ament on Tuesday , 4 October , 1988 .\n') with open(os.path.join(self.tempdir1, 'test_files', 'test.src'), - 'r') as f: + 'r') as f: self.assertEqual(f.read(), 'REGERINGSFÖRKLARING .\n') def test_moses_xml_write_with_file_names(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test.src'), - os.path.join(self.tempdir1, 'test_files', 'test.trg')], - write_mode='moses', print_file_names=True, - root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test.src'), + os.path.join(self.tempdir1, 'test_files', 'test.trg')], + write_mode='moses', print_file_names=True, + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test.src'), - 'r') as f: + 'r') as f: self.assertEqual(f.read(), - '\nen/1988.xml.gz\n\nStatement of Gover' - 'nment Policy by the Prime Minister , ' - 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli' - 'ament on Tuesday , 4 October , 1988 .\n') + '\nen/1988.xml.gz\n\nStatement of Gover' + 'nment Policy by the Prime Minister , ' + 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli' + 'ament on Tuesday , 4 October , 1988 .\n') with open(os.path.join(self.tempdir1, 'test_files', 'test.trg'), - 'r') as f: + 'r') as f: self.assertEqual(f.read(), - '\nsv/1988.xml.gz\n\nREGERINGSFÖRKLARING .\n') + '\nsv/1988.xml.gz\n\nREGERINGSFÖRKLARING .\n') def test_moses_xml_write_single_file(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test.src')], - write_mode='moses', root_directory=self.root_directory - ).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test.src')], + write_mode='moses', root_directory=self.root_directory + ).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test.src'), - 'r') as f: + 'r') as f: self.assertEqual(f.read(), - 'Statement of Government Policy by the Prime Minister , ' - 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli' - 'ament on Tuesday , 4 October , 1988 .\tREGERINGSFÖRK' - 'LARING .\n') + 'Statement of Government Policy by the Prime Minister , ' + 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli' + 'ament on Tuesday , 4 October , 1988 .\tREGERINGSFÖRK' + 'LARING .\n') def test_moses_xml_write_single_file_unalphabetical(self): OpusRead(directory='RF', source='sv', target='en', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test.src')], - write_mode='moses', root_directory=self.root_directory - ).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test.src')], + write_mode='moses', root_directory=self.root_directory + ).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test.src'), - 'r') as f: + 'r') as f: self.assertEqual(f.read(), - 'REGERINGSFÖRKLARING .\tStatement of Government Poli' - 'cy by the Prime Minister , Mr Ingvar Carlsson , at t' - 'he Opening of the Swedish Parliament on Tuesday , 4 ' - 'October , 1988 .\n') + 'REGERINGSFÖRKLARING .\tStatement of Government Poli' + 'cy by the Prime Minister , Mr Ingvar Carlsson , at t' + 'he Opening of the Swedish Parliament on Tuesday , 4 ' + 'October , 1988 .\n') def test_moses_xml_write_single_file_with_file_names(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test.src')], - write_mode='moses', print_file_names=True, - root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test.src')], + write_mode='moses', print_file_names=True, + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test.src'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\nen/1988.xml.gz\nsv/1988' '.xml.gz\n\nStatement of Government Policy by' ' the Prime Minister , Mr Ingvar Carlsson , at the Ope' @@ -802,13 +822,14 @@ def test_moses_xml_write_single_file_with_file_names(self): def test_moses_xml_write_single_file_with_file_names_unalphabetical(self): OpusRead(directory='RF', source='sv', target='en', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test.src')], - write_mode='moses', print_file_names=True, - root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test.src')], + write_mode='moses', print_file_names=True, + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, - 'test_files', 'test.src'), - 'r') as f: - self.assertEqual(f.read(), + 'test_files', 'test.src'), + 'r') as f: + self.assertEqual( + f.read(), '\nen/1988.xml.gz\nsv/1988' '.xml.gz\n\nREGERINGSFÖRKLARING .\tStatement ' 'of Government Policy by the Prime Minister , Mr Ingv' @@ -816,119 +837,134 @@ def test_moses_xml_write_single_file_with_file_names_unalphabetical(self): 'nt on Tuesday , 4 October , 1988 .\n') def test_moses_xml_print(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, write_mode='moses', root_directory=self.root_directory) - self.assertTrue( - 'Statement of Government Policy by the Prime Minister , ' - 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli' - 'ament on Tuesday , 4 October , 1988 .\t' - 'REGERINGSFÖRKLARING .\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, write_mode='moses', root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + 'Statement of Government Policy by the Prime Minister , ' + 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli' + 'ament on Tuesday , 4 October , 1988 .\t' + 'REGERINGSFÖRKLARING .\n', var) def test_moses_xml_print_unalphabetical(self): - var = pairPrinterToVariable(directory='RF', source='sv', target='en', - maximum=1, write_mode='moses', root_directory=self.root_directory) - self.assertTrue( - 'REGERINGSFÖRKLARING .\tStatement of Government Policy b' - 'y the Prime Minister , Mr Ingvar Carlsson , at the Openi' - 'ng of the Swedish Parliament on Tuesday , 4 October , 1988 .\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='sv', target='en', + maximum=1, write_mode='moses', root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + 'REGERINGSFÖRKLARING .\tStatement of Government Policy b' + 'y the Prime Minister , Mr Ingvar Carlsson , at the Openi' + 'ng of the Swedish Parliament on Tuesday , 4 October , 1988 .\n', var) def test_moses_xml_print_with_file_names(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, write_mode='moses', print_file_names=True, - root_directory=self.root_directory) - self.assertTrue( - '\nen/1988.xml.gz\nsv/1988' - '.xml.gz\n\nStatement of Government Policy by' - ' the Prime Minister , Mr Ingvar Carlsson , at the Ope' - 'ning of the Swedish Parliament on Tuesday , 4 Octobe' - 'r , 1988 .\tREGERINGSFÖRKLARING .\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, write_mode='moses', print_file_names=True, + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\nen/1988.xml.gz\nsv/1988' + '.xml.gz\n\nStatement of Government Policy by' + ' the Prime Minister , Mr Ingvar Carlsson , at the Ope' + 'ning of the Swedish Parliament on Tuesday , 4 Octobe' + 'r , 1988 .\tREGERINGSFÖRKLARING .\n', var) def test_moses_raw_write(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write_mode='moses', - write=[os.path.join(self.tempdir1, 'test_files', 'test.src'), - os.path.join(self.tempdir1, 'test_files', 'test.trg')], - preprocess='raw', root_directory=self.root_directory).printPairs() + write_mode='moses', + write=[os.path.join(self.tempdir1, 'test_files', 'test.src'), + os.path.join(self.tempdir1, 'test_files', 'test.trg')], + preprocess='raw', root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, - 'test_files', 'test.src'), 'r') as f: - self.assertEqual(f.read(), - 'Statement of Government Policy by the Prime Minister, ' - 'Mr Ingvar Carlsson, at the Opening of the Swedish Parli' - 'ament on Tuesday, 4 October, 1988.\n') + 'test_files', 'test.src'), 'r') as f: + self.assertEqual( + f.read(), + 'Statement of Government Policy by the Prime Minister, ' + 'Mr Ingvar Carlsson, at the Opening of the Swedish Parli' + 'ament on Tuesday, 4 October, 1988.\n') with open(os.path.join(self.tempdir1, - 'test_files', 'test.trg'), 'r') as f: + 'test_files', 'test.trg'), 'r') as f: self.assertEqual(f.read(), 'REGERINGSFÖRKLARING.\n') def test_moses_raw_print(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, write_mode='moses', preprocess='raw', - root_directory=self.root_directory) - self.assertTrue( - 'Statement of Government Policy by the Prime Minister, ' - 'Mr Ingvar Carlsson, at the Opening of the Swedish Parli' - 'ament on Tuesday, 4 October, 1988.\t' - 'REGERINGSFÖRKLARING.\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, write_mode='moses', preprocess='raw', + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + 'Statement of Government Policy by the Prime Minister, ' + 'Mr Ingvar Carlsson, at the Opening of the Swedish Parli' + 'ament on Tuesday, 4 October, 1988.\t' + 'REGERINGSFÖRKLARING.\n', var) + def test_moses_parsed_write(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test.src'), - os.path.join(self.tempdir1, 'test_files', 'test.trg')], - write_mode='moses', preprocess='parsed', print_annotations=True, - source_annotations=['upos', 'feats', 'lemma'], - target_annotations=['upos', 'feats', 'lemma'], - root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test.src'), + os.path.join(self.tempdir1, 'test_files', 'test.trg')], + write_mode='moses', preprocess='parsed', print_annotations=True, + source_annotations=['upos', 'feats', 'lemma'], + target_annotations=['upos', 'feats', 'lemma'], + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test.src'), - 'r') as f: - self.assertEqual(f.read(), 'Statement|NOUN|Number=Sing|st' - 'atement of|ADP|of Government|NOUN|Number=Sing|government' - ' Policy|NOUN|Number=Sing|policy by|ADP|by the|DET|Definit' - 'e=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prime Min' - 'ister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|PROPN|Numb' - 'er=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar Carlsson|PROP' - 'N|Number=Sing|Carlsson ,|PUNCT|, at|ADP|at the|DET|Defin' - 'ite=Def|PronType=Art|the Opening|NOUN|Number=Sing|openin' - 'g of|ADP|of the|DET|Definite=Def|PronType=Art|the Swedis' - 'h|ADJ|Degree=Pos|swedish Parliament|NOUN|Number=Sing|par' - 'liament on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PU' - 'NCT|, 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct' - 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.\n') + 'r') as f: + self.assertEqual( + f.read(), 'Statement|NOUN|Number=Sing|st' + 'atement of|ADP|of Government|NOUN|Number=Sing|government' + ' Policy|NOUN|Number=Sing|policy by|ADP|by the|DET|Definit' + 'e=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prime Min' + 'ister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|PROPN|Numb' + 'er=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar Carlsson|PROP' + 'N|Number=Sing|Carlsson ,|PUNCT|, at|ADP|at the|DET|Defin' + 'ite=Def|PronType=Art|the Opening|NOUN|Number=Sing|openin' + 'g of|ADP|of the|DET|Definite=Def|PronType=Art|the Swedis' + 'h|ADJ|Degree=Pos|swedish Parliament|NOUN|Number=Sing|par' + 'liament on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PU' + 'NCT|, 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct' + 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.\n') with open(os.path.join(self.tempdir1, - 'test_files', 'test.trg'), - 'r') as f: - self.assertEqual(f.read(), - 'REGERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Ne' - 'ut|Number=Sing|Regeringsförklaring .|PUNCT|.\n') + 'test_files', 'test.trg'), + 'r') as f: + self.assertEqual( + f.read(), + 'REGERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Ne' + 'ut|Number=Sing|Regeringsförklaring .|PUNCT|.\n') def test_moses_parsed_print(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, write_mode='moses', preprocess='parsed', - print_annotations=True, - source_annotations=['upos', 'feats', 'lemma'], - target_annotations=['upos', 'feats', 'lemma'], - root_directory=self.root_directory) - self.assertTrue( - 'Statement|NOUN|Number=Sing|st' - 'atement of|ADP|of Government|NOUN|Number=Sing|government' - ' Policy|NOUN|Number=Sing|policy by|ADP|by the|DET|Definit' - 'e=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prime Min' - 'ister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|PROPN|Numb' - 'er=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar Carlsson|PROP' - 'N|Number=Sing|Carlsson ,|PUNCT|, at|ADP|at the|DET|Defin' - 'ite=Def|PronType=Art|the Opening|NOUN|Number=Sing|openin' - 'g of|ADP|of the|DET|Definite=Def|PronType=Art|the Swedis' - 'h|ADJ|Degree=Pos|swedish Parliament|NOUN|Number=Sing|par' - 'liament on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PU' - 'NCT|, 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct' - 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.\tREG' - 'ERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Ne' - 'ut|Number=Sing|Regeringsförklaring .|PUNCT|.\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, write_mode='moses', preprocess='parsed', + print_annotations=True, + source_annotations=['upos', 'feats', 'lemma'], + target_annotations=['upos', 'feats', 'lemma'], + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + 'Statement|NOUN|Number=Sing|st' + 'atement of|ADP|of Government|NOUN|Number=Sing|government' + ' Policy|NOUN|Number=Sing|policy by|ADP|by the|DET|Definit' + 'e=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prime Min' + 'ister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|PROPN|Numb' + 'er=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar Carlsson|PROP' + 'N|Number=Sing|Carlsson ,|PUNCT|, at|ADP|at the|DET|Defin' + 'ite=Def|PronType=Art|the Opening|NOUN|Number=Sing|openin' + 'g of|ADP|of the|DET|Definite=Def|PronType=Art|the Swedis' + 'h|ADJ|Degree=Pos|swedish Parliament|NOUN|Number=Sing|par' + 'liament on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PU' + 'NCT|, 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct' + 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.\tREG' + 'ERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Ne' + 'ut|Number=Sing|Regeringsförklaring .|PUNCT|.\n', var) def test_links_write(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], - write_mode='links', root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], + write_mode='links', root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test_result'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\n' '\n\n ' @@ -939,12 +975,13 @@ def test_links_write(self): def test_links_write_unalphabetical(self): OpusRead(directory='RF', source='sv', target='en', - write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], - write_mode='links', src_range='1-5', tgt_range='2', - root_directory=self.root_directory).printPairs() + write=[os.path.join(self.tempdir1, 'test_files', 'test_result')], + write_mode='links', src_range='1-5', tgt_range='2', + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'test_result'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\n' '\n\n ' @@ -956,60 +993,66 @@ def test_links_write_unalphabetical(self): ' \n\n') def test_links_print(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, write_mode='links', root_directory=self.root_directory) - self.assertTrue( - '\n' - '\n\n ' - '\n' - '\n \n\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, write_mode='links', root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n' + '\n\n ' + '\n' + '\n \n\n', var) def test_links_print_unalphabetical(self): - var = pairPrinterToVariable(directory='RF', source='sv', target='en', - write_mode='links', src_range='1', tgt_range='2', - root_directory=self.root_directory) - self.assertTrue( - '\n' - '\n\n ' - '\n' - '\n' - ' \n' - ' \n' - ' \n\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='sv', target='en', + write_mode='links', src_range='1', tgt_range='2', + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n' + '\n\n ' + '\n' + '\n' + ' \n' + ' \n' + ' \n\n', var) def test_iteration_stops_at_the_end_of_the_document_even_if_max_is_not_filled(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - src_range='2', tgt_range='1', maximum=5, - root_directory=self.root_directory) - self.assertTrue( - """\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n==============""" - """==================\n(src)="s4.4">The army will be reor""" - """ganized with the aim of making it more effective .\n(""" - """src)="s4.5">It is the Government 's intention to seek """ - """broad solutions in issues that are of importance for o""" - """ur national security .\n(trg)="s4.4">Det är regeringe""" - """ns föresats att söka breda lösningar i frågor som är a""" - """v betydelse för vår nationella säkerhet .\n==========""" - """======================""" in var) - self.assertTrue("""# en/1996.xml.gz\n# sv/1996""" - """.xml.gz\n\n================================\n""" in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + src_range='2', tgt_range='1', maximum=5, + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + """\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n==============""" + """==================\n(src)="s4.4">The army will be reor""" + """ganized with the aim of making it more effective .\n(""" + """src)="s4.5">It is the Government 's intention to seek """ + """broad solutions in issues that are of importance for o""" + """ur national security .\n(trg)="s4.4">Det är regeringe""" + """ns föresats att söka breda lösningar i frågor som är a""" + """v betydelse för vår nationella säkerhet .\n==========""" + """======================""", var) + self.assertIn("""# en/1996.xml.gz\n# sv/1996""" + """.xml.gz\n\n================================\n""", var) def test_use_given_sentence_alignment_file(self): OpusRead(directory='Books', source='eo', target='pt', src_range='2', - tgt_range='2', maximum=1, write_mode='links', - write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], - root_directory=self.root_directory).printPairs() + tgt_range='2', maximum=1, write_mode='links', + write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], + root_directory=self.root_directory).printPairs() var = pairPrinterToVariable( directory='Books', source='eo', target='pt', alignment_file=os.path.join(self.tempdir1, 'test_files', - 'testlinks'), + 'testlinks'), root_directory=self.root_directory) - self.assertTrue( + self.assertIn( '\n# eo/Carroll_Lewis-Alice_in_wonderland.xml.gz\n' '# pt/Carroll_Lewis-Alice_in_wonderland.xml.gz\n\n=======' '=========================\n' @@ -1032,17 +1075,17 @@ def test_use_given_sentence_alignment_file(self): ', correu através do campo atrás dele e felizmente chegou ' 'bem a tempo de o ver pular para dentro de uma grande toc' 'a de coelho debaixo da cerca .\n========================' - '========\n' in var) + '========\n', var) def test_use_given_sentence_alignment_file_with_lingGrp_end_tag_on_the_same_line_as_link_tag(self): OpusRead(directory='RF', source='en', target='sv', src_range='2', - tgt_range='1', write_mode='links', - write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], - root_directory=self.root_directory).printPairs() + tgt_range='1', write_mode='links', + write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], + root_directory=self.root_directory).printPairs() var = pairPrinterToVariable(directory='RF', source='en', target='sv', - alignment_file=os.path.join(self.tempdir1, 'test_files', - 'testlinks'), root_directory=self.root_directory) - self.assertTrue( + alignment_file=os.path.join(self.tempdir1, 'test_files', + 'testlinks'), root_directory=self.root_directory) + self.assertIn( """\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n==============""" """==================\n(src)="s4.4">The army will be reor""" """ganized with the aim of making it more effective .\n(""" @@ -1051,85 +1094,87 @@ def test_use_given_sentence_alignment_file_with_lingGrp_end_tag_on_the_same_line """ur national security .\n(trg)="s4.4">Det är regeringe""" """ns föresats att söka breda lösningar i frågor som är a""" """v betydelse för vår nationella säkerhet .\n==========""" - """======================\n""" in var) - self.assertTrue("""# en/1996.xml.gz\n# sv/1996""" - """.xml.gz\n\n================================\n""" in var) + """======================\n""", var) + self.assertIn("""# en/1996.xml.gz\n# sv/1996""" + """.xml.gz\n\n================================\n""", var) def test_use_given_sentence_alignment_file_and_print_links(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write_mode='links', - write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], - root_directory=self.root_directory).printPairs() + write_mode='links', + write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], + root_directory=self.root_directory).printPairs() var = pairPrinterToVariable(directory='RF', source='en', target='sv', - write_mode='links', - alignment_file=os.path.join(self.tempdir1, 'test_files', - 'testlinks'), - root_directory=self.root_directory) - self.assertTrue('' - '\n\n\n \n\n \n<' - '/cesAlign>\n' in var) + write_mode='links', + alignment_file=os.path.join(self.tempdir1, 'test_files', + 'testlinks'), + root_directory=self.root_directory) + self.assertIn('' + '\n\n\n \n\n \n<' + '/cesAlign>\n', var) def test_use_given_sentence_alignment_file_and_write_links(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write_mode='links', - write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], - root_directory=self.root_directory).printPairs() + write_mode='links', + write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], + root_directory=self.root_directory).printPairs() OpusRead(directory='RF', source='en', target='sv', write_mode='links', - alignment_file=os.path.join(self.tempdir1, 'test_files', - 'testlinks'), - write=[os.path.join(self.tempdir1, 'test_files', 'testresult')], - root_directory=self.root_directory).printPairs() + alignment_file=os.path.join(self.tempdir1, 'test_files', + 'testlinks'), + write=[os.path.join(self.tempdir1, 'test_files', 'testresult')], + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'testresult'), - 'r') as f: - self.assertEqual(f.read(), '' - '\n\n\n \n\n \n<' - '/cesAlign>\n') + 'r') as f: + self.assertEqual( + f.read(), '' + '\n\n\n \n\n \n<' + '/cesAlign>\n') def test_use_given_sentence_alignment_file_and_print_links_Books(self): OpusRead(directory='Books', source='eo', target='pt', maximum=1, - write_mode='links', - write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], - root_directory=self.root_directory).printPairs() + write_mode='links', + write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], + root_directory=self.root_directory).printPairs() var = pairPrinterToVariable(directory='Books', source='eo', - target='pt', write_mode='links', - alignment_file=os.path.join(self.tempdir1, 'test_files', - 'testlinks'), - root_directory=self.root_directory) - self.assertTrue('' - '\n\n\n \n\n \n\n' in var) + target='pt', write_mode='links', + alignment_file=os.path.join(self.tempdir1, 'test_files', + 'testlinks'), + root_directory=self.root_directory) + self.assertIn('' + '\n\n\n \n\n \n\n', var) def test_use_given_sentence_alignment_file_and_write_links_Books(self): OpusRead(directory='Books', source='eo', target='pt', maximum=1, - write_mode='links', - write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], - root_directory=self.root_directory).printPairs() + write_mode='links', + write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')], + root_directory=self.root_directory).printPairs() OpusRead(directory='Books', source='eo', target='pt', - write_mode='links', alignment_file=os.path.join(self.tempdir1, - 'test_files', 'testlinks'), - write=[os.path.join(self.tempdir1, 'test_files', 'testresult')], - root_directory=self.root_directory).printPairs() + write_mode='links', alignment_file=os.path.join(self.tempdir1, + 'test_files', 'testlinks'), + write=[os.path.join(self.tempdir1, 'test_files', 'testresult')], + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'testresult'), - 'r') as f: - self.assertEqual(f.read(), '' - '\n\n\n \n\n \n\n') + 'r') as f: + self.assertEqual( + f.read(), '' + '\n\n\n \n\n \n\n') def test_checks_first_whether_documents_are_in_path(self): with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'), - 'w') as f: + 'w') as f: f.write( '\n' @@ -1137,31 +1182,31 @@ def test_checks_first_whether_documents_are_in_path(self): 'test_en" toDoc="test_files/test_fi" >\n\n +\n') with open(os.path.join(self.tempdir1, 'test_files', 'test_en'), - 'w') as f: + 'w') as f: f.write( '\n\n' '\n\n test_en1\n test_en2' '\n\n \n') with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'), - 'w') as f: + 'w') as f: f.write( '\n\n \n' '\n test_fi1\n test_fi2' '\n\n \n') var = pairPrinterToVariable(directory='Books', source='en', - target='fi', alignment_file=os.path.join(self.tempdir1, - 'test_files', 'testlinks'), - download_dir=self.tempdir1) - self.assertTrue( + target='fi', alignment_file=os.path.join(self.tempdir1, + 'test_files', 'testlinks'), + download_dir=self.tempdir1) + self.assertIn( '\n# test_files/test_en\n# test_files/test_fi\n\n' '================================\n(src)="s1">test_en1 test_en2\n' '(trg)="s1">test_fi1 test_fi2' - '\n================================\n' in var) + '\n================================\n', var) def test_open_documents_from_specifed_zips(self): with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'), - 'w') as f: + 'w') as f: f.write( '\n' @@ -1169,40 +1214,40 @@ def test_open_documents_from_specifed_zips(self): 'test_en" toDoc="test_files/test_fi" >\n\n +\n') with open(os.path.join(self.tempdir1, 'test_files', 'test_en'), - 'w') as f: + 'w') as f: f.write( '\n\n' '\n\n test_en1\n test_en2' '\n\n \n') with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_en.zip'), - 'w') as zf: + 'w') as zf: zf.write(os.path.join(self.tempdir1, 'test_files', 'test_en'), - arcname=os.path.join('test_files', 'test_en')) + arcname=os.path.join('test_files', 'test_en')) with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'), - 'w') as f: + 'w') as f: f.write( '\n\n \n' '\n test_fi1\n test_fi2' '\n\n \n') with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_fi.zip'), - 'w') as zf: + 'w') as zf: zf.write(os.path.join(self.tempdir1, 'test_files', 'test_fi'), - arcname=os.path.join('test_files', 'test_fi')) + arcname=os.path.join('test_files', 'test_fi')) var = pairPrinterToVariable(directory='Books', source='en', - target='fi', alignment_file=os.path.join(self.tempdir1, - 'test_files', 'testlinks'), - source_zip = os.path.join(self.tempdir1, 'test_en.zip'), - target_zip = os.path.join(self.tempdir1, 'test_fi.zip')) - self.assertTrue( + target='fi', alignment_file=os.path.join(self.tempdir1, + 'test_files', 'testlinks'), + source_zip = os.path.join(self.tempdir1, 'test_en.zip'), + target_zip = os.path.join(self.tempdir1, 'test_fi.zip')) + self.assertIn( '\n# test_files/test_en\n# test_files/test_fi\n\n' '================================\n(src)="s1">test_en1 test_en2\n' '(trg)="s1">test_fi1 test_fi2' - '\n================================\n' in var) + '\n================================\n', var) def test_try_to_open_wrongly_named_docs_from_specifed_source_zip(self): with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'), - 'w') as f: + 'w') as f: f.write( '\n' @@ -1210,39 +1255,40 @@ def test_try_to_open_wrongly_named_docs_from_specifed_source_zip(self): 'test_en" toDoc="test_files/test_fi" >\n\n +\n') with open(os.path.join(self.tempdir1, 'test_files', 'test_en'), - 'w') as f: + 'w') as f: f.write( '\n\n' '\n\n test_en1\n test_en2' '\n\n \n') with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_en.zip'), - 'w') as zf: + 'w') as zf: zf.write(os.path.join(self.tempdir1, 'test_files', 'test_en'), - arcname=os.path.join('test_files', 'test_un')) + arcname=os.path.join('test_files', 'test_un')) with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'), - 'w') as f: + 'w') as f: f.write( '\n\n \n' '\n test_fi1\n test_fi2' '\n\n \n') with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_fi.zip'), - 'w') as zf: + 'w') as zf: zf.write(os.path.join(self.tempdir1, 'test_files', 'test_fi'), - arcname=os.path.join('test_files', 'test_fi')) - - var = pairPrinterToVariable(directory='Books', source='en', - target='fi', alignment_file=os.path.join(self.tempdir1, - 'test_files', 'testlinks'), - source_zip = os.path.join(self.tempdir1, 'test_en.zip'), - target_zip = os.path.join(self.tempdir1, 'test_fi.zip')) + arcname=os.path.join('test_files', 'test_fi')) - self.assertTrue("\nThere is no item named 'test_files/test_en' " - "in the archive '"+os.path.join(self.tempdir1, 'test_en.zip')+"'\n" - "Continuing from next sentence file pair.\n" in var) + with mock.patch('sys.stderr', new=io.StringIO()) as output: + OpusRead(directory='Books', source='en', + target='fi', alignment_file=os.path.join(self.tempdir1, + 'test_files', 'testlinks'), + source_zip = os.path.join(self.tempdir1, 'test_en.zip'), + target_zip = os.path.join(self.tempdir1, 'test_fi.zip')).printPairs() + var = output.getvalue() + self.assertIn("\nThere is no item named 'test_files/test_en' " + "in the archive '"+os.path.join(self.tempdir1, 'test_en.zip')+"'\n" + "Continuing from next sentence file pair.\n", var) def test_try_to_open_wrongly_named_docs_from_specifed_target_zip(self): with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'), - 'w') as f: + 'w') as f: f.write( '\n' @@ -1250,39 +1296,40 @@ def test_try_to_open_wrongly_named_docs_from_specifed_target_zip(self): 'test_en" toDoc="test_files/test_fi" >\n\n +\n') with open(os.path.join(self.tempdir1, 'test_files', 'test_en'), - 'w') as f: + 'w') as f: f.write( '\n\n' '\n\n test_en1\n test_en2' '\n\n \n') with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_en.zip'), - 'w') as zf: + 'w') as zf: zf.write(os.path.join(self.tempdir1, 'test_files', 'test_en'), - arcname=os.path.join('test_files', 'test_en')) + arcname=os.path.join('test_files', 'test_en')) with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'), - 'w') as f: + 'w') as f: f.write( '\n\n \n' '\n test_fi1\n test_fi2' '\n\n \n') with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_fi.zip'), - 'w') as zf: + 'w') as zf: zf.write(os.path.join(self.tempdir1, 'test_files', 'test_fi'), - arcname=os.path.join('test_files', 'test_un')) - - var = pairPrinterToVariable(directory='Books', source='en', - target='fi', alignment_file=os.path.join(self.tempdir1, - 'test_files', 'testlinks'), - source_zip = os.path.join(self.tempdir1, 'test_en.zip'), - target_zip = os.path.join(self.tempdir1, 'test_fi.zip')) + arcname=os.path.join('test_files', 'test_un')) - self.assertTrue("\nThere is no item named 'test_files/test_fi' " - "in the archive '"+os.path.join(self.tempdir1, 'test_fi.zip')+"'\n" - "Continuing from next sentence file pair.\n" in var) + with mock.patch('sys.stderr', new=io.StringIO()) as output: + OpusRead(directory='Books', source='en', + target='fi', alignment_file=os.path.join(self.tempdir1, + 'test_files', 'testlinks'), + source_zip = os.path.join(self.tempdir1, 'test_en.zip'), + target_zip = os.path.join(self.tempdir1, 'test_fi.zip')).printPairs() + var = output.getvalue() + self.assertIn("\nThere is no item named 'test_files/test_fi' " + "in the archive '"+os.path.join(self.tempdir1, 'test_fi.zip')+"'\n" + "Continuing from next sentence file pair.\n", var) def test_checks_first_whether_documents_are_in_path_gz(self): with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'), - 'w') as f: + 'w') as f: f.write( '\n' @@ -1290,171 +1337,190 @@ def test_checks_first_whether_documents_are_in_path_gz(self): 'test_en.gz" toDoc="test_files/test_fi.gz" >\n\n +\n') with open(os.path.join(self.tempdir1, 'test_files', 'test_en'), - 'w') as f: + 'w') as f: f.write( '\n\n' '\n\n test_en1\n test_en2' '\n\n \n') with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'), - 'w') as f: + 'w') as f: f.write( '\n\n \n' '\n test_fi1\n test_fi2' '\n\n \n') with open(os.path.join(self.tempdir1, 'test_files', 'test_en'), - 'rb') as f: + 'rb') as f: with gzip.open(os.path.join(self.tempdir1, 'test_files', - 'test_en.gz'), 'wb') as gf: + 'test_en.gz'), 'wb') as gf: shutil.copyfileobj(f, gf) with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'), - 'rb') as f: + 'rb') as f: with gzip.open(os.path.join(self.tempdir1, - 'test_files', 'test_fi.gz'), 'wb') as gf: + 'test_files', 'test_fi.gz'), 'wb') as gf: shutil.copyfileobj(f, gf) var = pairPrinterToVariable(directory='Books', source='eo', - target='pt', alignment_file=os.path.join(self.tempdir1, - 'test_files', 'testlinks'), download_dir=self.tempdir1, - root_directory=self.root_directory) - self.assertTrue( + target='pt', alignment_file=os.path.join(self.tempdir1, + 'test_files', 'testlinks'), download_dir=self.tempdir1, + root_directory=self.root_directory) + self.assertIn( '\n# test_files/test_en.gz\n# test_files/test_fi.gz\n\n' '================================\n(src)="s1">test_en1 test_en2\n' '(trg)="s1">test_fi1 test_fi2' - '\n================================\n' in var) + '\n================================\n', var) def test_filtering_by_src_cld2(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - release='v1', maximum=1, src_cld2=['en', '0.98'], - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - download_dir=self.tempdir1) - self.assertTrue( - '\n# en/1996.xml.gz\n' - '# sv/1996.xml.gz\n' - '\n================================' - '\n(src)="s5.0">Mr. Sherlock Holmes' - '\n(trg)="s5.0">Herra Sherlock Holmes' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + release='v1', maximum=1, src_cld2=['en', '0.98'], + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + download_dir=self.tempdir1).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1996.xml.gz\n' + '# sv/1996.xml.gz\n' + '\n================================' + '\n(src)="s5.0">Mr. Sherlock Holmes' + '\n(trg)="s5.0">Herra Sherlock Holmes' + '\n================================\n', var) def test_filtering_by_trg_cld2(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - release='v1', maximum=1, trg_cld2=['ia', '0'], - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - download_dir=self.tempdir1) - self.assertTrue( - '\n# en/1996.xml.gz\n' - '# sv/1996.xml.gz\n' - '\n================================' - '\n(src)="s4">Chapter 1 Mr. Sherlock Holmes' - '\n(trg)="s4">Herra Sherlock Holmes .' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + release='v1', maximum=1, trg_cld2=['ia', '0'], + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + download_dir=self.tempdir1).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1996.xml.gz\n' + '# sv/1996.xml.gz\n' + '\n================================' + '\n(src)="s4">Chapter 1 Mr. Sherlock Holmes' + '\n(trg)="s4">Herra Sherlock Holmes .' + '\n================================\n', var) def test_filtering_by_src_langid(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - release='v1', maximum=1, src_langid=['de', '0'], - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - download_dir=self.tempdir1) - self.assertTrue( - '\n# en/1996.xml.gz\n' - '# sv/1996.xml.gz\n' - '\n================================' - '\n(src)="s167.0">" Excellent !' - '\n(trg)="s167.0">" Erinomaista .' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + release='v1', maximum=1, src_langid=['de', '0'], + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + download_dir=self.tempdir1).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1996.xml.gz\n' + '# sv/1996.xml.gz\n' + '\n================================' + '\n(src)="s167.0">" Excellent !' + '\n(trg)="s167.0">" Erinomaista .' + '\n================================\n', var) def test_filtering_by_trg_langid(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - release='v1', maximum=1, trg_langid=['et', '0'], - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - download_dir=self.tempdir1) - self.assertTrue( - '\n# en/1996.xml.gz\n' - '# sv/1996.xml.gz\n' - '\n================================' - '\n(src)="s4">Chapter 1 Mr. Sherlock Holmes' - '\n(trg)="s4">Herra Sherlock Holmes .' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + release='v1', maximum=1, trg_langid=['et', '0'], + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + download_dir=self.tempdir1).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1996.xml.gz\n' + '# sv/1996.xml.gz\n' + '\n================================' + '\n(src)="s4">Chapter 1 Mr. Sherlock Holmes' + '\n(trg)="s4">Herra Sherlock Holmes .' + '\n================================\n', var) def test_filtering_by_lang_labels(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - release='v1', maximum=1, src_cld2=['un', '0'], - trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'], - trg_langid=['fi', '1'], - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - download_dir=self.tempdir1) - self.assertTrue( - '\n# en/1996.xml.gz\n' - '# sv/1996.xml.gz\n' - '\n================================' - '\n(src)="s8.1">I believe' - '\n(trg)="s8.1">Luulenpa että sinulla' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + release='v1', maximum=1, src_cld2=['un', '0'], + trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'], + trg_langid=['fi', '1'], + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + download_dir=self.tempdir1).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1996.xml.gz\n' + '# sv/1996.xml.gz\n' + '\n================================' + '\n(src)="s8.1">I believe' + '\n(trg)="s8.1">Luulenpa että sinulla' + '\n================================\n', var) def test_filtering_by_lang_labels_nonalphabetical_lang_order(self): - var = pairPrinterToVariable(directory='RF', source='sv', target='en', - release='v1', maximum=1, trg_cld2=['un', '0'], - src_cld2=['fi', '0.97'], trg_langid=['en', '0.17'], - src_langid=['fi', '1'], - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - download_dir=self.tempdir1) - self.assertTrue( - '\n# en/1996.xml.gz\n' - '# sv/1996.xml.gz\n' - '\n================================' - '\n(src)="s8.1">Luulenpa että sinulla' - '\n(trg)="s8.1">I believe' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='sv', target='en', + release='v1', maximum=1, trg_cld2=['un', '0'], + src_cld2=['fi', '0.97'], trg_langid=['en', '0.17'], + src_langid=['fi', '1'], + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + download_dir=self.tempdir1).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1996.xml.gz\n' + '# sv/1996.xml.gz\n' + '\n================================' + '\n(src)="s8.1">Luulenpa että sinulla' + '\n(trg)="s8.1">I believe' + '\n================================\n', var) def test_filtering_by_lang_labels_no_matches_found(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - release='v1', maximum=1, src_cld2=['fi', '2'], - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - download_dir=self.tempdir1) - self.assertTrue( - '\n# en/1996.xml.gz\n' - '# sv/1996.xml.gz\n' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + release='v1', maximum=1, src_cld2=['fi', '2'], + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + download_dir=self.tempdir1).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1996.xml.gz\n' + '# sv/1996.xml.gz\n' + '\n================================\n', var) def test_filtering_by_src_cld2_print_links(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - release='v1', maximum=1, src_cld2=['en', '0.98'], - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - write_mode='links', download_dir=self.tempdir1) - self.assertTrue( - '\n\n' in var) - self.assertTrue( - ' \n' - '\n \n\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + release='v1', maximum=1, src_cld2=['en', '0.98'], + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + write_mode='links', download_dir=self.tempdir1).printPairs() + var = output.getvalue() + self.assertIn( + '\n\n', var) + self.assertIn( + ' \n' + '\n \n\n', var) def test_filtering_by_lang_labels_print_links(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - release='v1', maximum=1, src_cld2=['un', '0'], - trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'], - trg_langid=['fi', '1'], - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - write_mode='links', download_dir=self.tempdir1) - self.assertTrue( - '\n\n' in var) - self.assertTrue( - ' \n' - '\n \n\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + release='v1', maximum=1, src_cld2=['un', '0'], + trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'], + trg_langid=['fi', '1'], + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + write_mode='links', download_dir=self.tempdir1).printPairs() + var = output.getvalue() + self.assertIn( + '\n\n', var) + self.assertIn( + ' \n' + '\n \n\n', var) def test_filtering_by_lang_labels_write_links(self): OpusRead(directory='RF', source='en', target='sv', - release='v1', maximum=1, src_cld2=['un', '0'], - trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'], - trg_langid=['fi', '1'], - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - write=[os.path.join(self.tempdir1, 'test_files', 'result')], - write_mode='links', download_dir=self.tempdir1).printPairs() + release='v1', maximum=1, src_cld2=['un', '0'], + trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'], + trg_langid=['fi', '1'], + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + write=[os.path.join(self.tempdir1, 'test_files', 'result')], + write_mode='links', download_dir=self.tempdir1).printPairs() with open(os.path.join(self.tempdir1, 'test_files', 'result'), - 'r') as f: - self.assertEqual(f.read(), + 'r') as f: + self.assertEqual( + f.read(), '\n\n\n' @@ -1463,204 +1529,218 @@ def test_filtering_by_lang_labels_write_links(self): '\n \n\n') def test_use_given_zip_files(self): - var = pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, source_zip=os.path.join(self.tempdir1, 'en.zip'), - target_zip=os.path.join(self.tempdir1, 'sv.zip'), - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - root_directory=self.root_directory) - self.assertTrue( - '\n# en/1996.xml.gz' - '\n# sv/1996.xml.gz' - '\n\n================================' - '\n(src)="s1">Source&<>"\' : manybooks.netAudiobook available here' - '\n(trg)="s1">Source : Project Gutenberg' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', source='en', target='sv', + maximum=1, source_zip=os.path.join(self.tempdir1, 'en.zip'), + target_zip=os.path.join(self.tempdir1, 'sv.zip'), + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1996.xml.gz' + '\n# sv/1996.xml.gz' + '\n\n================================' + '\n(src)="s1">Source&<>"\' : manybooks.netAudiobook available here' + '\n(trg)="s1">Source : Project Gutenberg' + '\n================================\n', var) def test_use_given_zip_files_unalphabetical(self): - var = pairPrinterToVariable(directory='RF', target='en', source='sv', - maximum=1, target_zip=os.path.join(self.tempdir1, 'en.zip'), - source_zip=os.path.join(self.tempdir1, 'sv.zip'), - alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), - root_directory=self.root_directory) - self.assertTrue( - '\n# en/1996.xml.gz' - '\n# sv/1996.xml.gz' - '\n\n================================' - '\n(src)="s1">Source : Project Gutenberg' - '\n(trg)="s1">Source&<>"\' : manybooks.netAudiobook available here' - '\n================================\n' in var) + with mock.patch('sys.stdout', new=io.StringIO()) as output: + OpusRead(directory='RF', target='en', source='sv', + maximum=1, target_zip=os.path.join(self.tempdir1, 'en.zip'), + source_zip=os.path.join(self.tempdir1, 'sv.zip'), + alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'), + root_directory=self.root_directory).printPairs() + var = output.getvalue() + self.assertIn( + '\n# en/1996.xml.gz' + '\n# sv/1996.xml.gz' + '\n\n================================' + '\n(src)="s1">Source : Project Gutenberg' + '\n(trg)="s1">Source&<>"\' : manybooks.netAudiobook available here' + '\n================================\n', var) @mock.patch('opustools.opus_get.input', create=True) def test_alignment_file_not_found(self, mocked_input): mocked_input.side_effect = ['y', 'n'] pairPrinterToVariable(directory='RF', source='en', target='sv', maximum=1, - alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz'), - download_dir=self.tempdir1, root_directory=self.root_directory) + alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz'), + download_dir=self.tempdir1, root_directory=self.root_directory) os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_en-sv.xml.gz')) os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_en.zip')) os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_sv.zip')) with self.assertRaises(FileNotFoundError): pairPrinterToVariable(directory='RF', source='en', target='sv', - maximum=1, - alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz')) + maximum=1, + alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz')) def test_alignment_file_not_found_no_prompt(self): opr = OpusRead(directory='RF', source='en', target='sv', maximum=1, - alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz'), - suppress_prompts=True, download_dir=self.tempdir1, - root_directory=self.root_directory) + alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz'), + suppress_prompts=True, download_dir=self.tempdir1, + root_directory=self.root_directory) opr.printPairs() self.assertTrue(os.path.isfile(os.path.join(self.tempdir1, - 'RF_latest_xml_en-sv.xml.gz'))) + 'RF_latest_xml_en-sv.xml.gz'))) self.assertTrue(os.path.isfile(os.path.join(self.tempdir1, - 'RF_latest_xml_en.zip'))) + 'RF_latest_xml_en.zip'))) self.assertTrue(os.path.isfile(os.path.join(self.tempdir1, - 'RF_latest_xml_sv.zip'))) + 'RF_latest_xml_sv.zip'))) os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_en-sv.xml.gz')) os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_en.zip')) os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_sv.zip')) def test_id_file_printing(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - attribute='certainty', threshold='1', - write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), - root_directory=self.root_directory).printPairs() + attribute='certainty', threshold='1', + write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), + root_directory=self.root_directory).printPairs() with open(os.path.join(self.tempdir1, - 'test_files', 'test.id')) as id_file: + 'test_files', 'test.id')) as id_file: self.assertEqual(id_file.read(), 'en/1988.xml.gz\tsv/1988' - '.xml.gz\ts3.2\ts3.2\t1.14214\n') + '.xml.gz\ts3.2\ts3.2\t1.14214\n') def test_id_file_printing_unalphabetical(self): OpusRead(directory='RF', source='sv', target='en', maximum=1, - src_range='1', tgt_range='2', attribute='certainty', - threshold='0.1', - write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), - root_directory=self.root_directory).printPairs() + src_range='1', tgt_range='2', attribute='certainty', + threshold='0.1', + write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), + root_directory=self.root_directory).printPairs() with open(os.path.join( self.tempdir1, 'test_files', 'test.id')) as id_file: self.assertEqual(id_file.read(), 'sv/1988.xml.gz\ten/1988' - '.xml.gz\ts4.4\ts4.4 s4.5\t0.188136\n') + '.xml.gz\ts4.4\ts4.4 s4.5\t0.188136\n') def test_id_file_printing_with_no_attribute(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), - root_directory=self.root_directory).printPairs() + write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), + root_directory=self.root_directory).printPairs() with open(os.path.join( self.tempdir1, 'test_files/test.id')) as id_file: self.assertEqual(id_file.read(), 'en/1988.xml.gz\tsv/1988' - '.xml.gz\ts1.1\ts1.1\tNone\n') + '.xml.gz\ts1.1\ts1.1\tNone\n') def test_id_file_printing_with_attribute_no_threshold(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - attribute='certainty', - write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), - root_directory=self.root_directory).printPairs() + attribute='certainty', + write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), + root_directory=self.root_directory).printPairs() with open(os.path.join( self.tempdir1, 'test_files/test.id')) as id_file: self.assertEqual(id_file.read(), 'en/1988.xml.gz\tsv/1988' - '.xml.gz\ts1.1\ts1.1\t-0.0636364\n') + '.xml.gz\ts1.1\ts1.1\t-0.0636364\n') def test_id_file_printing_with_invalid_attribute(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - attribute='asfg', - write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), - root_directory=self.root_directory).printPairs() + attribute='asfg', + write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), + root_directory=self.root_directory).printPairs() with open(os.path.join( - self.tempdir1, 'test_files/test.id')) as id_file: + self.tempdir1, 'test_files/test.id')) as id_file: self.assertEqual(id_file.read(), 'en/1988.xml.gz\tsv/1988' - '.xml.gz\ts1.1\ts1.1\tNone\n') + '.xml.gz\ts1.1\ts1.1\tNone\n') def test_id_file_printing_with_only_threshold(self): OpusRead(directory='RF', source='en', target='sv', maximum=1, - threshold='0', - write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), - root_directory=self.root_directory).printPairs() + threshold='0', + write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'), + root_directory=self.root_directory).printPairs() with open(os.path.join( - self.tempdir1, 'test_files/test.id')) as id_file: + self.tempdir1, 'test_files/test.id')) as id_file: self.assertEqual(id_file.read(), 'en/1988.xml.gz\tsv/1988' - '.xml.gz\ts1.1\ts1.1\tNone\n') + '.xml.gz\ts1.1\ts1.1\tNone\n') def test_writing_time_tags_xml(self): - var = pairPrinterToVariable(directory='OpenSubtitles', source='eo', - target='tl', maximum=1, preserve_inline_tags=True, - root_directory=self.root_directory) - self.assertTrue( - '\n# eo/2009/1187043/6483790.xml.gz\n' - '# tl/2009/1187043/6934998.xml.gz\n\n' - '================================\n(src)="1">