diff --git a/opustools_pkg/opustools/opus_file_handler.py b/opustools_pkg/opustools/opus_file_handler.py
index b4be1d3..9a3debc 100644
--- a/opustools_pkg/opustools/opus_file_handler.py
+++ b/opustools_pkg/opustools/opus_file_handler.py
@@ -38,7 +38,7 @@ def download_files(self):
og = OpusGet(**arguments)
og.get_files()
- def open_moses_files(self):
+ def open_moses_files(self, outpath=None):
moses_zip_name = os.path.join(self.download_dir, f'{self.directory}_{self.release}_moses_'
f'{self.fromto[0]}-{self.fromto[1]}.txt.zip')
if not os.path.isfile(moses_zip_name):
@@ -47,7 +47,7 @@ def open_moses_files(self):
ret_file_names = []
for fn in moses_zip.filelist:
if fn.filename.split('.')[-1] in self.fromto:
- moses_zip.extract(fn.filename)
+ moses_zip.extract(fn.filename, path=outpath)
ret_file_names.append(fn.filename)
moses_zip.close()
return sorted(ret_file_names)
@@ -150,4 +150,3 @@ def close_zipfiles(self):
if self.zip_opened:
self.src_zip.close()
self.trg_zip.close()
-
diff --git a/opustools_pkg/opustools/opus_read.py b/opustools_pkg/opustools/opus_read.py
index 8a8822f..af4dac6 100644
--- a/opustools_pkg/opustools/opus_read.py
+++ b/opustools_pkg/opustools/opus_read.py
@@ -1,6 +1,8 @@
+import logging
import os
-import shutil
import re
+import sys
+import tempfile
from .parse.alignment_parser import AlignmentParser
from .parse.sentence_parser import SentenceParser, SentenceParserError
@@ -8,13 +10,19 @@
from .formatting import *
from .opus_file_handler import OpusFileHandler
+
+logger = logging.getLogger(__name__)
+
+
def skip_regex_type(n, N):
"Select function to skip document names"
def get_re(doc_name):
return not re.search(n, doc_name)
+
def skip_re(doc_name):
return re.search(N, doc_name)
+
def nothing(doc_name):
return False
@@ -27,7 +35,8 @@ def nothing(doc_name):
class OpusRead:
- def __init__(self, directory=None, source=None, target=None,
+ def __init__(
+ self, directory=None, source=None, target=None,
release='latest', preprocess='xml', maximum=-1, src_range='all',
tgt_range='all', attribute=None, threshold=None,
leave_non_alignments_out=False, write=None, write_mode='normal',
@@ -39,7 +48,8 @@ def __init__(self, directory=None, source=None, target=None,
change_annotation_delimiter='|',
src_cld2=None, trg_cld2=None, src_langid=None, trg_langid=None,
write_ids=None, suppress_prompts=False, download_dir='.',
- preserve_inline_tags=False, n=None, N=None, chunk_size=1000000, verbose=False):
+ preserve_inline_tags=False, n=None, N=None, chunk_size=1000000,
+ verbose=False):
"""Read xces alignment files and xml sentence files and output in
desired format.
@@ -83,7 +93,8 @@ def __init__(self, directory=None, source=None, target=None,
preserve_inline_tags -- Preserve inline tags within sentences
n -- Get only documents that match the regex
N -- Skip all doucments that match the regex
- chunk_size -- Number of sentence pairs in chunks to be processed (default 1000000)
+ chunk_size -- Number of sentence pairs in chunks to be processed
+ (default 1000000)
verbose -- Print progress messages
"""
@@ -99,56 +110,43 @@ def __init__(self, directory=None, source=None, target=None,
self.verbose = True
if self.switch_langs:
- temp = src_range
- src_range = tgt_range
- tgt_range = temp
- temp = src_cld2
- src_cld2 = trg_cld2
- trg_cld2 = temp
- temp = src_langid
- src_langid = trg_langid
- trg_langid = temp
- temp = source_zip
- source_zip = target_zip
- target_zip = temp
- temp = source_annotations.copy()
- source_annotations = target_annotations.copy()
- target_annotations = temp.copy()
+ src_range, tgt_range = tgt_range, src_range
+ src_cld2, trg_cld2 = trg_cld2, src_cld2
+ src_langid, trg_langid = trg_langid, src_langid
+ source_zip, target_zip = target_zip, source_zip
+ source_annotations, target_annotations = \
+ target_annotations.copy(), source_annotations.copy()
lang_filters = [src_cld2, src_langid, trg_cld2, trg_langid]
- default_alignment = os.path.join(root_directory, directory, release,
- 'xml', self.fromto[0]+'-'+self.fromto[1]+'.xml.gz')
+ default_alignment = os.path.join(
+ root_directory, directory, release, 'xml',
+ self.fromto[0]+'-'+self.fromto[1]+'.xml.gz')
if alignment_file == -1:
self.alignment = default_alignment
else:
self.alignment = alignment_file
+ dl_prefix = directory + '_' + release + '_' + preprocess + '_'
if not source_zip:
- dl_src_zip = os.path.join(download_dir, directory+'_'+release+'_'+
- preprocess+'_'+self.fromto[0]+'.zip')
+ dl_src_zip = os.path.join(
+ download_dir, dl_prefix + self.fromto[0] + '.zip')
if os.path.isfile(dl_src_zip):
source_zip = dl_src_zip
else:
- source_zip = os.path.join(root_directory, directory, release,
+ source_zip = os.path.join(
+ root_directory, directory, release,
preprocess, self.fromto[0]+'.zip')
if not target_zip:
- dl_trg_zip = os.path.join(download_dir, directory+'_'+release+'_'+
- preprocess+'_'+self.fromto[1]+'.zip')
+ dl_trg_zip = os.path.join(
+ download_dir, dl_prefix + self.fromto[1] + '.zip')
if os.path.isfile(dl_trg_zip):
target_zip = dl_trg_zip
else:
- target_zip = os.path.join(root_directory, directory, release,
+ target_zip = os.path.join(
+ root_directory, directory, release,
preprocess, self.fromto[1]+'.zip')
- self.resultfile = None
- self.mosessrc = None
- self.mosestrg = None
-
- self.id_file = None
- if write_ids:
- self.id_file = file_open(write_ids, 'w', encoding='utf-8')
-
self.write_mode = write_mode
self.write = write
self.maximum = maximum
@@ -156,7 +154,7 @@ def __init__(self, directory=None, source=None, target=None,
if print_annotations:
self.preprocess = 'parsed'
- self.write_ids=write_ids
+ self.write_ids = write_ids
self.preserve = preserve_inline_tags
@@ -192,44 +190,67 @@ def __init__(self, directory=None, source=None, target=None,
preprocess, self.fromto, suppress_prompts)
if preprocess == 'moses':
- # If preprocessing is moses, download
- moses_names = self.of_handler.open_moses_files()
- if self.write:
- if len(self.write) == 2:
- if not self.switch_langs:
- shutil.move(moses_names[0], os.path.join(download_dir, self.write[0]))
- shutil.move(moses_names[1], os.path.join(download_dir, self.write[1]))
- else:
- shutil.move(moses_names[0], os.path.join(download_dir, self.write[1]))
- shutil.move(moses_names[1], os.path.join(download_dir, self.write[0]))
- moses_names = self.write
- else:
- print('"moses" preprocessing requires two output file names. Using default names.')
- else:
- shutil.move(moses_names[0], os.path.join(download_dir, moses_names[0]))
- shutil.move(moses_names[1], os.path.join(download_dir, moses_names[1]))
- print(f'Moses files written to {", ".join([download_dir+"/"+n for n in moses_names])}')
- exit()
-
- if write:
- if write_mode == 'moses' and len(write) == 2:
- self.mosessrc = file_open(write[0], mode='w', encoding='utf-8')
- self.mosestrg = file_open(write[1], mode='w', encoding='utf-8')
- else:
- self.resultfile = file_open(write[0], mode='w', encoding='utf-8')
+ if self.write_mode != 'moses':
+ logger.warning("Only moses write_mode is supported for moses preprocessing. "
+ "Ignoring write_mode %s.", self.write_mode)
+ self.write_mode = 'moses'
+ return
store_attrs = False
- if write_mode == "links" or write_ids != None:
+ if write_mode == "links" or write_ids is not None:
store_attrs = True
self.alignment = self.of_handler.open_alignment_file(self.alignment)
- self.alignmentParser = AlignmentParser(self.alignment,
- (src_range, tgt_range), attribute, threshold, store_attrs,
- leave_non_alignments_out)
+ self.alignmentParser = AlignmentParser(
+ self.alignment, (src_range, tgt_range), attribute, threshold,
+ store_attrs, leave_non_alignments_out)
def printPairs(self):
+ logger.debug("printPairs called!")
+ resultfile = None
+ mosessrc = None
+ mosestrg = None
+ id_file = None
- self.add_file_header(self.resultfile)
+ if self.write_ids:
+ id_file = file_open(self.write_ids, 'w', encoding='utf-8')
+
+ if self.write:
+ if self.write_mode == 'moses' and len(self.write) == 2:
+ mosessrc = file_open(self.write[0], mode='w', encoding='utf-8')
+ mosestrg = file_open(self.write[1], mode='w', encoding='utf-8')
+ else:
+ resultfile = file_open(self.write[0], mode='w', encoding='utf-8')
+
+ if self.preprocess == 'moses':
+ # Moses preprocessing: extract the ready-made plain-text files and return without parsing XML
+ if not self.write or len(self.write) != 2:
+ # No usable pair of output names: extract into the download directory and return
+ if self.write and len(self.write) != 2:
+ resultfile.close()
+ logger.warning('"moses" preprocessing requires two output '
+ 'file names. Using default names.')
+ moses_names = self.of_handler.open_moses_files(
+ outpath=self.of_handler.download_dir)
+ logger.info('Moses files written to %s', ', '.join(moses_names))
+ return
+ with tempfile.TemporaryDirectory() as tmpdir:
+ # Write to specified files
+ logger.info('Extracting data...')
+ moses_names = self.of_handler.open_moses_files(outpath=tmpdir)
+ with file_open(os.path.join(tmpdir, moses_names[0])) as in1, \
+ file_open(os.path.join(tmpdir, moses_names[1])) as in2:
+ if self.switch_langs:
+ in1, in2 = in2, in1
+ for fin, fout in [(in1, mosessrc), (in2, mosestrg)]:
+ for line in fin:
+ fout.write(line)
+ mosessrc.close()
+ mosestrg.close()
+ logger.info('Moses files written to %s', ', '.join(self.write))
+ return
+
+ self.add_file_header(resultfile)
src_parser = None
trg_parser = None
@@ -248,13 +269,14 @@ def printPairs(self):
self.alignmentParser.collect_links(cur_pos, self.chunk_size, self.verbose)
if src_doc_name != prev_src_doc_name:
- src_doc_size = -1
+ src_doc_size = -1
prev_src_doc_name = src_doc_name
if trg_doc_name != prev_trg_doc_name:
- trg_doc_size = -1
+ trg_doc_size = -1
prev_trg_doc_name = trg_doc_name
- if self.verbose: print("")
+ if self.verbose:
+ print("", file=sys.stderr)
if not src_doc_name:
break
@@ -268,32 +290,32 @@ def printPairs(self):
src_doc = self.of_handler.open_sentence_file(src_doc_name, 'src')
trg_doc = self.of_handler.open_sentence_file(trg_doc_name, 'trg')
except KeyError as e:
- print('\n'+e.args[0]+'\nContinuing from next sentence file pair.')
+ print('\n'+e.args[0]+'\nContinuing from next sentence file pair.', file=sys.stderr)
continue
try:
- src_parser = SentenceParser(src_doc,
- preprocessing=self.preprocess, anno_attrs=self.src_annot,
- preserve=self.preserve, delimiter=self.annot_delimiter)
+ src_parser = SentenceParser(
+ src_doc, preprocessing=self.preprocess, anno_attrs=self.src_annot,
+ preserve=self.preserve, delimiter=self.annot_delimiter)
src_doc_size = src_parser.store_sentences(src_set, src_doc_size, self.verbose)
- trg_parser = SentenceParser(trg_doc,
- preprocessing=self.preprocess, anno_attrs=self.trg_annot,
- preserve=self.preserve, delimiter=self.annot_delimiter)
+ trg_parser = SentenceParser(
+ trg_doc, preprocessing=self.preprocess, anno_attrs=self.trg_annot,
+ preserve=self.preserve, delimiter=self.annot_delimiter)
trg_doc_size = trg_parser.store_sentences(trg_set, trg_doc_size, self.verbose)
except SentenceParserError as e:
- print('\n'+e.message+'\nContinuing from next sentence file pair.')
+ print('\n'+e.message+'\nContinuing from next sentence file pair.', file=sys.stderr)
continue
- self.add_doc_names(src_doc_name, trg_doc_name,
- self.resultfile, self.mosessrc, self.mosestrg)
+ self.add_doc_names(
+ src_doc_name, trg_doc_name, resultfile, mosessrc, mosestrg)
len_link_list = len(link_list)
for i, link_a in enumerate(link_list):
if self.verbose:
- if i%1000==0 or i+1==len_link_list:
+ if i % 1000 == 0 or i + 1 == len_link_list:
progress = str(round((i+1)/len_link_list*100, 2))
- print("\x1b[2KWriting chunk ... {}%".format(progress), end="\r")
+ print("\x1b[2KWriting chunk ... {}%".format(progress), end="\r", file=sys.stderr)
src_result, trg_result = self.format_pair(
link_a, src_parser, trg_parser, self.fromto)
@@ -303,39 +325,38 @@ def printPairs(self):
link_attr = attrs_list[i] if i < len(attrs_list) else None
- self.out_put_pair(src_result, trg_result, self.resultfile,
- self.mosessrc, self.mosestrg, link_attr, self.id_file,
- src_doc_name, trg_doc_name)
+ self.out_put_pair(
+ src_result, trg_result, resultfile, mosessrc, mosestrg,
+ link_attr, id_file, src_doc_name, trg_doc_name)
total += 1
if total == self.maximum:
stop = True
break
- self.add_doc_ending(self.resultfile)
+ self.add_doc_ending(resultfile)
if self.verbose and self.write:
- print("\033[F\033[F\033[F", end="")
+ print("\033[F\033[F\033[F", end="", file=sys.stderr)
if stop:
break
if self.verbose and self.write:
- print("\n\n")
+ print("\n\n", file=sys.stderr)
- self.add_file_ending(self.resultfile)
+ self.add_file_ending(resultfile)
self.alignmentParser.bp.close_document()
if self.write:
- if self.write_mode == 'moses' and self.mosessrc:
- self.mosessrc.close()
- self.mosestrg.close()
+ if self.write_mode == 'moses' and mosessrc:
+ mosessrc.close()
+ mosestrg.close()
else:
- self.resultfile.close()
+ resultfile.close()
if self.write_ids:
- self.id_file.close()
+ id_file.close()
self.of_handler.close_zipfiles()
-
diff --git a/opustools_pkg/opustools/parse/block_parser.py b/opustools_pkg/opustools/parse/block_parser.py
index 2981033..6867fa0 100644
--- a/opustools_pkg/opustools/parse/block_parser.py
+++ b/opustools_pkg/opustools/parse/block_parser.py
@@ -1,6 +1,9 @@
+import sys
+
import xml.parsers.expat
from ..util import file_open
+
class BlockParserError(Exception):
def __init__(self, message):
@@ -56,7 +59,7 @@ def __init__(self, document, data_tag=None, doc_size=-1):
self.completeBlocks = []
if doc_size == -1:
- print(f'Measuring file "{document.name}" ...', end="\r")
+ print(f'Measuring file "{document.name}" ...', end="\r", file=sys.stderr)
self.document.seek(0, 2)
self.doc_size = self.document.tell()
self.document.seek(0)
@@ -99,7 +102,7 @@ def close_document(self):
def report_progress(self, cur_pos):
progress = str(round(cur_pos/self.doc_size*100, 2) if self.doc_size > 0 else 0)
- print("\x1b[2KParsing file \"{}\" ... {}%".format(self.document.name, progress), end="\r")
+ print("\x1b[2KParsing file \"{}\" ... {}%".format(self.document.name, progress), end="\r", file=sys.stderr)
def get_complete_blocks(self, cur_pos, verbose=False):
"""
diff --git a/opustools_pkg/opustools/parse/sentence_parser.py b/opustools_pkg/opustools/parse/sentence_parser.py
index 26eb7b6..36b53ab 100644
--- a/opustools_pkg/opustools/parse/sentence_parser.py
+++ b/opustools_pkg/opustools/parse/sentence_parser.py
@@ -1,5 +1,8 @@
+import sys
+
from .block_parser import BlockParser, BlockParserError
+
class SentenceParserError(Exception):
def __init__(self, message):
@@ -160,7 +163,7 @@ def store_sentences(self, id_set, doc_size, verbose=False):
bp.close_document()
if verbose:
bp.report_progress(cur_pos)
- print("")
+ print("", file=sys.stderr)
except BlockParserError as e:
raise SentenceParserError(
'Error while parsing sentence file: {error}'.format(error=e.args[0]))
@@ -199,4 +202,3 @@ def read_sentence(self, ids):
attrsList.append(attrs)
return sentence, attrsList
-
diff --git a/opustools_pkg/opustools/readopusdata.py b/opustools_pkg/opustools/readopusdata.py
index ad60e42..8461295 100644
--- a/opustools_pkg/opustools/readopusdata.py
+++ b/opustools_pkg/opustools/readopusdata.py
@@ -5,14 +5,20 @@
from ruamel.yaml import YAML, scanner, reader
+
+logger = logging.getLogger(__name__)
+
+
def read_url(url):
return urllib.request.urlopen(url).read().decode('utf-8').split('\n')
+
def read_url_yaml(url, yaml):
raw = urllib.request.urlopen(url).read().decode('utf-8')
data = yaml.load(raw)
return data
+
def create_table(cur):
create_opusfile_table = '''CREATE TABLE IF NOT EXISTS opusfile (
id integer PRIMARY KEY,
@@ -34,6 +40,7 @@ def create_table(cur):
create_url_index = 'CREATE INDEX IF NOT EXISTS idx_url ON opusfile(url)'
cur.execute(create_url_index)
+
def execute_sql(cur, opusfile):
columns = ['source', 'target', 'corpus', 'preprocessing', 'version', 'url', 'size', 'documents', 'alignment_pairs', 'source_tokens', 'target_tokens', 'latest']
#wheres = [f'{columns[i]}="{opusfile[i]}"' for i in range(6)]
@@ -48,71 +55,73 @@ def execute_sql(cur, opusfile):
sql = f'INSERT INTO opusfile({", ".join(columns)}, updated) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,1)'
cur.execute(sql, opusfile)
+
def get_lang_info(name, data, data_type, info):
source, target, documents, alignment_pairs, source_tokens, target_tokens = '', '', '', '', '', ''
source = name
if data_type in ['bitexts', 'moses', 'tmx']:
names = name.split('-')
if len(names) != 2:
- logging.warning(f'{info} {data_type} {name}: cannot split name "{name}" into two language codes')
+ logger.warning(f'{info} {data_type} {name}: cannot split name "{name}" into two language codes')
else:
source, target = names
documents = ''
if data_type in ['bitexts', 'monolingual']:
documents = data.get('files', '')
if documents == '':
- logging.warning(f'{info} {data_type} {name} is missing "files"')
+ logger.warning(f'{info} {data_type} {name} is missing "files"')
if data_type in ['bitexts', 'moses']:
alignment_pairs = data.get('alignments', '')
if alignment_pairs == '':
- logging.warning(f'{info} {data_type} {name} is missing "alignments"')
+ logger.warning(f'{info} {data_type} {name} is missing "alignments"')
elif data_type == 'tmx':
alignment_pairs = data.get('translation units', '')
if alignment_pairs == '':
- logging.warning(f'{info} {data_type} {name} is missing "translation units"')
+ logger.warning(f'{info} {data_type} {name} is missing "translation units"')
elif data_type == 'monolingual':
alignment_pairs = data.get('sentences', '')
if alignment_pairs == '':
- logging.warning(f'{info} {data_type} {name} is missing "sentences"')
+ logger.warning(f'{info} {data_type} {name} is missing "sentences"')
if data_type == 'monolingual':
source_tokens = data.get('tokens', '')
if source_tokens == '':
- logging.warning(f'{info} {data_type} {name} is missing "tokens"')
+ logger.warning(f'{info} {data_type} {name} is missing "tokens"')
target_tokens = ''
else:
source_tokens = data.get('source language tokens', '')
if source_tokens == '':
- logging.warning(f'{info} {data_type} {name} is missing "source language tokens"')
+ logger.warning(f'{info} {data_type} {name} is missing "source language tokens"')
target_tokens = data.get('target language tokens', '')
if target_tokens == '':
- logging.warning(f'{info} {data_type} {name} is missing "target language tokens"')
+ logger.warning(f'{info} {data_type} {name} is missing "target language tokens"')
return source, target, documents, alignment_pairs, source_tokens, target_tokens
+
def get_size_url_prep(name, data, data_type, info):
size, url, preprocessing = '','',''
if data_type in ['tmx', 'moses']:
size = data.get('download size', '')
if size == '':
- logging.warning(f'{info} {data_type} {name} is missing "download size"')
+ logger.warning(f'{info} {data_type} {name} is missing "download size"')
else:
size = int(int(size)/1024)
url = data.get('download url', '')
if url == '':
- logging.warning(f'{info} {data_type} {name} is missing "download url"')
+ logger.warning(f'{info} {data_type} {name} is missing "download url"')
elif data_type in ['bitexts', 'monolingual']:
size = data.get('size', '')
if size == '':
- logging.warning(f'{info} {data_type} {name} is missing "size"')
+ logger.warning(f'{info} {data_type} {name} is missing "size"')
else:
size = int(int(size)/1024)
url = data.get('url', '')
if url == '':
- logging.warning(f'{info} {data_type} {name} is missing "url"')
+ logger.warning(f'{info} {data_type} {name} is missing "url"')
pre_step = url.split('/')
if len(pre_step) < 2:
- logging.warning(f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"')
+ logger.warning(f'{info} {data_type} {name}: cannot find preprocessing from url "{url}"')
else:
preprocessing = pre_step[-2]
@@ -125,6 +134,7 @@ def get_tmx_entries(corpus, version, latest, tmx, cur, info):
opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest)
execute_sql(cur, opusfile)
+
def get_moses_entries(corpus, version, latest, moses, cur, info):
for item in moses:
source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, moses[item], 'moses', info)
@@ -132,6 +142,7 @@ def get_moses_entries(corpus, version, latest, moses, cur, info):
opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest)
execute_sql(cur, opusfile)
+
def get_monolingual_entries(corpus, version, latest, monolingual, cur, info):
for item in monolingual:
source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, monolingual[item], 'monolingual', info)
@@ -140,6 +151,7 @@ def get_monolingual_entries(corpus, version, latest, monolingual, cur, info):
opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest)
execute_sql(cur, opusfile)
+
def get_bitext_entries(corpus, version, latest, bitexts, cur, info):
for item in bitexts:
source, target, documents, alignment_pairs, source_tokens, target_tokens = get_lang_info(item, bitexts[item], 'bitexts', info)
@@ -150,12 +162,14 @@ def get_bitext_entries(corpus, version, latest, bitexts, cur, info):
opusfile = (source, target, corpus, preprocessing, version, url, size, documents, alignment_pairs, source_tokens, target_tokens, latest)
execute_sql(cur, opusfile)
+
def remove_missing_items(cur):
sql = 'DELETE FROM opusfile WHERE updated=0'
cur.execute(sql)
sql = 'UPDATE opusfile SET updated=0'
cur.execute(sql)
+
def update_db(db_file=None, log_type='errors'):
yaml = YAML()
@@ -183,15 +197,15 @@ def update_db(db_file=None, log_type='errors'):
try:
gen_info = read_url_yaml(URL_BASE + info, yaml)
except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e:
- logging.error(f'{info}, {type(e).__name__}: {e}')
+ logger.error(f'{info}, {type(e).__name__}: {e}')
gen_info = {}
corpus = gen_info.get('name')
if not corpus:
- logging.warning(f'{info}, corpus name missing')
+ logger.warning(f'{info}, corpus name missing')
print(f'Processing corpus {corpus}')
latest_v = gen_info.get('latest_release')
if not latest_v:
- logging.error(f'{info}, latest_release missing')
+ logger.error(f'{info}, latest_release missing')
elif len(info_s) == 3:
version = info_s[1]
if not corpus:
@@ -203,7 +217,7 @@ def update_db(db_file=None, log_type='errors'):
try:
corpus_data = read_url_yaml(URL_BASE + stats, yaml)
except (scanner.ScannerError, urllib.error.HTTPError, reader.ReaderError) as e:
- logging.error(f'{stats}, {type(e).__name__}: {e}')
+ logger.error(f'{stats}, {type(e).__name__}: {e}')
continue
get_entries = {'bitexts': get_bitext_entries,
@@ -212,7 +226,7 @@ def update_db(db_file=None, log_type='errors'):
'tmx': get_tmx_entries}
if not corpus_data:
- logging.error(f'{info}, corpus_data is empty')
+ logger.error(f'{info}, corpus_data is empty')
continue
for item in get_entries.keys():
@@ -220,15 +234,17 @@ def update_db(db_file=None, log_type='errors'):
if sub_data:
get_entries[item](corpus, version, latest, sub_data, cur, info)
else:
- logging.warning(f'{info}, {item} data missing')
+ logger.warning(f'{info}, {item} data missing')
remove_missing_items(cur)
con.commit()
con.close()
+
def main():
update_db()
+
if __name__ == "__main__":
main()
diff --git a/opustools_pkg/tests/test_opus_read.py b/opustools_pkg/tests/test_opus_read.py
index 695d045..76e5ec8 100644
--- a/opustools_pkg/tests/test_opus_read.py
+++ b/opustools_pkg/tests/test_opus_read.py
@@ -1,20 +1,18 @@
+import logging
import os
import unittest
from unittest import mock
import io
import sys
-import xml.parsers.expat
import gzip
import shutil
import zipfile
import tempfile
-import bz2
from opustools import OpusRead, OpusGet
-from opustools.parse.block_parser import BlockParserError
-from opustools.parse.sentence_parser import SentenceParserError
from opustools.parse.alignment_parser import AlignmentParserError
+
def pairPrinterToVariable(**kwargs):
old_stdout = sys.stdout
printout = io.StringIO()
@@ -24,245 +22,237 @@ def pairPrinterToVariable(**kwargs):
sys.stdout = old_stdout
return printout.getvalue()
-def preMosesToVariable(**kwargs):
- old_stdout = sys.stdout
- printout = io.StringIO()
- sys.stdout = printout
- try:
- OpusRead(**kwargs)
- except SystemExit as e:
- pass
- sys.stdout = old_stdout
- return printout.getvalue()
def add_to_root_dir(corpus=None, source=None, target=None,
- version='latest', preprocess=None, root_dir=None):
+ version='latest', preprocess=None, root_dir=None):
OpusGet(directory=corpus, source=source, target=target, release=version,
- preprocess=preprocess, download_dir=root_dir, suppress_prompts=True,
- database='tests/testdata.db').get_files()
+ preprocess=preprocess, download_dir=root_dir, suppress_prompts=True,
+ database='tests/testdata.db').get_files()
source_zip = '{corpus}_{version}_{preprocess}_{source}.zip'.format(
corpus=corpus, version=version, preprocess=preprocess, source=source)
os.rename(os.path.join(root_dir, source_zip),
- os.path.join(root_dir, corpus, version, preprocess, source+'.zip'))
+ os.path.join(root_dir, corpus, version, preprocess, source+'.zip'))
target_zip = '{corpus}_{version}_{preprocess}_{target}.zip'.format(
corpus=corpus, version=version, preprocess=preprocess, target=target)
- os.rename(os.path.join(root_dir,target_zip),
- os.path.join(root_dir, corpus, version, preprocess, target+'.zip'))
+ os.rename(os.path.join(root_dir, target_zip),
+ os.path.join(root_dir, corpus, version, preprocess, target+'.zip'))
alignment_xml = ('{corpus}_{version}_{preprocess}_{source}-'
- '{target}.xml.gz').format(corpus=corpus, version=version,
- preprocess='xml', source=source, target=target)
+ '{target}.xml.gz').format(corpus=corpus, version=version,
+ preprocess='xml', source=source, target=target)
os.rename(os.path.join(root_dir, alignment_xml),
- os.path.join(root_dir, corpus, version, 'xml',
- source+'-'+target+'.xml.gz'))
+ os.path.join(root_dir, corpus, version, 'xml',
+ source+'-'+target+'.xml.gz'))
+
OPUS_TEMP = 'tmp_opus_read_temp'
OPUS_ROOT = 'tmp_opus_read_root'
+
class TestOpusRead(unittest.TestCase):
@classmethod
def setUpClass(self):
if ('OPUS_TEST_SAVE' in os.environ.keys() and
os.path.exists(os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_TEMP))):
- self.tempdir1 = os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_TEMP)
+ self.tempdir1 = os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_TEMP)
else:
self.tempdir1 = tempfile.mkdtemp()
os.mkdir(os.path.join(self.tempdir1, 'test_files'))
os.makedirs(os.path.join(self.tempdir1, 'RF', 'xml', 'en'))
with open(os.path.join(self.tempdir1, 'RF', 'xml', 'en',
- '1996.xml'), 'w') as f:
+ '1996.xml'), 'w') as f:
f.write('\n'
- '\n \n The \n Hound \n of \n the'
- ' \n Baskervilles \n by'
- ' \n Sir \n Arthur \n '
- 'Conan \n Doyle \n Aligned \n by\n : \n András \n '
- 'Farkas \n (\n fully '
- '\n reviewed\n ) \n'
- 'meta>\n\n \n Source&<>"\'\n \n :\n \n manybooks.'
- 'netAudiobook\n available\n \n \n here\n \n\n\n\n\n\n \n Chapter\n '
- '1\n '
- 'Mr.'
- 'w>\n Sherlock\n Holmes\n \n\n\n \n Mr.\n '
- ''
- 'Sherlock\n Holmes\n\n\n\n\n\n '
- '\n I\n \n \n believe\n \n
\n\n\n\n \n '
- '"'
- 'w>\n Excellent\n \n !\n\n \n\n\n
\n '
- '\n\n')
+ '
\n \n The \n Hound \n of \n the'
+ ' \n Baskervilles \n by'
+ ' \n Sir \n Arthur \n '
+ 'Conan \n Doyle \n Aligned \n by\n : \n András \n '
+ 'Farkas \n (\n fully '
+ '\n reviewed\n ) \n'
+ 'meta>\n\n \n Source&<>"\'\n \n :\n \n manybooks.'
+ 'netAudiobook\n available\n \n \n here\n \n\n\n\n\n\n \n Chapter\n '
+ '1\n '
+ 'Mr.'
+ 'w>\n Sherlock\n Holmes\n \n\n\n \n Mr.\n '
+ ''
+ 'Sherlock\n Holmes\n\n\n\n\n\n '
+ '\n I\n \n \n believe\n \n
\n\n\n\n \n '
+ '"'
+ 'w>\n Excellent\n \n !\n\n \n\n\n
\n '
+ '\n\n')
with zipfile.ZipFile(os.path.join(self.tempdir1,
- 'RF_v1_xml_en.zip'), 'w') as zf:
+ 'RF_v1_xml_en.zip'), 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'RF', 'xml', 'en',
- '1996.xml'), arcname='RF/xml/en/1996.xml')
+ '1996.xml'), arcname='RF/xml/en/1996.xml')
os.mkdir(os.path.join(self.tempdir1, 'RF', 'xml', 'sv'))
with open(os.path.join(self.tempdir1, 'RF', 'xml', 'sv',
- '1996.xml'), 'w') as f:
+ '1996.xml'), 'w') as f:
f.write('\n\n \n The Hound of the Baskervilles \n by '
- 'Sir Arthur Conan Doyle \n Aligned by: András Farkas (fully '
- 'reviewed) \n \n \n \n\n '
- 'Source\n : \n Project \n Gutenberg\n\n\n\n Herra \n Sherlock'
- ' \n Holmes\n .\n\n\n Herra \n Sherlock \n Holmes\n'
- 's>\n \n\n Luulenpa \n että \n sinulla \n
\n'
- '\n\n "\n '
- 'Erinomaista\n .\n'
- '
\n \n\n')
+ '>\n \n The Hound of the Baskervilles \n by '
+ 'Sir Arthur Conan Doyle \n Aligned by: András Farkas (fully '
+ 'reviewed) \n \n \n \n\n '
+ 'Source\n : \n Project \n Gutenberg\n\n\n\n Herra \n Sherlock'
+ ' \n Holmes\n .\n\n\n Herra \n Sherlock \n Holmes\n'
+ 's>\n \n\n Luulenpa \n että \n sinulla \n
\n'
+ '\n\n "\n '
+ 'Erinomaista\n .\n'
+ '
\n \n\n')
with zipfile.ZipFile(os.path.join(self.tempdir1,
- 'RF_v1_xml_sv.zip'), 'w') as zf:
+ 'RF_v1_xml_sv.zip'), 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'RF', 'xml', 'sv',
- '1996.xml'), arcname='RF/xml/sv/1996.xml')
+ '1996.xml'), arcname='RF/xml/sv/1996.xml')
shutil.copyfile(os.path.join(self.tempdir1, 'RF_v1_xml_en.zip'),
- os.path.join(self.tempdir1, 'en.zip'))
+ os.path.join(self.tempdir1, 'en.zip'))
shutil.copyfile(os.path.join(self.tempdir1, 'RF_v1_xml_sv.zip'),
- os.path.join(self.tempdir1, 'sv.zip'))
+ os.path.join(self.tempdir1, 'sv.zip'))
with open(os.path.join(self.tempdir1, 'books_alignment.xml'),
- 'w') as f:
+ 'w') as f:
f.write('\n\n'
- '\n\n\n\n\n\n\n \n\n')
+ 'cesAlign PUBLIC "-//CES//DTD XML cesAlign//EN" "">\n'
+ '\n\n\n\n\n\n\n \n\n')
with gzip.open(os.path.join(self.tempdir1,
- 'RF_v1_xml_en-sv.xml.gz'), 'wb') as f:
+ 'RF_v1_xml_en-sv.xml.gz'), 'wb') as f:
with open(os.path.join(self.tempdir1, 'books_alignment.xml'),
- 'rb') as b:
+ 'rb') as b:
f.write(b.read())
with open(os.path.join(self.tempdir1, 'non_alignment.xml'),
- 'w') as f:
+ 'w') as f:
f.write('\n\n'
- '\n\n\n\n\n '
- '\n\n')
+ 'cesAlign PUBLIC "-//CES//DTD XML cesAlign//EN" "">\n'
+ '\n\n\n\n\n '
+ '\n\n')
if ('OPUS_TEST_SAVE' in os.environ.keys() and
os.path.exists(os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_ROOT))):
- self.root_directory = os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_ROOT)
+ self.root_directory = os.path.join(os.environ['OPUS_TEST_SAVE'], OPUS_ROOT)
else:
self.root_directory = tempfile.mkdtemp()
os.makedirs(os.path.join(self.root_directory, 'RF', 'latest',
- 'xml'))
+ 'xml'))
with gzip.open(os.path.join(self.root_directory, 'RF', 'latest',
- 'xml', 'fi-sv.xml.gz'), 'wb') as f:
+ 'xml', 'fi-sv.xml.gz'), 'wb') as f:
with open(os.path.join(self.tempdir1, 'books_alignment.xml'),
- 'rb') as b:
+ 'rb') as b:
f.write(b.read())
with gzip.open(os.path.join(self.root_directory, 'RF', 'latest',
- 'xml', 'ab-cd.xml.gz'), 'wb') as f:
+ 'xml', 'ab-cd.xml.gz'), 'wb') as f:
with open(os.path.join(self.tempdir1, 'books_alignment.xml'),
- 'rb') as b:
+ 'rb') as b:
f.write(b.read())
add_to_root_dir(corpus='RF', source='en', target='sv',
- preprocess='xml', root_dir=self.root_directory)
+ preprocess='xml', root_dir=self.root_directory)
add_to_root_dir(corpus='RF', source='en', target='es',
- preprocess='xml', root_dir=self.root_directory)
+ preprocess='xml', root_dir=self.root_directory)
os.mkdir(os.path.join(self.root_directory, 'RF', 'latest', 'raw'))
add_to_root_dir(corpus='RF', source='en', target='sv',
- preprocess='raw', root_dir=self.root_directory)
+ preprocess='raw', root_dir=self.root_directory)
os.mkdir(os.path.join(self.root_directory, 'RF', 'latest',
- 'parsed'))
+ 'parsed'))
add_to_root_dir(corpus='RF', source='en', target='sv',
- preprocess='parsed', root_dir=self.root_directory)
+ preprocess='parsed', root_dir=self.root_directory)
os.makedirs(os.path.join(self.root_directory, 'RF', 'v1', 'xml'))
add_to_root_dir(corpus='RF', source='en', target='sv',
- version='v1', preprocess='xml', root_dir=self.root_directory)
+ version='v1', preprocess='xml', root_dir=self.root_directory)
add_to_root_dir(corpus='RF', source='en', target='es',
- version='v1', preprocess='xml', root_dir=self.root_directory)
+ version='v1', preprocess='xml', root_dir=self.root_directory)
os.makedirs(os.path.join(self.root_directory, 'OpenSubtitles',
- 'latest', 'raw'))
+ 'latest', 'raw'))
os.makedirs(os.path.join(self.root_directory, 'OpenSubtitles',
- 'latest', 'xml'))
+ 'latest', 'xml'))
add_to_root_dir(corpus='OpenSubtitles', source='eo', target='tl',
- preprocess='raw', root_dir=self.root_directory)
+ preprocess='raw', root_dir=self.root_directory)
add_to_root_dir(corpus='OpenSubtitles', source='eo', target='tl',
- preprocess='xml', root_dir=self.root_directory)
+ preprocess='xml', root_dir=self.root_directory)
os.makedirs(os.path.join(self.root_directory, 'Books',
- 'latest', 'xml'))
+ 'latest', 'xml'))
add_to_root_dir(corpus='Books', source='eo', target='pt',
- preprocess='xml', root_dir=self.root_directory)
+ preprocess='xml', root_dir=self.root_directory)
add_to_root_dir(corpus='RF', source='fr', target='sv',
- preprocess='xml', root_dir=self.root_directory)
+ preprocess='xml', root_dir=self.root_directory)
os.remove(os.path.join(self.root_directory, 'RF', 'latest', 'xml',
- 'fr.zip'))
+ 'fr.zip'))
self.opr = OpusRead(directory='RF', source='en', target='sv',
- root_directory=self.root_directory)
+ root_directory=self.root_directory)
self.maxDiff= None
@@ -283,23 +273,23 @@ def tearDown(self):
def test_normal_xml_write(self):
OpusRead(directory='RF', source='en', target='sv', maximum=2,
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
+ 'r') as f:
self.assertEqual(f.read(),
- '\n# en/1988.xml.gz\n'
- '# sv/1988.xml.gz\n\n'
- '================================\n(src)="s1.1">State'
- 'ment of Government Policy by the Prime Minister , Mr'
- ' Ingvar Carlsson , at the Opening of the Swedish Parl'
- 'iament on Tuesday , 4 October , 1988 .\n(trg)="s1.1"'
- '>REGERINGSFÖRKLARING .\n============================'
- '====\n(src)="s2.1">Your Majesties , Your Royal Highn'
- 'esses , Mr Speaker , Members of the Swedish Parliame'
- 'nt .\n(trg)="s2.1">Eders Majestäter , Eders Kungliga'
- ' Högheter , herr talman , ledamöter av Sveriges riks'
- 'dag !\n================================\n')
+ '\n# en/1988.xml.gz\n'
+ '# sv/1988.xml.gz\n\n'
+ '================================\n(src)="s1.1">State'
+ 'ment of Government Policy by the Prime Minister , Mr'
+ ' Ingvar Carlsson , at the Opening of the Swedish Parl'
+ 'iament on Tuesday , 4 October , 1988 .\n(trg)="s1.1"'
+ '>REGERINGSFÖRKLARING .\n============================'
+ '====\n(src)="s2.1">Your Majesties , Your Royal Highn'
+ 'esses , Mr Speaker , Members of the Swedish Parliame'
+ 'nt .\n(trg)="s2.1">Eders Majestäter , Eders Kungliga'
+ ' Högheter , herr talman , ledamöter av Sveriges riks'
+ 'dag !\n================================\n')
def test_normal_xml_write_link_end_and_linkGrp_end_on_same_line(self):
same_line = os.path.join(self.tempdir1, 'test_files', 'sameline')
@@ -315,12 +305,13 @@ def test_normal_xml_write_link_end_and_linkGrp_end_on_same_line(self):
'\n')
OpusRead(directory='RF', source='en', target='sv',
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- root_directory=self.root_directory,
- alignment_file=same_line).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ root_directory=self.root_directory,
+ alignment_file=same_line).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n# en/1988.xml.gz\n'
'# sv/1988.xml.gz\n'
'\n================================\n'
@@ -340,43 +331,49 @@ def test_normal_xml_write_link_end_and_linkGrp_end_on_same_line(self):
'================================\n')
def test_normal_xml_write_verbose(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, write=[os.path.join(
- self.tempdir1, 'test_files', 'test_result')],
- root_directory=self.root_directory, verbose=True)
- alignment=os.path.join(self.root_directory, 'RF', 'latest', 'xml', 'en-sv.xml.gz')
- self.assertTrue('Parsing file "{}'.format(alignment) in var)
- self.assertTrue('Parsing file "RF/xml/en/1988.xml"' in var)
- self.assertTrue('Parsing file "RF/xml/sv/1988.xml"' in var)
+ with mock.patch('sys.stderr', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, write=[os.path.join(
+ self.tempdir1, 'test_files', 'test_result')],
+ root_directory=self.root_directory, verbose=True).printPairs()
+ alignment=os.path.join(self.root_directory, 'RF', 'latest', 'xml', 'en-sv.xml.gz')
+ var = output.getvalue()
+ self.assertIn('Parsing file "{}'.format(alignment), var)
+ self.assertIn('Parsing file "RF/xml/en/1988.xml"', var)
+ self.assertIn('Parsing file "RF/xml/sv/1988.xml"', var)
def test_normal_xml_print(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, root_directory=self.root_directory)
- self.assertTrue(
- '\n# en/1988.xml.gz\n'
- '# sv/1988.xml.gz\n\n'
- '================================\n(src)="s1.1">State'
- 'ment of Government Policy by the Prime Minister , Mr'
- ' Ingvar Carlsson , at the Opening of the Swedish Parl'
- 'iament on Tuesday , 4 October , 1988 .\n(trg)="s1.1"'
- '>REGERINGSFÖRKLARING .\n============================'
- '====\n' in var)
-
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1988.xml.gz\n'
+ '# sv/1988.xml.gz\n\n'
+ '================================\n(src)="s1.1">State'
+ 'ment of Government Policy by the Prime Minister , Mr'
+ ' Ingvar Carlsson , at the Opening of the Swedish Parl'
+ 'iament on Tuesday , 4 October , 1988 .\n(trg)="s1.1"'
+ '>REGERINGSFÖRKLARING .\n============================'
+ '====\n', var)
def test_normal_xml_print_verbose(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, root_directory=self.root_directory, verbose=True)
- self.assertTrue('Parsing file "'+self.root_directory+'/RF/latest/xml/en-sv.xml.gz"' in var)
- self.assertTrue('Parsing file "RF/xml/en/1988.xml"' in var)
- self.assertTrue('Parsing file "RF/xml/sv/1988.xml"' in var)
+ with mock.patch('sys.stderr', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, root_directory=self.root_directory, verbose=True).printPairs()
+ var = output.getvalue()
+ self.assertIn('Parsing file "'+self.root_directory+'/RF/latest/xml/en-sv.xml.gz"', var)
+ self.assertIn('Parsing file "RF/xml/en/1988.xml"', var)
+ self.assertIn('Parsing file "RF/xml/sv/1988.xml"', var)
def test_normal_raw_write(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- preprocess='raw', root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ preprocess='raw', root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n# en/1988.xml.gz\n'
'# sv/1988.xml.gz\n\n'
'================================\n(src)="s1.1">State'
@@ -386,42 +383,46 @@ def test_normal_raw_write(self):
'>REGERINGSFÖRKLARING.\n============================'
'====\n')
-
def test_normal_raw_print(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, preprocess='raw', root_directory=self.root_directory)
- self.assertTrue(
- '\n# en/1988.xml.gz\n'
- '# sv/1988.xml.gz\n\n'
- '================================\n(src)="s1.1">State'
- 'ment of Government Policy by the Prime Minister, Mr'
- ' Ingvar Carlsson, at the Opening of the Swedish Parl'
- 'iament on Tuesday, 4 October, 1988.\n(trg)="s1.1"'
- '>REGERINGSFÖRKLARING.\n============================'
- '====\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, preprocess='raw', root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1988.xml.gz\n'
+ '# sv/1988.xml.gz\n\n'
+ '================================\n(src)="s1.1">State'
+ 'ment of Government Policy by the Prime Minister, Mr'
+ ' Ingvar Carlsson, at the Opening of the Swedish Parl'
+ 'iament on Tuesday, 4 October, 1988.\n(trg)="s1.1"'
+ '>REGERINGSFÖRKLARING.\n============================'
+ '====\n', var)
def test_normal_raw_print_OpenSubtitles(self):
- var = pairPrinterToVariable(directory='OpenSubtitles', source='eo',
- target='tl', maximum=1, preprocess='raw',
- root_directory=self.root_directory)
- self.assertTrue(
- '\n# eo/2009/1187043/6483790.xml.gz\n'
- '# tl/2009/1187043/6934998.xml.gz\n\n'
- '================================\n'
- '(src)="1">Ĉiuj nomoj, roluloj kaj eventoj reprezentitaj en ĉi '
- 'tiu filmo estas fikciaj.\n'
- '================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='OpenSubtitles', source='eo',
+ target='tl', maximum=1, preprocess='raw',
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# eo/2009/1187043/6483790.xml.gz\n'
+ '# tl/2009/1187043/6934998.xml.gz\n\n'
+ '================================\n'
+ '(src)="1">Ĉiuj nomoj, roluloj kaj eventoj reprezentitaj en ĉi '
+ 'tiu filmo estas fikciaj.\n'
+ '================================\n', var)
def test_normal_parsed_write(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- preprocess='parsed', print_annotations=True,
- source_annotations=['upos', 'feats', 'lemma'],
- target_annotations=['upos', 'feats', 'lemma'],
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- root_directory=self.root_directory).printPairs()
+ preprocess='parsed', print_annotations=True,
+ source_annotations=['upos', 'feats', 'lemma'],
+ target_annotations=['upos', 'feats', 'lemma'],
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n'
'================================'
'\n(src)="s1.1">Statement|NOUN|Number=Sing|statement '
@@ -442,110 +443,115 @@ def test_normal_parsed_write(self):
'=Neut|Number=Sing|Regeringsförklaring .|PUNCT|.'
'\n================================\n')
-
def test_normal_parsed_print(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, preprocess='parsed', print_annotations=True,
- source_annotations=['upos', 'feats', 'lemma'],
- target_annotations=['upos', 'feats', 'lemma'],
- root_directory=self.root_directory)
- self.assertTrue(
- '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n'
- '================================'
- '\n(src)="s1.1">Statement|NOUN|Number=Sing|statement '
- 'of|ADP|of Government|NOUN|Number=Sing|government Pol'
- 'icy|NOUN|Number=Sing|policy by|ADP|by the|DET|Defini'
- 'te=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prim'
- 'e Minister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|P'
- 'ROPN|Number=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar '
- 'Carlsson|PROPN|Number=Sing|Carlsson ,|PUNCT|, at|ADP'
- '|at the|DET|Definite=Def|PronType=Art|the Opening|NO'
- 'UN|Number=Sing|opening of|ADP|of the|DET|Definite=De'
- 'f|PronType=Art|the Swedish|ADJ|Degree=Pos|swedish Pa'
- 'rliament|NOUN|Number=Sing|parliament on|ADP|on Tuesd'
- 'ay|PROPN|Number=Sing|Tuesday ,|PUNCT|, 4|NUM|NumType'
- '=Card|4 October|PROPN|Number=Sing|October ,|PUNCT|, '
- '1988|NUM|NumType=Card|1988 .|PUNCT|.\n(trg)="s1.1">R'
- 'EGERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender'
- '=Neut|Number=Sing|Regeringsförklaring .|PUNCT|.'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, preprocess='parsed', print_annotations=True,
+ source_annotations=['upos', 'feats', 'lemma'],
+ target_annotations=['upos', 'feats', 'lemma'],
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n'
+ '================================'
+ '\n(src)="s1.1">Statement|NOUN|Number=Sing|statement '
+ 'of|ADP|of Government|NOUN|Number=Sing|government Pol'
+ 'icy|NOUN|Number=Sing|policy by|ADP|by the|DET|Defini'
+ 'te=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prim'
+ 'e Minister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|P'
+ 'ROPN|Number=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar '
+ 'Carlsson|PROPN|Number=Sing|Carlsson ,|PUNCT|, at|ADP'
+ '|at the|DET|Definite=Def|PronType=Art|the Opening|NO'
+ 'UN|Number=Sing|opening of|ADP|of the|DET|Definite=De'
+ 'f|PronType=Art|the Swedish|ADJ|Degree=Pos|swedish Pa'
+ 'rliament|NOUN|Number=Sing|parliament on|ADP|on Tuesd'
+ 'ay|PROPN|Number=Sing|Tuesday ,|PUNCT|, 4|NUM|NumType'
+ '=Card|4 October|PROPN|Number=Sing|October ,|PUNCT|, '
+ '1988|NUM|NumType=Card|1988 .|PUNCT|.\n(trg)="s1.1">R'
+ 'EGERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender'
+ '=Neut|Number=Sing|Regeringsförklaring .|PUNCT|.'
+ '\n================================\n', var)
def test_normal_parsed_print_unalphabetical(self):
- var = pairPrinterToVariable(directory='RF', source='sv', target='en',
- maximum=1, preprocess='parsed', print_annotations=True,
- source_annotations=['upos', 'feats', 'lemma'],
- target_annotations=['upos', 'feats', 'lemma'],
- root_directory=self.root_directory)
- self.assertTrue(
- '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n'
- '================================'
- '\n(src)="s1.1">REGERINGSFÖRKLARING|NOUN|Case=Nom|Definit'
- 'e=Ind|Gender=Neut|Number=Sing|Regeringsförklaring .|PUNC'
- 'T|.\n(trg)="s1.1">Statement|NOUN|Number=Sing|statement '
- 'of|ADP|of Government|NOUN|Number=Sing|government Pol'
- 'icy|NOUN|Number=Sing|policy by|ADP|by the|DET|Defini'
- 'te=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prim'
- 'e Minister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|P'
- 'ROPN|Number=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar '
- 'Carlsson|PROPN|Number=Sing|Carlsson ,|PUNCT|, at|ADP'
- '|at the|DET|Definite=Def|PronType=Art|the Opening|NO'
- 'UN|Number=Sing|opening of|ADP|of the|DET|Definite=De'
- 'f|PronType=Art|the Swedish|ADJ|Degree=Pos|swedish Pa'
- 'rliament|NOUN|Number=Sing|parliament on|ADP|on Tuesd'
- 'ay|PROPN|Number=Sing|Tuesday ,|PUNCT|, 4|NUM|NumType'
- '=Card|4 October|PROPN|Number=Sing|October ,|PUNCT|, '
- '1988|NUM|NumType=Card|1988 .|PUNCT|.'
- '\n================================\n' in var)
-
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='sv', target='en',
+ maximum=1, preprocess='parsed', print_annotations=True,
+ source_annotations=['upos', 'feats', 'lemma'],
+ target_annotations=['upos', 'feats', 'lemma'],
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n'
+ '================================'
+ '\n(src)="s1.1">REGERINGSFÖRKLARING|NOUN|Case=Nom|Definit'
+ 'e=Ind|Gender=Neut|Number=Sing|Regeringsförklaring .|PUNC'
+ 'T|.\n(trg)="s1.1">Statement|NOUN|Number=Sing|statement '
+ 'of|ADP|of Government|NOUN|Number=Sing|government Pol'
+ 'icy|NOUN|Number=Sing|policy by|ADP|by the|DET|Defini'
+ 'te=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prim'
+ 'e Minister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|P'
+ 'ROPN|Number=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar '
+ 'Carlsson|PROPN|Number=Sing|Carlsson ,|PUNCT|, at|ADP'
+ '|at the|DET|Definite=Def|PronType=Art|the Opening|NO'
+ 'UN|Number=Sing|opening of|ADP|of the|DET|Definite=De'
+ 'f|PronType=Art|the Swedish|ADJ|Degree=Pos|swedish Pa'
+ 'rliament|NOUN|Number=Sing|parliament on|ADP|on Tuesd'
+ 'ay|PROPN|Number=Sing|Tuesday ,|PUNCT|, 4|NUM|NumType'
+ '=Card|4 October|PROPN|Number=Sing|October ,|PUNCT|, '
+ '1988|NUM|NumType=Card|1988 .|PUNCT|.'
+ '\n================================\n', var)
def test_normal_parsed_print_all_attributes(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, preprocess='parsed', print_annotations=True,
- source_annotations=['all_attrs'], target_annotations=['all_attrs'],
- root_directory=self.root_directory)
- self.assertTrue(
- '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n'
- '================================'
- '\n(src)="s1.1">Statement|root|Number=Sing|0|w1.1.1|state'
- 'ment|NOUN|NOUN of|case|w1.1.4|w1.1.2|of|ADP|ADP Governme'
- 'nt|compound|Number=Sing|w1.1.4|w1.1.3|government|NOUN|NO'
- 'UN Policy|nmod|Number=Sing|w1.1.1|w1.1.4|policy|NOUN|NOU'
- 'N by|case|w1.1.8|w1.1.5|by|ADP|ADP the|det|Definite=Def|'
- 'PronType=Art|w1.1.8|w1.1.6|the|DET|DET Prime|compound|Nu'
- 'mber=Sing|w1.1.8|w1.1.7|Prime|PROPN|PROPN Minister|nmod|'
- 'Number=Sing|w1.1.1|w1.1.8|Minister|SpaceAfter=No|PROPN|P'
- 'ROPN ,|punct|w1.1.8|w1.1.9|,|PUNCT|PUNCT Mr|compound|Num'
- 'ber=Sing|w1.1.12|w1.1.10|Mr|PROPN|PROPN Ingvar|flat|Numb'
- 'er=Sing|w1.1.10|w1.1.11|Ingvar|PROPN|PROPN Carlsson|flat'
- '|Number=Sing|w1.1.8|w1.1.12|Carlsson|SpaceAfter=No|PROPN'
- '|PROPN ,|punct|w1.1.1|w1.1.13|,|PUNCT|PUNCT at|case|w1.1'
- '.16|w1.1.14|at|ADP|ADP the|det|Definite=Def|PronType=Art'
- '|w1.1.16|w1.1.15|the|DET|DET Opening|nmod|Number=Sing|w1'
- '.1.1|w1.1.16|opening|NOUN|NOUN of|case|w1.1.20|w1.1.17|o'
- 'f|ADP|ADP the|det|Definite=Def|PronType=Art|w1.1.20|w1.1'
- '.18|the|DET|DET Swedish|amod|Degree=Pos|w1.1.20|w1.1.19|'
- 'swedish|ADJ|ADJ Parliament|nmod|Number=Sing|w1.1.16|w1.1'
- '.20|parliament|NOUN|NOUN on|case|w1.1.22|w1.1.21|on|ADP|'
- 'ADP Tuesday|nmod|Number=Sing|w1.1.16|w1.1.22|Tuesday|Spa'
- 'ceAfter=No|PROPN|PROPN ,|punct|w1.1.1|w1.1.23|,|PUNCT|PU'
- 'NCT 4|nummod|NumType=Card|w1.1.25|w1.1.24|4|NUM|NUM Octo'
- 'ber|appos|Number=Sing|w1.1.1|w1.1.25|October|SpaceAfter='
- 'No|PROPN|PROPN ,|punct|w1.1.25|w1.1.26|,|PUNCT|PUNCT 198'
- '8|nummod|NumType=Card|w1.1.25|w1.1.27|1988|SpaceAfter=No'
- '|NUM|NUM .|punct|w1.1.1|w1.1.28|.|SpaceAfter=No|PUNCT|PU'
- 'NCT\n(trg)="s1.1">REGERINGSFÖRKLARING|root|Case=Nom|Defini'
- 'te=Ind|Gender=Neut|Number=Sing|0|w1.1.1|Regeringsförklar'
- 'ing|SpaceAfter=No|NOUN|NOUN .|punct|w1.1.1|w1.1.2|.|Spac'
- 'eAfter=No|PUNCT|PUNCT'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, preprocess='parsed', print_annotations=True,
+ source_annotations=['all_attrs'], target_annotations=['all_attrs'],
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n'
+ '================================'
+ '\n(src)="s1.1">Statement|root|Number=Sing|0|w1.1.1|state'
+ 'ment|NOUN|NOUN of|case|w1.1.4|w1.1.2|of|ADP|ADP Governme'
+ 'nt|compound|Number=Sing|w1.1.4|w1.1.3|government|NOUN|NO'
+ 'UN Policy|nmod|Number=Sing|w1.1.1|w1.1.4|policy|NOUN|NOU'
+ 'N by|case|w1.1.8|w1.1.5|by|ADP|ADP the|det|Definite=Def|'
+ 'PronType=Art|w1.1.8|w1.1.6|the|DET|DET Prime|compound|Nu'
+ 'mber=Sing|w1.1.8|w1.1.7|Prime|PROPN|PROPN Minister|nmod|'
+ 'Number=Sing|w1.1.1|w1.1.8|Minister|SpaceAfter=No|PROPN|P'
+ 'ROPN ,|punct|w1.1.8|w1.1.9|,|PUNCT|PUNCT Mr|compound|Num'
+ 'ber=Sing|w1.1.12|w1.1.10|Mr|PROPN|PROPN Ingvar|flat|Numb'
+ 'er=Sing|w1.1.10|w1.1.11|Ingvar|PROPN|PROPN Carlsson|flat'
+ '|Number=Sing|w1.1.8|w1.1.12|Carlsson|SpaceAfter=No|PROPN'
+ '|PROPN ,|punct|w1.1.1|w1.1.13|,|PUNCT|PUNCT at|case|w1.1'
+ '.16|w1.1.14|at|ADP|ADP the|det|Definite=Def|PronType=Art'
+ '|w1.1.16|w1.1.15|the|DET|DET Opening|nmod|Number=Sing|w1'
+ '.1.1|w1.1.16|opening|NOUN|NOUN of|case|w1.1.20|w1.1.17|o'
+ 'f|ADP|ADP the|det|Definite=Def|PronType=Art|w1.1.20|w1.1'
+ '.18|the|DET|DET Swedish|amod|Degree=Pos|w1.1.20|w1.1.19|'
+ 'swedish|ADJ|ADJ Parliament|nmod|Number=Sing|w1.1.16|w1.1'
+ '.20|parliament|NOUN|NOUN on|case|w1.1.22|w1.1.21|on|ADP|'
+ 'ADP Tuesday|nmod|Number=Sing|w1.1.16|w1.1.22|Tuesday|Spa'
+ 'ceAfter=No|PROPN|PROPN ,|punct|w1.1.1|w1.1.23|,|PUNCT|PU'
+ 'NCT 4|nummod|NumType=Card|w1.1.25|w1.1.24|4|NUM|NUM Octo'
+ 'ber|appos|Number=Sing|w1.1.1|w1.1.25|October|SpaceAfter='
+ 'No|PROPN|PROPN ,|punct|w1.1.25|w1.1.26|,|PUNCT|PUNCT 198'
+ '8|nummod|NumType=Card|w1.1.25|w1.1.27|1988|SpaceAfter=No'
+ '|NUM|NUM .|punct|w1.1.1|w1.1.28|.|SpaceAfter=No|PUNCT|PU'
+ 'NCT\n(trg)="s1.1">REGERINGSFÖRKLARING|root|Case=Nom|Defini'
+ 'te=Ind|Gender=Neut|Number=Sing|0|w1.1.1|Regeringsförklar'
+ 'ing|SpaceAfter=No|NOUN|NOUN .|punct|w1.1.1|w1.1.2|.|Spac'
+ 'eAfter=No|PUNCT|PUNCT'
+ '\n================================\n', var)
def test_tmx_xml_write(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- write_mode='tmx', root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ write_mode='tmx', root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n'
'\n\n\t\n\t\t'
@@ -558,11 +564,12 @@ def test_tmx_xml_write(self):
def test_tmx_xml_write_unalphabetical(self):
OpusRead(directory='RF', source='sv', target='en', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- write_mode='tmx', root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ write_mode='tmx', root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n'
'\n\n\t\n\t\t'
@@ -574,54 +581,61 @@ def test_tmx_xml_write_unalphabetical(self):
'day , 4 October , 1988 .'
'\n\t\t\n\t\n\n')
-
def test_tmx_xml_print(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, write_mode='tmx', root_directory=self.root_directory)
- self.assertTrue(
- '\n'
- '\n\n\t' in var)
- self.assertTrue(''
- '\n\t\t\tStatement of Governm'
- 'ent Policy by the Prime Minister , Mr Ingvar Carlsso'
- 'n , at the Opening of the Swedish Parliament on Tues'
- 'day , 4 October , 1988 .'
- '\n\t\t\tREGERING'
- 'SFÖRKLARING .\n\t\t\n\t\n\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, write_mode='tmx', root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n'
+ '\n\n\t', var)
+ self.assertIn(
+ ''
+ '\n\t\t\tStatement of Governm'
+ 'ent Policy by the Prime Minister , Mr Ingvar Carlsso'
+ 'n , at the Opening of the Swedish Parliament on Tues'
+ 'day , 4 October , 1988 .'
+ '\n\t\t\tREGERING'
+ 'SFÖRKLARING .\n\t\t\n\t\n\n', var)
def test_tmx_xml_print_verbose(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, write_mode='tmx', root_directory=self.root_directory,
- verbose=True)
- self.assertTrue('Parsing file "'+self.root_directory+'/RF/latest/xml/en-sv.xml.gz"' in var)
- self.assertTrue('Parsing file "RF/xml/en/1988.xml"' in var)
- self.assertTrue('Parsing file "RF/xml/sv/1988.xml"' in var)
+ with mock.patch('sys.stderr', new=io.StringIO()) as output:
+ var = pairPrinterToVariable(
+ directory='RF', source='en', target='sv',
+ maximum=1, write_mode='tmx', root_directory=self.root_directory,
+ verbose=True)
+ var = output.getvalue()
+ self.assertIn('Parsing file "'+self.root_directory+'/RF/latest/xml/en-sv.xml.gz"', var)
+ self.assertIn('Parsing file "RF/xml/en/1988.xml"', var)
+ self.assertIn('Parsing file "RF/xml/sv/1988.xml"', var)
def test_tmx_xml_print_unalphabetical(self):
- var = pairPrinterToVariable(directory='RF', source='sv', target='en',
- maximum=1, write_mode='tmx', root_directory=self.root_directory)
- self.assertTrue(
- '\n'
- '\n\n\t\n\t\t'
- '\n\t\t\tREGERING'
- 'SFÖRKLARING .\n\t\t\tStatement of Governm'
- 'ent Policy by the Prime Minister , Mr Ingvar Carlsso'
- 'n , at the Opening of the Swedish Parliament on Tues'
- 'day , 4 October , 1988 .'
- '\n\t\t\n\t\n\n')
-
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='sv', target='en',
+ maximum=1, write_mode='tmx', root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n'
+ '\n\n\t\n\t\t'
+ '\n\t\t\tREGERING'
+ 'SFÖRKLARING .\n\t\t\tStatement of Governm'
+ 'ent Policy by the Prime Minister , Mr Ingvar Carlsso'
+ 'n , at the Opening of the Swedish Parliament on Tues'
+ 'day , 4 October , 1988 .'
+ '\n\t\t\n\t\n\n', var)
def test_tmx_raw_write(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write_mode='tmx',
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- preprocess='raw', root_directory=self.root_directory).printPairs()
+ write_mode='tmx',
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ preprocess='raw', root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n'
'\n\n\t\n\t\t'
@@ -633,32 +647,34 @@ def test_tmx_raw_write(self):
'SFÖRKLARING.\n\t\t\n\t\n\n')
def test_tmx_raw_print(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, write_mode='tmx', preprocess='raw',
- root_directory=self.root_directory)
- self.assertTrue(
- '\n'
- '\n\n\t' in var)
- self.assertTrue(
- 'Statement of Governm'
- 'ent Policy by the Prime Minister, Mr Ingvar Carlsso'
- 'n, at the Opening of the Swedish Parliament on Tues'
- 'day, 4 October, 1988.'
- '\n\t\t\tREGERING'
- 'SFÖRKLARING.\n\t\t\n\t\n\n' in var)
-
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, write_mode='tmx', preprocess='raw',
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n'
+ '\n\n\t', var)
+ self.assertIn(
+ 'Statement of Governm'
+ 'ent Policy by the Prime Minister, Mr Ingvar Carlsso'
+ 'n, at the Opening of the Swedish Parliament on Tues'
+ 'day, 4 October, 1988.'
+ '\n\t\t\tREGERING'
+ 'SFÖRKLARING.\n\t\t\n\t\n\n', var)
def test_tmx_parsed_write(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- write_mode='tmx', preprocess='parsed', print_annotations=True,
- source_annotations=['upos', 'feats', 'lemma'],
- target_annotations=['upos', 'feats', 'lemma'],
- root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ write_mode='tmx', preprocess='parsed', print_annotations=True,
+ source_annotations=['upos', 'feats', 'lemma'],
+ target_annotations=['upos', 'feats', 'lemma'],
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n'
'\n\n\t\n\t\t'
@@ -682,118 +698,122 @@ def test_tmx_parsed_write(self):
'\n\t\t\n\t\n\n')
def test_tmx_parsed_print(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, write_mode='tmx', preprocess='parsed',
- print_annotations=True,
- source_annotations=['upos', 'feats', 'lemma'],
- target_annotations=['upos', 'feats', 'lemma'],
- root_directory=self.root_directory)
- self.assertTrue(
- '\n'
- '\n\n\t' in var)
- self.assertTrue(''
- '\n\t\t\tStatement|NOUN|Numbe'
- 'r=Sing|statement of|ADP|of Government|NOUN|Number=Si'
- 'ng|government Policy|NOUN|Number=Sing|policy by|ADP|'
- 'by the|DET|Definite=Def|PronType=Art|the Prime|PROPN'
- '|Number=Sing|Prime Minister|PROPN|Number=Sing|Minist'
- 'er ,|PUNCT|, Mr|PROPN|Number=Sing|Mr Ingvar|PROPN|Nu'
- 'mber=Sing|Ingvar Carlsson|PROPN|Number=Sing|Carlsson '
- ',|PUNCT|, at|ADP|at the|DET|Definite=Def|PronType=Ar'
- 't|the Opening|NOUN|Number=Sing|opening of|ADP|of the'
- '|DET|Definite=Def|PronType=Art|the Swedish|ADJ|Degre'
- 'e=Pos|swedish Parliament|NOUN|Number=Sing|parliament '
- 'on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PUNCT|'
- ', 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct'
- 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.<'
- '/seg>\n\t\t\tREGERINGS'
- 'FÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Neut|Nu'
- 'mber=Sing|Regeringsförklaring .|PUNCT|.'
- '\n\t\t\n\t\n\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, write_mode='tmx', preprocess='parsed',
+ print_annotations=True,
+ source_annotations=['upos', 'feats', 'lemma'],
+ target_annotations=['upos', 'feats', 'lemma'],
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n'
+ '\n\n\t', var)
+ self.assertIn(
+ ''
+ '\n\t\t\tStatement|NOUN|Numbe'
+ 'r=Sing|statement of|ADP|of Government|NOUN|Number=Si'
+ 'ng|government Policy|NOUN|Number=Sing|policy by|ADP|'
+ 'by the|DET|Definite=Def|PronType=Art|the Prime|PROPN'
+ '|Number=Sing|Prime Minister|PROPN|Number=Sing|Minist'
+ 'er ,|PUNCT|, Mr|PROPN|Number=Sing|Mr Ingvar|PROPN|Nu'
+ 'mber=Sing|Ingvar Carlsson|PROPN|Number=Sing|Carlsson '
+ ',|PUNCT|, at|ADP|at the|DET|Definite=Def|PronType=Ar'
+ 't|the Opening|NOUN|Number=Sing|opening of|ADP|of the'
+ '|DET|Definite=Def|PronType=Art|the Swedish|ADJ|Degre'
+ 'e=Pos|swedish Parliament|NOUN|Number=Sing|parliament '
+ 'on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PUNCT|'
+ ', 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct'
+ 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.<'
+ '/seg>\n\t\t\tREGERINGS'
+ 'FÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Neut|Nu'
+ 'mber=Sing|Regeringsförklaring .|PUNCT|.'
+ '\n\t\t\n\t\n\n', var)
def test_moses_xml_write(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test.src'),
- os.path.join(self.tempdir1, 'test_files', 'test.trg')],
- write_mode='moses', root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test.src'),
+ os.path.join(self.tempdir1, 'test_files', 'test.trg')],
+ write_mode='moses', root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test.src'),
- 'r') as f:
+ 'r') as f:
self.assertEqual(f.read(),
- 'Statement of Government Policy by the Prime Minister , '
- 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
- 'ament on Tuesday , 4 October , 1988 .\n')
+ 'Statement of Government Policy by the Prime Minister , '
+ 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
+ 'ament on Tuesday , 4 October , 1988 .\n')
with open(os.path.join(self.tempdir1, 'test_files', 'test.trg'),
- 'r') as f:
+ 'r') as f:
self.assertEqual(f.read(), 'REGERINGSFÖRKLARING .\n')
def test_moses_xml_write_unalphabetical(self):
OpusRead(directory='RF', source='sv', target='en', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test.src'),
- os.path.join(self.tempdir1, 'test_files', 'test.trg')],
- write_mode='moses', root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test.src'),
+ os.path.join(self.tempdir1, 'test_files', 'test.trg')],
+ write_mode='moses', root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test.trg'),
- 'r') as f:
+ 'r') as f:
self.assertEqual(f.read(),
- 'Statement of Government Policy by the Prime Minister , '
- 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
- 'ament on Tuesday , 4 October , 1988 .\n')
+ 'Statement of Government Policy by the Prime Minister , '
+ 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
+ 'ament on Tuesday , 4 October , 1988 .\n')
with open(os.path.join(self.tempdir1, 'test_files', 'test.src'),
- 'r') as f:
+ 'r') as f:
self.assertEqual(f.read(), 'REGERINGSFÖRKLARING .\n')
def test_moses_xml_write_with_file_names(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test.src'),
- os.path.join(self.tempdir1, 'test_files', 'test.trg')],
- write_mode='moses', print_file_names=True,
- root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test.src'),
+ os.path.join(self.tempdir1, 'test_files', 'test.trg')],
+ write_mode='moses', print_file_names=True,
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test.src'),
- 'r') as f:
+ 'r') as f:
self.assertEqual(f.read(),
- '\nen/1988.xml.gz\n\nStatement of Gover'
- 'nment Policy by the Prime Minister , '
- 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
- 'ament on Tuesday , 4 October , 1988 .\n')
+ '\nen/1988.xml.gz\n\nStatement of Gover'
+ 'nment Policy by the Prime Minister , '
+ 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
+ 'ament on Tuesday , 4 October , 1988 .\n')
with open(os.path.join(self.tempdir1, 'test_files', 'test.trg'),
- 'r') as f:
+ 'r') as f:
self.assertEqual(f.read(),
- '\nsv/1988.xml.gz\n\nREGERINGSFÖRKLARING .\n')
+ '\nsv/1988.xml.gz\n\nREGERINGSFÖRKLARING .\n')
def test_moses_xml_write_single_file(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test.src')],
- write_mode='moses', root_directory=self.root_directory
- ).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test.src')],
+ write_mode='moses', root_directory=self.root_directory
+ ).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test.src'),
- 'r') as f:
+ 'r') as f:
self.assertEqual(f.read(),
- 'Statement of Government Policy by the Prime Minister , '
- 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
- 'ament on Tuesday , 4 October , 1988 .\tREGERINGSFÖRK'
- 'LARING .\n')
+ 'Statement of Government Policy by the Prime Minister , '
+ 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
+ 'ament on Tuesday , 4 October , 1988 .\tREGERINGSFÖRK'
+ 'LARING .\n')
def test_moses_xml_write_single_file_unalphabetical(self):
OpusRead(directory='RF', source='sv', target='en', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test.src')],
- write_mode='moses', root_directory=self.root_directory
- ).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test.src')],
+ write_mode='moses', root_directory=self.root_directory
+ ).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test.src'),
- 'r') as f:
+ 'r') as f:
self.assertEqual(f.read(),
- 'REGERINGSFÖRKLARING .\tStatement of Government Poli'
- 'cy by the Prime Minister , Mr Ingvar Carlsson , at t'
- 'he Opening of the Swedish Parliament on Tuesday , 4 '
- 'October , 1988 .\n')
+ 'REGERINGSFÖRKLARING .\tStatement of Government Poli'
+ 'cy by the Prime Minister , Mr Ingvar Carlsson , at t'
+ 'he Opening of the Swedish Parliament on Tuesday , 4 '
+ 'October , 1988 .\n')
def test_moses_xml_write_single_file_with_file_names(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test.src')],
- write_mode='moses', print_file_names=True,
- root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test.src')],
+ write_mode='moses', print_file_names=True,
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test.src'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\nen/1988.xml.gz\nsv/1988'
'.xml.gz\n\nStatement of Government Policy by'
' the Prime Minister , Mr Ingvar Carlsson , at the Ope'
@@ -802,13 +822,14 @@ def test_moses_xml_write_single_file_with_file_names(self):
def test_moses_xml_write_single_file_with_file_names_unalphabetical(self):
OpusRead(directory='RF', source='sv', target='en', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test.src')],
- write_mode='moses', print_file_names=True,
- root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test.src')],
+ write_mode='moses', print_file_names=True,
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1,
- 'test_files', 'test.src'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'test_files', 'test.src'),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\nen/1988.xml.gz\nsv/1988'
'.xml.gz\n\nREGERINGSFÖRKLARING .\tStatement '
'of Government Policy by the Prime Minister , Mr Ingv'
@@ -816,119 +837,134 @@ def test_moses_xml_write_single_file_with_file_names_unalphabetical(self):
'nt on Tuesday , 4 October , 1988 .\n')
def test_moses_xml_print(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, write_mode='moses', root_directory=self.root_directory)
- self.assertTrue(
- 'Statement of Government Policy by the Prime Minister , '
- 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
- 'ament on Tuesday , 4 October , 1988 .\t'
- 'REGERINGSFÖRKLARING .\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, write_mode='moses', root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ 'Statement of Government Policy by the Prime Minister , '
+ 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
+ 'ament on Tuesday , 4 October , 1988 .\t'
+ 'REGERINGSFÖRKLARING .\n', var)
def test_moses_xml_print_unalphabetical(self):
- var = pairPrinterToVariable(directory='RF', source='sv', target='en',
- maximum=1, write_mode='moses', root_directory=self.root_directory)
- self.assertTrue(
- 'REGERINGSFÖRKLARING .\tStatement of Government Policy b'
- 'y the Prime Minister , Mr Ingvar Carlsson , at the Openi'
- 'ng of the Swedish Parliament on Tuesday , 4 October , 1988 .\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='sv', target='en',
+ maximum=1, write_mode='moses', root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ 'REGERINGSFÖRKLARING .\tStatement of Government Policy b'
+ 'y the Prime Minister , Mr Ingvar Carlsson , at the Openi'
+ 'ng of the Swedish Parliament on Tuesday , 4 October , 1988 .\n', var)
def test_moses_xml_print_with_file_names(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, write_mode='moses', print_file_names=True,
- root_directory=self.root_directory)
- self.assertTrue(
- '\nen/1988.xml.gz\nsv/1988'
- '.xml.gz\n\nStatement of Government Policy by'
- ' the Prime Minister , Mr Ingvar Carlsson , at the Ope'
- 'ning of the Swedish Parliament on Tuesday , 4 Octobe'
- 'r , 1988 .\tREGERINGSFÖRKLARING .\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, write_mode='moses', print_file_names=True,
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\nen/1988.xml.gz\nsv/1988'
+ '.xml.gz\n\nStatement of Government Policy by'
+ ' the Prime Minister , Mr Ingvar Carlsson , at the Ope'
+ 'ning of the Swedish Parliament on Tuesday , 4 Octobe'
+ 'r , 1988 .\tREGERINGSFÖRKLARING .\n', var)
def test_moses_raw_write(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write_mode='moses',
- write=[os.path.join(self.tempdir1, 'test_files', 'test.src'),
- os.path.join(self.tempdir1, 'test_files', 'test.trg')],
- preprocess='raw', root_directory=self.root_directory).printPairs()
+ write_mode='moses',
+ write=[os.path.join(self.tempdir1, 'test_files', 'test.src'),
+ os.path.join(self.tempdir1, 'test_files', 'test.trg')],
+ preprocess='raw', root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1,
- 'test_files', 'test.src'), 'r') as f:
- self.assertEqual(f.read(),
- 'Statement of Government Policy by the Prime Minister, '
- 'Mr Ingvar Carlsson, at the Opening of the Swedish Parli'
- 'ament on Tuesday, 4 October, 1988.\n')
+ 'test_files', 'test.src'), 'r') as f:
+ self.assertEqual(
+ f.read(),
+ 'Statement of Government Policy by the Prime Minister, '
+ 'Mr Ingvar Carlsson, at the Opening of the Swedish Parli'
+ 'ament on Tuesday, 4 October, 1988.\n')
with open(os.path.join(self.tempdir1,
- 'test_files', 'test.trg'), 'r') as f:
+ 'test_files', 'test.trg'), 'r') as f:
self.assertEqual(f.read(), 'REGERINGSFÖRKLARING.\n')
def test_moses_raw_print(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, write_mode='moses', preprocess='raw',
- root_directory=self.root_directory)
- self.assertTrue(
- 'Statement of Government Policy by the Prime Minister, '
- 'Mr Ingvar Carlsson, at the Opening of the Swedish Parli'
- 'ament on Tuesday, 4 October, 1988.\t'
- 'REGERINGSFÖRKLARING.\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, write_mode='moses', preprocess='raw',
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ 'Statement of Government Policy by the Prime Minister, '
+ 'Mr Ingvar Carlsson, at the Opening of the Swedish Parli'
+ 'ament on Tuesday, 4 October, 1988.\t'
+ 'REGERINGSFÖRKLARING.\n', var)
+
def test_moses_parsed_write(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test.src'),
- os.path.join(self.tempdir1, 'test_files', 'test.trg')],
- write_mode='moses', preprocess='parsed', print_annotations=True,
- source_annotations=['upos', 'feats', 'lemma'],
- target_annotations=['upos', 'feats', 'lemma'],
- root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test.src'),
+ os.path.join(self.tempdir1, 'test_files', 'test.trg')],
+ write_mode='moses', preprocess='parsed', print_annotations=True,
+ source_annotations=['upos', 'feats', 'lemma'],
+ target_annotations=['upos', 'feats', 'lemma'],
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test.src'),
- 'r') as f:
- self.assertEqual(f.read(), 'Statement|NOUN|Number=Sing|st'
- 'atement of|ADP|of Government|NOUN|Number=Sing|government'
- ' Policy|NOUN|Number=Sing|policy by|ADP|by the|DET|Definit'
- 'e=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prime Min'
- 'ister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|PROPN|Numb'
- 'er=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar Carlsson|PROP'
- 'N|Number=Sing|Carlsson ,|PUNCT|, at|ADP|at the|DET|Defin'
- 'ite=Def|PronType=Art|the Opening|NOUN|Number=Sing|openin'
- 'g of|ADP|of the|DET|Definite=Def|PronType=Art|the Swedis'
- 'h|ADJ|Degree=Pos|swedish Parliament|NOUN|Number=Sing|par'
- 'liament on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PU'
- 'NCT|, 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct'
- 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.\n')
+ 'r') as f:
+ self.assertEqual(
+ f.read(), 'Statement|NOUN|Number=Sing|st'
+ 'atement of|ADP|of Government|NOUN|Number=Sing|government'
+ ' Policy|NOUN|Number=Sing|policy by|ADP|by the|DET|Definit'
+ 'e=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prime Min'
+ 'ister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|PROPN|Numb'
+ 'er=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar Carlsson|PROP'
+ 'N|Number=Sing|Carlsson ,|PUNCT|, at|ADP|at the|DET|Defin'
+ 'ite=Def|PronType=Art|the Opening|NOUN|Number=Sing|openin'
+ 'g of|ADP|of the|DET|Definite=Def|PronType=Art|the Swedis'
+ 'h|ADJ|Degree=Pos|swedish Parliament|NOUN|Number=Sing|par'
+ 'liament on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PU'
+ 'NCT|, 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct'
+ 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.\n')
with open(os.path.join(self.tempdir1,
- 'test_files', 'test.trg'),
- 'r') as f:
- self.assertEqual(f.read(),
- 'REGERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Ne'
- 'ut|Number=Sing|Regeringsförklaring .|PUNCT|.\n')
+ 'test_files', 'test.trg'),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
+ 'REGERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Ne'
+ 'ut|Number=Sing|Regeringsförklaring .|PUNCT|.\n')
def test_moses_parsed_print(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, write_mode='moses', preprocess='parsed',
- print_annotations=True,
- source_annotations=['upos', 'feats', 'lemma'],
- target_annotations=['upos', 'feats', 'lemma'],
- root_directory=self.root_directory)
- self.assertTrue(
- 'Statement|NOUN|Number=Sing|st'
- 'atement of|ADP|of Government|NOUN|Number=Sing|government'
- ' Policy|NOUN|Number=Sing|policy by|ADP|by the|DET|Definit'
- 'e=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prime Min'
- 'ister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|PROPN|Numb'
- 'er=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar Carlsson|PROP'
- 'N|Number=Sing|Carlsson ,|PUNCT|, at|ADP|at the|DET|Defin'
- 'ite=Def|PronType=Art|the Opening|NOUN|Number=Sing|openin'
- 'g of|ADP|of the|DET|Definite=Def|PronType=Art|the Swedis'
- 'h|ADJ|Degree=Pos|swedish Parliament|NOUN|Number=Sing|par'
- 'liament on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PU'
- 'NCT|, 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct'
- 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.\tREG'
- 'ERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Ne'
- 'ut|Number=Sing|Regeringsförklaring .|PUNCT|.\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, write_mode='moses', preprocess='parsed',
+ print_annotations=True,
+ source_annotations=['upos', 'feats', 'lemma'],
+ target_annotations=['upos', 'feats', 'lemma'],
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ 'Statement|NOUN|Number=Sing|st'
+ 'atement of|ADP|of Government|NOUN|Number=Sing|government'
+ ' Policy|NOUN|Number=Sing|policy by|ADP|by the|DET|Definit'
+ 'e=Def|PronType=Art|the Prime|PROPN|Number=Sing|Prime Min'
+ 'ister|PROPN|Number=Sing|Minister ,|PUNCT|, Mr|PROPN|Numb'
+ 'er=Sing|Mr Ingvar|PROPN|Number=Sing|Ingvar Carlsson|PROP'
+ 'N|Number=Sing|Carlsson ,|PUNCT|, at|ADP|at the|DET|Defin'
+ 'ite=Def|PronType=Art|the Opening|NOUN|Number=Sing|openin'
+ 'g of|ADP|of the|DET|Definite=Def|PronType=Art|the Swedis'
+ 'h|ADJ|Degree=Pos|swedish Parliament|NOUN|Number=Sing|par'
+ 'liament on|ADP|on Tuesday|PROPN|Number=Sing|Tuesday ,|PU'
+ 'NCT|, 4|NUM|NumType=Card|4 October|PROPN|Number=Sing|Oct'
+ 'ober ,|PUNCT|, 1988|NUM|NumType=Card|1988 .|PUNCT|.\tREG'
+ 'ERINGSFÖRKLARING|NOUN|Case=Nom|Definite=Ind|Gender=Ne'
+ 'ut|Number=Sing|Regeringsförklaring .|PUNCT|.\n', var)
def test_links_write(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- write_mode='links', root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ write_mode='links', root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n'
'\n\n '
@@ -939,12 +975,13 @@ def test_links_write(self):
def test_links_write_unalphabetical(self):
OpusRead(directory='RF', source='sv', target='en',
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- write_mode='links', src_range='1-5', tgt_range='2',
- root_directory=self.root_directory).printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ write_mode='links', src_range='1-5', tgt_range='2',
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n'
'\n\n '
@@ -956,60 +993,66 @@ def test_links_write_unalphabetical(self):
' \n\n')
def test_links_print(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, write_mode='links', root_directory=self.root_directory)
- self.assertTrue(
- '\n'
- '\n\n '
- '\n'
- '\n \n\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, write_mode='links', root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n'
+ '\n\n '
+ '\n'
+ '\n \n\n', var)
def test_links_print_unalphabetical(self):
- var = pairPrinterToVariable(directory='RF', source='sv', target='en',
- write_mode='links', src_range='1', tgt_range='2',
- root_directory=self.root_directory)
- self.assertTrue(
- '\n'
- '\n\n '
- '\n'
- '\n'
- ' \n'
- ' \n'
- ' \n\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='sv', target='en',
+ write_mode='links', src_range='1', tgt_range='2',
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n'
+ '\n\n '
+ '\n'
+ '\n'
+ ' \n'
+ ' \n'
+ ' \n\n', var)
def test_iteration_stops_at_the_end_of_the_document_even_if_max_is_not_filled(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- src_range='2', tgt_range='1', maximum=5,
- root_directory=self.root_directory)
- self.assertTrue(
- """\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n=============="""
- """==================\n(src)="s4.4">The army will be reor"""
- """ganized with the aim of making it more effective .\n("""
- """src)="s4.5">It is the Government 's intention to seek """
- """broad solutions in issues that are of importance for o"""
- """ur national security .\n(trg)="s4.4">Det är regeringe"""
- """ns föresats att söka breda lösningar i frågor som är a"""
- """v betydelse för vår nationella säkerhet .\n=========="""
- """======================""" in var)
- self.assertTrue("""# en/1996.xml.gz\n# sv/1996"""
- """.xml.gz\n\n================================\n""" in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ src_range='2', tgt_range='1', maximum=5,
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ """\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n=============="""
+ """==================\n(src)="s4.4">The army will be reor"""
+ """ganized with the aim of making it more effective .\n("""
+ """src)="s4.5">It is the Government 's intention to seek """
+ """broad solutions in issues that are of importance for o"""
+ """ur national security .\n(trg)="s4.4">Det är regeringe"""
+ """ns föresats att söka breda lösningar i frågor som är a"""
+ """v betydelse för vår nationella säkerhet .\n=========="""
+ """======================""", var)
+ self.assertIn("""# en/1996.xml.gz\n# sv/1996"""
+ """.xml.gz\n\n================================\n""", var)
def test_use_given_sentence_alignment_file(self):
OpusRead(directory='Books', source='eo', target='pt', src_range='2',
- tgt_range='2', maximum=1, write_mode='links',
- write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
- root_directory=self.root_directory).printPairs()
+ tgt_range='2', maximum=1, write_mode='links',
+ write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
+ root_directory=self.root_directory).printPairs()
var = pairPrinterToVariable(
directory='Books', source='eo', target='pt',
alignment_file=os.path.join(self.tempdir1, 'test_files',
- 'testlinks'),
+ 'testlinks'),
root_directory=self.root_directory)
- self.assertTrue(
+ self.assertIn(
'\n# eo/Carroll_Lewis-Alice_in_wonderland.xml.gz\n'
'# pt/Carroll_Lewis-Alice_in_wonderland.xml.gz\n\n======='
'=========================\n'
@@ -1032,17 +1075,17 @@ def test_use_given_sentence_alignment_file(self):
', correu através do campo atrás dele e felizmente chegou '
'bem a tempo de o ver pular para dentro de uma grande toc'
'a de coelho debaixo da cerca .\n========================'
- '========\n' in var)
+ '========\n', var)
def test_use_given_sentence_alignment_file_with_lingGrp_end_tag_on_the_same_line_as_link_tag(self):
OpusRead(directory='RF', source='en', target='sv', src_range='2',
- tgt_range='1', write_mode='links',
- write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
- root_directory=self.root_directory).printPairs()
+ tgt_range='1', write_mode='links',
+ write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
+ root_directory=self.root_directory).printPairs()
var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- alignment_file=os.path.join(self.tempdir1, 'test_files',
- 'testlinks'), root_directory=self.root_directory)
- self.assertTrue(
+ alignment_file=os.path.join(self.tempdir1, 'test_files',
+ 'testlinks'), root_directory=self.root_directory)
+ self.assertIn(
"""\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n=============="""
"""==================\n(src)="s4.4">The army will be reor"""
"""ganized with the aim of making it more effective .\n("""
@@ -1051,85 +1094,87 @@ def test_use_given_sentence_alignment_file_with_lingGrp_end_tag_on_the_same_line
"""ur national security .\n(trg)="s4.4">Det är regeringe"""
"""ns föresats att söka breda lösningar i frågor som är a"""
"""v betydelse för vår nationella säkerhet .\n=========="""
- """======================\n""" in var)
- self.assertTrue("""# en/1996.xml.gz\n# sv/1996"""
- """.xml.gz\n\n================================\n""" in var)
+ """======================\n""", var)
+ self.assertIn("""# en/1996.xml.gz\n# sv/1996"""
+ """.xml.gz\n\n================================\n""", var)
def test_use_given_sentence_alignment_file_and_print_links(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write_mode='links',
- write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
- root_directory=self.root_directory).printPairs()
+ write_mode='links',
+ write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
+ root_directory=self.root_directory).printPairs()
var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- write_mode='links',
- alignment_file=os.path.join(self.tempdir1, 'test_files',
- 'testlinks'),
- root_directory=self.root_directory)
- self.assertTrue(''
- '\n\n\n \n\n \n<'
- '/cesAlign>\n' in var)
+ write_mode='links',
+ alignment_file=os.path.join(self.tempdir1, 'test_files',
+ 'testlinks'),
+ root_directory=self.root_directory)
+ self.assertIn(''
+ '\n\n\n \n\n \n<'
+ '/cesAlign>\n', var)
def test_use_given_sentence_alignment_file_and_write_links(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write_mode='links',
- write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
- root_directory=self.root_directory).printPairs()
+ write_mode='links',
+ write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
+ root_directory=self.root_directory).printPairs()
OpusRead(directory='RF', source='en', target='sv', write_mode='links',
- alignment_file=os.path.join(self.tempdir1, 'test_files',
- 'testlinks'),
- write=[os.path.join(self.tempdir1, 'test_files', 'testresult')],
- root_directory=self.root_directory).printPairs()
+ alignment_file=os.path.join(self.tempdir1, 'test_files',
+ 'testlinks'),
+ write=[os.path.join(self.tempdir1, 'test_files', 'testresult')],
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'testresult'),
- 'r') as f:
- self.assertEqual(f.read(), ''
- '\n\n\n \n\n \n<'
- '/cesAlign>\n')
+ 'r') as f:
+ self.assertEqual(
+ f.read(), ''
+ '\n\n\n \n\n \n<'
+ '/cesAlign>\n')
def test_use_given_sentence_alignment_file_and_print_links_Books(self):
OpusRead(directory='Books', source='eo', target='pt', maximum=1,
- write_mode='links',
- write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
- root_directory=self.root_directory).printPairs()
+ write_mode='links',
+ write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
+ root_directory=self.root_directory).printPairs()
var = pairPrinterToVariable(directory='Books', source='eo',
- target='pt', write_mode='links',
- alignment_file=os.path.join(self.tempdir1, 'test_files',
- 'testlinks'),
- root_directory=self.root_directory)
- self.assertTrue(''
- '\n\n\n \n\n \n\n' in var)
+ target='pt', write_mode='links',
+ alignment_file=os.path.join(self.tempdir1, 'test_files',
+ 'testlinks'),
+ root_directory=self.root_directory)
+ self.assertIn(''
+ '\n\n\n \n\n \n\n', var)
def test_use_given_sentence_alignment_file_and_write_links_Books(self):
OpusRead(directory='Books', source='eo', target='pt', maximum=1,
- write_mode='links',
- write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
- root_directory=self.root_directory).printPairs()
+ write_mode='links',
+ write=[os.path.join(self.tempdir1, 'test_files', 'testlinks')],
+ root_directory=self.root_directory).printPairs()
OpusRead(directory='Books', source='eo', target='pt',
- write_mode='links', alignment_file=os.path.join(self.tempdir1,
- 'test_files', 'testlinks'),
- write=[os.path.join(self.tempdir1, 'test_files', 'testresult')],
- root_directory=self.root_directory).printPairs()
+ write_mode='links', alignment_file=os.path.join(self.tempdir1,
+ 'test_files', 'testlinks'),
+ write=[os.path.join(self.tempdir1, 'test_files', 'testresult')],
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'testresult'),
- 'r') as f:
- self.assertEqual(f.read(), ''
- '\n\n\n \n\n \n\n')
+ 'r') as f:
+ self.assertEqual(
+ f.read(), ''
+ '\n\n\n \n\n \n\n')
def test_checks_first_whether_documents_are_in_path(self):
with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n'
@@ -1137,31 +1182,31 @@ def test_checks_first_whether_documents_are_in_path(self):
'test_en" toDoc="test_files/test_fi" >\n\n +\n')
with open(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n'
'\n\n test_en1\n test_en2'
'\n\n \n')
with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n \n'
'\n test_fi1\n test_fi2'
'\n\n \n')
var = pairPrinterToVariable(directory='Books', source='en',
- target='fi', alignment_file=os.path.join(self.tempdir1,
- 'test_files', 'testlinks'),
- download_dir=self.tempdir1)
- self.assertTrue(
+ target='fi', alignment_file=os.path.join(self.tempdir1,
+ 'test_files', 'testlinks'),
+ download_dir=self.tempdir1)
+ self.assertIn(
'\n# test_files/test_en\n# test_files/test_fi\n\n'
'================================\n(src)="s1">test_en1 test_en2\n'
'(trg)="s1">test_fi1 test_fi2'
- '\n================================\n' in var)
+ '\n================================\n', var)
def test_open_documents_from_specifed_zips(self):
with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n'
@@ -1169,40 +1214,40 @@ def test_open_documents_from_specifed_zips(self):
'test_en" toDoc="test_files/test_fi" >\n\n +\n')
with open(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n'
'\n\n test_en1\n test_en2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_en.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- arcname=os.path.join('test_files', 'test_en'))
+ arcname=os.path.join('test_files', 'test_en'))
with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n \n'
'\n test_fi1\n test_fi2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_fi.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- arcname=os.path.join('test_files', 'test_fi'))
+ arcname=os.path.join('test_files', 'test_fi'))
var = pairPrinterToVariable(directory='Books', source='en',
- target='fi', alignment_file=os.path.join(self.tempdir1,
- 'test_files', 'testlinks'),
- source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
- target_zip = os.path.join(self.tempdir1, 'test_fi.zip'))
- self.assertTrue(
+ target='fi', alignment_file=os.path.join(self.tempdir1,
+ 'test_files', 'testlinks'),
+ source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
+ target_zip = os.path.join(self.tempdir1, 'test_fi.zip'))
+ self.assertIn(
'\n# test_files/test_en\n# test_files/test_fi\n\n'
'================================\n(src)="s1">test_en1 test_en2\n'
'(trg)="s1">test_fi1 test_fi2'
- '\n================================\n' in var)
+ '\n================================\n', var)
def test_try_to_open_wrongly_named_docs_from_specifed_source_zip(self):
with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n'
@@ -1210,39 +1255,40 @@ def test_try_to_open_wrongly_named_docs_from_specifed_source_zip(self):
'test_en" toDoc="test_files/test_fi" >\n\n +\n')
with open(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n'
'\n\n test_en1\n test_en2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_en.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- arcname=os.path.join('test_files', 'test_un'))
+ arcname=os.path.join('test_files', 'test_un'))
with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n \n'
'\n test_fi1\n test_fi2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_fi.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- arcname=os.path.join('test_files', 'test_fi'))
-
- var = pairPrinterToVariable(directory='Books', source='en',
- target='fi', alignment_file=os.path.join(self.tempdir1,
- 'test_files', 'testlinks'),
- source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
- target_zip = os.path.join(self.tempdir1, 'test_fi.zip'))
+ arcname=os.path.join('test_files', 'test_fi'))
- self.assertTrue("\nThere is no item named 'test_files/test_en' "
- "in the archive '"+os.path.join(self.tempdir1, 'test_en.zip')+"'\n"
- "Continuing from next sentence file pair.\n" in var)
+ with mock.patch('sys.stderr', new=io.StringIO()) as output:
+ OpusRead(directory='Books', source='en',
+ target='fi', alignment_file=os.path.join(self.tempdir1,
+ 'test_files', 'testlinks'),
+ source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
+ target_zip = os.path.join(self.tempdir1, 'test_fi.zip')).printPairs()
+ var = output.getvalue()
+ self.assertIn("\nThere is no item named 'test_files/test_en' "
+ "in the archive '"+os.path.join(self.tempdir1, 'test_en.zip')+"'\n"
+ "Continuing from next sentence file pair.\n", var)
def test_try_to_open_wrongly_named_docs_from_specifed_target_zip(self):
with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n'
@@ -1250,39 +1296,40 @@ def test_try_to_open_wrongly_named_docs_from_specifed_target_zip(self):
'test_en" toDoc="test_files/test_fi" >\n\n +\n')
with open(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n'
'\n\n test_en1\n test_en2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_en.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- arcname=os.path.join('test_files', 'test_en'))
+ arcname=os.path.join('test_files', 'test_en'))
with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n \n'
'\n test_fi1\n test_fi2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_fi.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- arcname=os.path.join('test_files', 'test_un'))
-
- var = pairPrinterToVariable(directory='Books', source='en',
- target='fi', alignment_file=os.path.join(self.tempdir1,
- 'test_files', 'testlinks'),
- source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
- target_zip = os.path.join(self.tempdir1, 'test_fi.zip'))
+ arcname=os.path.join('test_files', 'test_un'))
- self.assertTrue("\nThere is no item named 'test_files/test_fi' "
- "in the archive '"+os.path.join(self.tempdir1, 'test_fi.zip')+"'\n"
- "Continuing from next sentence file pair.\n" in var)
+ with mock.patch('sys.stderr', new=io.StringIO()) as output:
+ OpusRead(directory='Books', source='en',
+ target='fi', alignment_file=os.path.join(self.tempdir1,
+ 'test_files', 'testlinks'),
+ source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
+ target_zip = os.path.join(self.tempdir1, 'test_fi.zip')).printPairs()
+ var = output.getvalue()
+ self.assertIn("\nThere is no item named 'test_files/test_fi' "
+ "in the archive '"+os.path.join(self.tempdir1, 'test_fi.zip')+"'\n"
+ "Continuing from next sentence file pair.\n", var)
def test_checks_first_whether_documents_are_in_path_gz(self):
with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n'
@@ -1290,171 +1337,190 @@ def test_checks_first_whether_documents_are_in_path_gz(self):
'test_en.gz" toDoc="test_files/test_fi.gz" >\n\n +\n')
with open(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n'
'\n\n test_en1\n test_en2'
'\n\n \n')
with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n \n'
'\n test_fi1\n test_fi2'
'\n\n \n')
with open(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- 'rb') as f:
+ 'rb') as f:
with gzip.open(os.path.join(self.tempdir1, 'test_files',
- 'test_en.gz'), 'wb') as gf:
+ 'test_en.gz'), 'wb') as gf:
shutil.copyfileobj(f, gf)
with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- 'rb') as f:
+ 'rb') as f:
with gzip.open(os.path.join(self.tempdir1,
- 'test_files', 'test_fi.gz'), 'wb') as gf:
+ 'test_files', 'test_fi.gz'), 'wb') as gf:
shutil.copyfileobj(f, gf)
var = pairPrinterToVariable(directory='Books', source='eo',
- target='pt', alignment_file=os.path.join(self.tempdir1,
- 'test_files', 'testlinks'), download_dir=self.tempdir1,
- root_directory=self.root_directory)
- self.assertTrue(
+ target='pt', alignment_file=os.path.join(self.tempdir1,
+ 'test_files', 'testlinks'), download_dir=self.tempdir1,
+ root_directory=self.root_directory)
+ self.assertIn(
'\n# test_files/test_en.gz\n# test_files/test_fi.gz\n\n'
'================================\n(src)="s1">test_en1 test_en2\n'
'(trg)="s1">test_fi1 test_fi2'
- '\n================================\n' in var)
+ '\n================================\n', var)
def test_filtering_by_src_cld2(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- release='v1', maximum=1, src_cld2=['en', '0.98'],
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- download_dir=self.tempdir1)
- self.assertTrue(
- '\n# en/1996.xml.gz\n'
- '# sv/1996.xml.gz\n'
- '\n================================'
- '\n(src)="s5.0">Mr. Sherlock Holmes'
- '\n(trg)="s5.0">Herra Sherlock Holmes'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ release='v1', maximum=1, src_cld2=['en', '0.98'],
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ download_dir=self.tempdir1).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz\n'
+ '# sv/1996.xml.gz\n'
+ '\n================================'
+ '\n(src)="s5.0">Mr. Sherlock Holmes'
+ '\n(trg)="s5.0">Herra Sherlock Holmes'
+ '\n================================\n', var)
def test_filtering_by_trg_cld2(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- release='v1', maximum=1, trg_cld2=['ia', '0'],
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- download_dir=self.tempdir1)
- self.assertTrue(
- '\n# en/1996.xml.gz\n'
- '# sv/1996.xml.gz\n'
- '\n================================'
- '\n(src)="s4">Chapter 1 Mr. Sherlock Holmes'
- '\n(trg)="s4">Herra Sherlock Holmes .'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ release='v1', maximum=1, trg_cld2=['ia', '0'],
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ download_dir=self.tempdir1).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz\n'
+ '# sv/1996.xml.gz\n'
+ '\n================================'
+ '\n(src)="s4">Chapter 1 Mr. Sherlock Holmes'
+ '\n(trg)="s4">Herra Sherlock Holmes .'
+ '\n================================\n', var)
def test_filtering_by_src_langid(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- release='v1', maximum=1, src_langid=['de', '0'],
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- download_dir=self.tempdir1)
- self.assertTrue(
- '\n# en/1996.xml.gz\n'
- '# sv/1996.xml.gz\n'
- '\n================================'
- '\n(src)="s167.0">" Excellent !'
- '\n(trg)="s167.0">" Erinomaista .'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ release='v1', maximum=1, src_langid=['de', '0'],
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ download_dir=self.tempdir1).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz\n'
+ '# sv/1996.xml.gz\n'
+ '\n================================'
+ '\n(src)="s167.0">" Excellent !'
+ '\n(trg)="s167.0">" Erinomaista .'
+ '\n================================\n', var)
def test_filtering_by_trg_langid(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- release='v1', maximum=1, trg_langid=['et', '0'],
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- download_dir=self.tempdir1)
- self.assertTrue(
- '\n# en/1996.xml.gz\n'
- '# sv/1996.xml.gz\n'
- '\n================================'
- '\n(src)="s4">Chapter 1 Mr. Sherlock Holmes'
- '\n(trg)="s4">Herra Sherlock Holmes .'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ release='v1', maximum=1, trg_langid=['et', '0'],
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ download_dir=self.tempdir1).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz\n'
+ '# sv/1996.xml.gz\n'
+ '\n================================'
+ '\n(src)="s4">Chapter 1 Mr. Sherlock Holmes'
+ '\n(trg)="s4">Herra Sherlock Holmes .'
+ '\n================================\n', var)
def test_filtering_by_lang_labels(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- release='v1', maximum=1, src_cld2=['un', '0'],
- trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'],
- trg_langid=['fi', '1'],
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- download_dir=self.tempdir1)
- self.assertTrue(
- '\n# en/1996.xml.gz\n'
- '# sv/1996.xml.gz\n'
- '\n================================'
- '\n(src)="s8.1">I believe'
- '\n(trg)="s8.1">Luulenpa että sinulla'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ release='v1', maximum=1, src_cld2=['un', '0'],
+ trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'],
+ trg_langid=['fi', '1'],
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ download_dir=self.tempdir1).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz\n'
+ '# sv/1996.xml.gz\n'
+ '\n================================'
+ '\n(src)="s8.1">I believe'
+ '\n(trg)="s8.1">Luulenpa että sinulla'
+ '\n================================\n', var)
def test_filtering_by_lang_labels_nonalphabetical_lang_order(self):
- var = pairPrinterToVariable(directory='RF', source='sv', target='en',
- release='v1', maximum=1, trg_cld2=['un', '0'],
- src_cld2=['fi', '0.97'], trg_langid=['en', '0.17'],
- src_langid=['fi', '1'],
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- download_dir=self.tempdir1)
- self.assertTrue(
- '\n# en/1996.xml.gz\n'
- '# sv/1996.xml.gz\n'
- '\n================================'
- '\n(src)="s8.1">Luulenpa että sinulla'
- '\n(trg)="s8.1">I believe'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='sv', target='en',
+ release='v1', maximum=1, trg_cld2=['un', '0'],
+ src_cld2=['fi', '0.97'], trg_langid=['en', '0.17'],
+ src_langid=['fi', '1'],
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ download_dir=self.tempdir1).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz\n'
+ '# sv/1996.xml.gz\n'
+ '\n================================'
+ '\n(src)="s8.1">Luulenpa että sinulla'
+ '\n(trg)="s8.1">I believe'
+ '\n================================\n', var)
def test_filtering_by_lang_labels_no_matches_found(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- release='v1', maximum=1, src_cld2=['fi', '2'],
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- download_dir=self.tempdir1)
- self.assertTrue(
- '\n# en/1996.xml.gz\n'
- '# sv/1996.xml.gz\n'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ release='v1', maximum=1, src_cld2=['fi', '2'],
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ download_dir=self.tempdir1).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz\n'
+ '# sv/1996.xml.gz\n'
+ '\n================================\n', var)
def test_filtering_by_src_cld2_print_links(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- release='v1', maximum=1, src_cld2=['en', '0.98'],
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- write_mode='links', download_dir=self.tempdir1)
- self.assertTrue(
- '\n\n' in var)
- self.assertTrue(
- ' \n'
- '\n \n\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ release='v1', maximum=1, src_cld2=['en', '0.98'],
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ write_mode='links', download_dir=self.tempdir1).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n\n', var)
+ self.assertIn(
+ ' \n'
+ '\n \n\n', var)
def test_filtering_by_lang_labels_print_links(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- release='v1', maximum=1, src_cld2=['un', '0'],
- trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'],
- trg_langid=['fi', '1'],
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- write_mode='links', download_dir=self.tempdir1)
- self.assertTrue(
- '\n\n' in var)
- self.assertTrue(
- ' \n'
- '\n \n\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ release='v1', maximum=1, src_cld2=['un', '0'],
+ trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'],
+ trg_langid=['fi', '1'],
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ write_mode='links', download_dir=self.tempdir1).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n\n', var)
+ self.assertIn(
+ ' \n'
+ '\n \n\n', var)
def test_filtering_by_lang_labels_write_links(self):
OpusRead(directory='RF', source='en', target='sv',
- release='v1', maximum=1, src_cld2=['un', '0'],
- trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'],
- trg_langid=['fi', '1'],
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- write=[os.path.join(self.tempdir1, 'test_files', 'result')],
- write_mode='links', download_dir=self.tempdir1).printPairs()
+ release='v1', maximum=1, src_cld2=['un', '0'],
+ trg_cld2=['fi', '0.97'], src_langid=['en', '0.17'],
+ trg_langid=['fi', '1'],
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ write=[os.path.join(self.tempdir1, 'test_files', 'result')],
+ write_mode='links', download_dir=self.tempdir1).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n\n\n'
@@ -1463,204 +1529,218 @@ def test_filtering_by_lang_labels_write_links(self):
'\n \n\n')
def test_use_given_zip_files(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1, source_zip=os.path.join(self.tempdir1, 'en.zip'),
- target_zip=os.path.join(self.tempdir1, 'sv.zip'),
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- root_directory=self.root_directory)
- self.assertTrue(
- '\n# en/1996.xml.gz'
- '\n# sv/1996.xml.gz'
- '\n\n================================'
- '\n(src)="s1">Source&<>"\' : manybooks.netAudiobook available here'
- '\n(trg)="s1">Source : Project Gutenberg'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ maximum=1, source_zip=os.path.join(self.tempdir1, 'en.zip'),
+ target_zip=os.path.join(self.tempdir1, 'sv.zip'),
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz'
+ '\n# sv/1996.xml.gz'
+ '\n\n================================'
+ '\n(src)="s1">Source&<>"\' : manybooks.netAudiobook available here'
+ '\n(trg)="s1">Source : Project Gutenberg'
+ '\n================================\n', var)
def test_use_given_zip_files_unalphabetical(self):
- var = pairPrinterToVariable(directory='RF', target='en', source='sv',
- maximum=1, target_zip=os.path.join(self.tempdir1, 'en.zip'),
- source_zip=os.path.join(self.tempdir1, 'sv.zip'),
- alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
- root_directory=self.root_directory)
- self.assertTrue(
- '\n# en/1996.xml.gz'
- '\n# sv/1996.xml.gz'
- '\n\n================================'
- '\n(src)="s1">Source : Project Gutenberg'
- '\n(trg)="s1">Source&<>"\' : manybooks.netAudiobook available here'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', target='en', source='sv',
+ maximum=1, target_zip=os.path.join(self.tempdir1, 'en.zip'),
+ source_zip=os.path.join(self.tempdir1, 'sv.zip'),
+ alignment_file=os.path.join(self.tempdir1, 'books_alignment.xml'),
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz'
+ '\n# sv/1996.xml.gz'
+ '\n\n================================'
+ '\n(src)="s1">Source : Project Gutenberg'
+ '\n(trg)="s1">Source&<>"\' : manybooks.netAudiobook available here'
+ '\n================================\n', var)
@mock.patch('opustools.opus_get.input', create=True)
def test_alignment_file_not_found(self, mocked_input):
mocked_input.side_effect = ['y', 'n']
pairPrinterToVariable(directory='RF', source='en', target='sv', maximum=1,
- alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz'),
- download_dir=self.tempdir1, root_directory=self.root_directory)
+ alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz'),
+ download_dir=self.tempdir1, root_directory=self.root_directory)
os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_en-sv.xml.gz'))
os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_en.zip'))
os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_sv.zip'))
with self.assertRaises(FileNotFoundError):
pairPrinterToVariable(directory='RF', source='en', target='sv',
- maximum=1,
- alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz'))
+ maximum=1,
+ alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz'))
def test_alignment_file_not_found_no_prompt(self):
opr = OpusRead(directory='RF', source='en', target='sv', maximum=1,
- alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz'),
- suppress_prompts=True, download_dir=self.tempdir1,
- root_directory=self.root_directory)
+ alignment_file=os.path.join(self.tempdir1, 'unfound.xml.gz'),
+ suppress_prompts=True, download_dir=self.tempdir1,
+ root_directory=self.root_directory)
opr.printPairs()
self.assertTrue(os.path.isfile(os.path.join(self.tempdir1,
- 'RF_latest_xml_en-sv.xml.gz')))
+ 'RF_latest_xml_en-sv.xml.gz')))
self.assertTrue(os.path.isfile(os.path.join(self.tempdir1,
- 'RF_latest_xml_en.zip')))
+ 'RF_latest_xml_en.zip')))
self.assertTrue(os.path.isfile(os.path.join(self.tempdir1,
- 'RF_latest_xml_sv.zip')))
+ 'RF_latest_xml_sv.zip')))
os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_en-sv.xml.gz'))
os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_en.zip'))
os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_sv.zip'))
def test_id_file_printing(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- attribute='certainty', threshold='1',
- write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
- root_directory=self.root_directory).printPairs()
+ attribute='certainty', threshold='1',
+ write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1,
- 'test_files', 'test.id')) as id_file:
+ 'test_files', 'test.id')) as id_file:
self.assertEqual(id_file.read(), 'en/1988.xml.gz\tsv/1988'
- '.xml.gz\ts3.2\ts3.2\t1.14214\n')
+ '.xml.gz\ts3.2\ts3.2\t1.14214\n')
def test_id_file_printing_unalphabetical(self):
OpusRead(directory='RF', source='sv', target='en', maximum=1,
- src_range='1', tgt_range='2', attribute='certainty',
- threshold='0.1',
- write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
- root_directory=self.root_directory).printPairs()
+ src_range='1', tgt_range='2', attribute='certainty',
+ threshold='0.1',
+ write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(
self.tempdir1, 'test_files', 'test.id')) as id_file:
self.assertEqual(id_file.read(), 'sv/1988.xml.gz\ten/1988'
- '.xml.gz\ts4.4\ts4.4 s4.5\t0.188136\n')
+ '.xml.gz\ts4.4\ts4.4 s4.5\t0.188136\n')
def test_id_file_printing_with_no_attribute(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
- root_directory=self.root_directory).printPairs()
+ write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(
self.tempdir1, 'test_files/test.id')) as id_file:
self.assertEqual(id_file.read(), 'en/1988.xml.gz\tsv/1988'
- '.xml.gz\ts1.1\ts1.1\tNone\n')
+ '.xml.gz\ts1.1\ts1.1\tNone\n')
def test_id_file_printing_with_attribute_no_threshold(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- attribute='certainty',
- write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
- root_directory=self.root_directory).printPairs()
+ attribute='certainty',
+ write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(
self.tempdir1, 'test_files/test.id')) as id_file:
self.assertEqual(id_file.read(), 'en/1988.xml.gz\tsv/1988'
- '.xml.gz\ts1.1\ts1.1\t-0.0636364\n')
+ '.xml.gz\ts1.1\ts1.1\t-0.0636364\n')
def test_id_file_printing_with_invalid_attribute(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- attribute='asfg',
- write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
- root_directory=self.root_directory).printPairs()
+ attribute='asfg',
+ write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(
- self.tempdir1, 'test_files/test.id')) as id_file:
+ self.tempdir1, 'test_files/test.id')) as id_file:
self.assertEqual(id_file.read(), 'en/1988.xml.gz\tsv/1988'
- '.xml.gz\ts1.1\ts1.1\tNone\n')
+ '.xml.gz\ts1.1\ts1.1\tNone\n')
def test_id_file_printing_with_only_threshold(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- threshold='0',
- write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
- root_directory=self.root_directory).printPairs()
+ threshold='0',
+ write_ids=os.path.join(self.tempdir1, 'test_files', 'test.id'),
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(
- self.tempdir1, 'test_files/test.id')) as id_file:
+ self.tempdir1, 'test_files/test.id')) as id_file:
self.assertEqual(id_file.read(), 'en/1988.xml.gz\tsv/1988'
- '.xml.gz\ts1.1\ts1.1\tNone\n')
+ '.xml.gz\ts1.1\ts1.1\tNone\n')
def test_writing_time_tags_xml(self):
- var = pairPrinterToVariable(directory='OpenSubtitles', source='eo',
- target='tl', maximum=1, preserve_inline_tags=True,
- root_directory=self.root_directory)
- self.assertTrue(
- '\n# eo/2009/1187043/6483790.xml.gz\n'
- '# tl/2009/1187043/6934998.xml.gz\n\n'
- '================================\n(src)="1"> Ĉiuj nomoj , roluloj kaj evento'
- 'j reprezentitaj en ĉi tiu filmo estas fikciaj .\n========'
- '========================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='OpenSubtitles', source='eo',
+ target='tl', maximum=1, preserve_inline_tags=True,
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# eo/2009/1187043/6483790.xml.gz\n'
+ '# tl/2009/1187043/6934998.xml.gz\n\n'
+ '================================\n(src)="1"> Ĉiuj nomoj , roluloj kaj evento'
+ 'j reprezentitaj en ĉi tiu filmo estas fikciaj .\n========'
+ '========================\n', var)
def test_writing_time_tags_raw(self):
- var = pairPrinterToVariable(directory='OpenSubtitles', source='eo',
- target='tl', maximum=1, preserve_inline_tags=True,
- preprocess='raw',
- root_directory=self.root_directory)
- self.assertTrue(
- '\n# eo/2009/1187043/6483790.xml.gz\n'
- '# tl/2009/1187043/6934998.xml.gz\n\n'
- '================================\n(src)="1"> Ĉiuj nomoj, roluloj kaj evento'
- 'j reprezentitaj en ĉi tiu filmo estas fikciaj.\n========'
- '========================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='OpenSubtitles', source='eo',
+ target='tl', maximum=1, preserve_inline_tags=True,
+ preprocess='raw',
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# eo/2009/1187043/6483790.xml.gz\n'
+ '# tl/2009/1187043/6934998.xml.gz\n\n'
+ '================================\n(src)="1"> Ĉiuj nomoj, roluloj kaj evento'
+ 'j reprezentitaj en ĉi tiu filmo estas fikciaj.\n========'
+ '========================\n', var)
def test_escape_characters_when_write_mode_tmx(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- release='v1', maximum=1, write_mode='tmx',
- download_dir=self.tempdir1,
- alignment_file=os.path.join(self.tempdir1,
- 'books_alignment.xml'))
- self.assertTrue(''
- '\n\n\n\t' in var)
- self.assertTrue(
- 'Source&<>"\' : '
- 'manybooks.netAudiobook available here'
- '\n\t\t\tSource : Project Gutenberg'
- '\n\t\t\n\t\n\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ release='v1', maximum=1, write_mode='tmx',
+ download_dir=self.tempdir1,
+ alignment_file=os.path.join(self.tempdir1,
+ 'books_alignment.xml')).printPairs()
+ var = output.getvalue()
+ self.assertIn(''
+ '\n\n\n\t', var)
+ self.assertIn(
+ 'Source&<>"\' : '
+ 'manybooks.netAudiobook available here'
+ '\n\t\t\tSource : Project Gutenberg'
+ '\n\t\t\n\t\n\n', var)
def test_open_predownloaded_alignment_file(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- release='v1', maximum=1, download_dir=self.tempdir1)
- self.assertTrue(
- '\n# en/1996.xml.gz'
- '\n# sv/1996.xml.gz'
- '\n\n================================'
- '\n(src)="s1">Source&<>"\' : manybooks.netAudiobook available here'
- '\n(trg)="s1">Source : Project Gutenberg'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ release='v1', maximum=1, download_dir=self.tempdir1).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz'
+ '\n# sv/1996.xml.gz'
+ '\n\n================================'
+ '\n(src)="s1">Source&<>"\' : manybooks.netAudiobook available here'
+ '\n(trg)="s1">Source : Project Gutenberg'
+ '\n================================\n', var)
def test_download_zip_files_no_prompt(self):
- var = pairPrinterToVariable(directory='RF', source='fr', target='sv',
- maximum=1, download_dir=self.tempdir1, suppress_prompts=True,
- root_directory=self.root_directory)
- self.assertTrue(
- '(src)="s1.1">Declaration de Politique Générale du '
- 'Gouvernement présentée mardi 4 octobre 1988 devant le '
- 'Riksdag par Monsieur Ingvar Carlsson , Premier Ministre .\n'
- '(trg)="s1.1">REGERINGSFÖRKLARING .\n'
- '================================\n' in var)
- os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_fr.zip'))
- os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_sv.zip'))
- os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_fr-sv.xml.gz'))
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='fr', target='sv',
+ maximum=1, download_dir=self.tempdir1, suppress_prompts=True,
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '(src)="s1.1">Declaration de Politique Générale du '
+ 'Gouvernement présentée mardi 4 octobre 1988 devant le '
+ 'Riksdag par Monsieur Ingvar Carlsson , Premier Ministre .\n'
+ '(trg)="s1.1">REGERINGSFÖRKLARING .\n'
+ '================================\n', var)
+ os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_fr.zip'))
+ os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_sv.zip'))
+ os.remove(os.path.join(self.tempdir1, 'RF_latest_xml_fr-sv.xml.gz'))
def test_zip_files_not_found_no_prompt(self):
with self.assertRaises(FileNotFoundError):
OpusRead(directory='RF', source='fi', target='sv',
- maximum=1, download_dir=self.tempdir1, suppress_prompts=True,
- root_directory=self.root_directory).printPairs()
+ maximum=1, download_dir=self.tempdir1, suppress_prompts=True,
+ root_directory=self.root_directory).printPairs()
with self.assertRaises(FileNotFoundError):
OpusRead(directory='RF', source='ab', target='cd',
- maximum=1, download_dir=self.tempdir1, suppress_prompts=True,
- root_directory=self.root_directory).printPairs()
+ maximum=1, download_dir=self.tempdir1, suppress_prompts=True,
+ root_directory=self.root_directory).printPairs()
def test_alignment_file_could_not_be_parsed(self):
with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n'
@@ -1668,35 +1748,35 @@ def test_alignment_file_could_not_be_parsed(self):
'test_en" toDoc="test_files/test_fi" >\n\n \n')
with open(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n'
'\n\n test_en1\n test_en2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_en.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- arcname=os.path.join('test_files', 'test_en'))
+ arcname=os.path.join('test_files', 'test_en'))
with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n \n'
'\n test_fi1\n test_fi2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_fi.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- arcname=os.path.join('test_files', 'test_fi'))
+ arcname=os.path.join('test_files', 'test_fi'))
with self.assertRaises(AlignmentParserError):
OpusRead(directory='Books', source='en',
- target='fi', alignment_file=os.path.join(self.tempdir1,
- 'test_files', 'testlinks'),
- source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
- target_zip = os.path.join(self.tempdir1, 'test_fi.zip')
- ).printPairs()
+ target='fi', alignment_file=os.path.join(self.tempdir1,
+ 'test_files', 'testlinks'),
+ source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
+ target_zip = os.path.join(self.tempdir1, 'test_fi.zip')
+ ).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n'
@@ -1705,15 +1785,15 @@ def test_alignment_file_could_not_be_parsed(self):
'"s1;s1"/>\n ')
with self.assertRaises(AlignmentParserError):
OpusRead(directory='Books', source='en',
- target='fi', alignment_file=os.path.join(self.tempdir1,
- 'test_files', 'testlinks'),
- source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
- target_zip = os.path.join(self.tempdir1, 'test_fi.zip')
- ).printPairs()
+ target='fi', alignment_file=os.path.join(self.tempdir1,
+ 'test_files', 'testlinks'),
+ source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
+ target_zip = os.path.join(self.tempdir1, 'test_fi.zip')
+ ).printPairs()
def test_sentence_file_could_not_be_parsed(self):
with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n'
@@ -1736,112 +1816,105 @@ def test_sentence_file_could_not_be_parsed(self):
'')
with open(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n'
'\n\n test_en1\n test_en2'
'\n\n \n')
with open(os.path.join(self.tempdir1, 'test_files', 'invalid_en'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n'
'\n\n test_en1\n test_en2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_en.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- arcname=os.path.join('test_files', 'test_en'))
+ arcname=os.path.join('test_files', 'test_en'))
zf.write(os.path.join(self.tempdir1, 'test_files', 'invalid_en'),
- arcname=os.path.join('test_files', 'invalid_en'))
+ arcname=os.path.join('test_files', 'invalid_en'))
with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n \n'
'\n test_fi1\n test_fi2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_fi.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- arcname=os.path.join('test_files', 'test_fi'))
+ arcname=os.path.join('test_files', 'test_fi'))
- var = pairPrinterToVariable(directory='Books', source='en',
+ with mock.patch('sys.stdout', new=io.StringIO()) as output, \
+ mock.patch('sys.stderr', new=io.StringIO()) as errors:
+ OpusRead(
+ directory='Books', source='en',
target='fi', alignment_file=os.path.join(self.tempdir1,
- 'test_files', 'testlinks'),
+ 'test_files', 'testlinks'),
source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
- target_zip = os.path.join(self.tempdir1, 'test_fi.zip'))
-
- self.assertTrue('\n# test_files/test_en\n'
- '# test_files/test_fi\n\n'
- '================================\n'
- '(src)="s1">test_en1 test_en2\n'
- '(trg)="s1">test_fi1 test_fi2\n'
- '================================' in var)
- self.assertTrue(
+ target_zip = os.path.join(self.tempdir1, 'test_fi.zip')).printPairs()
+ var = output.getvalue()
+ err = errors.getvalue()
+ self.assertIn('\n# test_files/test_en\n'
+ '# test_files/test_fi\n\n'
+ '================================\n'
+ '(src)="s1">test_en1 test_en2\n'
+ '(trg)="s1">test_fi1 test_fi2\n'
+ '================================', var)
+ self.assertIn(
'Error while parsing sentence file: Document '
"'test_files/invalid_en' could not be parsed: mismatched "
'tag: line 8, column 3\n'
- 'Continuing from next sentence file pair.' in var)
- self.assertTrue(
- '# test_files/test_en\n'
- '# test_files/test_fi\n\n'
- '================================\n'
- '(src)="s1">test_en1 test_en2\n'
- '(trg)="s1">test_fi1 test_fi2\n'
- '================================\n\n'
+ 'Continuing from next sentence file pair.', err)
+ self.assertIn(
"There is no item named 'test_files/no_file' in the archive "
"'"+os.path.join(self.tempdir1, 'test_en.zip')+"'\n"
- 'Continuing from next sentence file pair.' in var)
- self.assertTrue(
- '# test_files/test_en\n'
- '# test_files/test_fi\n\n'
- '================================\n'
- '(src)="s1">test_en1 test_en2\n'
- '(trg)="s1">test_fi1 test_fi2\n'
- '================================' in var)
-
+ 'Continuing from next sentence file pair.', err)
def test_leave_non_alignments_out(self):
- var = pairPrinterToVariable(directory='RF', target='en', source='sv',
- alignment_file=os.path.join(self.tempdir1, 'non_alignment.xml'),
- leave_non_alignments_out=True,
- root_directory=self.root_directory)
- self.assertTrue(
- '\n# en/1996.xml.gz'
- '\n# sv/1996.xml.gz'
- '\n\n================================' in var)
- self.assertTrue(
- '\n(src)="s5.0">'
- '\n(trg)="s5.0">'
- '\n================================\n' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', target='en', source='sv',
+ alignment_file=os.path.join(self.tempdir1, 'non_alignment.xml'),
+ leave_non_alignments_out=True,
+ root_directory=self.root_directory).printPairs()
+ var = output.getvalue()
+ self.assertIn(
+ '\n# en/1996.xml.gz'
+ '\n# sv/1996.xml.gz'
+ '\n\n================================', var)
+ self.assertIn(
+ '\n(src)="s5.0">'
+ '\n(trg)="s5.0">'
+ '\n================================\n', var)
def test_change_moses_delimiter(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- write=[os.path.join(self.tempdir1, 'test_files', 'test.src')],
- write_mode='moses', root_directory=self.root_directory,
- change_moses_delimiter=' ||| ').printPairs()
+ write=[os.path.join(self.tempdir1, 'test_files', 'test.src')],
+ write_mode='moses', root_directory=self.root_directory,
+ change_moses_delimiter=' ||| ').printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test.src'),
- 'r') as f:
+ 'r') as f:
self.assertEqual(f.read(),
- 'Statement of Government Policy by the Prime Minister , '
- 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
- 'ament on Tuesday , 4 October , 1988 . ||| REGERINGSFÖRK'
- 'LARING .\n')
+ 'Statement of Government Policy by the Prime Minister , '
+ 'Mr Ingvar Carlsson , at the Opening of the Swedish Parli'
+ 'ament on Tuesday , 4 October , 1988 . ||| REGERINGSFÖRK'
+ 'LARING .\n')
def test_change_annotation_delimiter(self):
OpusRead(directory='RF', source='en', target='sv', maximum=1,
- preprocess='parsed', print_annotations=True,
- source_annotations=['upos', 'feats', 'lemma'],
- target_annotations=['upos', 'feats', 'lemma'],
- change_annotation_delimiter='#',
- write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
- root_directory=self.root_directory).printPairs()
+ preprocess='parsed', print_annotations=True,
+ source_annotations=['upos', 'feats', 'lemma'],
+ target_annotations=['upos', 'feats', 'lemma'],
+ change_annotation_delimiter='#',
+ write=[os.path.join(self.tempdir1, 'test_files', 'test_result')],
+ root_directory=self.root_directory).printPairs()
with open(os.path.join(self.tempdir1, 'test_files', 'test_result'),
- 'r') as f:
- self.assertEqual(f.read(),
+ 'r') as f:
+ self.assertEqual(
+ f.read(),
'\n# en/1988.xml.gz\n# sv/1988.xml.gz\n\n'
'================================'
'\n(src)="s1.1">Statement#NOUN#Number=Sing#statement '
@@ -1864,7 +1937,7 @@ def test_change_annotation_delimiter(self):
def test_continue_after_empty_linkGrp(self):
with open(os.path.join(self.tempdir1, 'test_files', 'testlinks'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n'
@@ -1880,120 +1953,150 @@ def test_continue_after_empty_linkGrp(self):
'')
with open(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n'
'\n\n test_en1\n test_en2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_en.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_en'),
- arcname=os.path.join('test_files', 'test_en'))
+ arcname=os.path.join('test_files', 'test_en'))
with open(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- 'w') as f:
+ 'w') as f:
f.write(
'\n\n \n'
'\n test_fi1\n test_fi2'
'\n\n \n')
with zipfile.ZipFile(os.path.join(self.tempdir1, 'test_fi.zip'),
- 'w') as zf:
+ 'w') as zf:
zf.write(os.path.join(self.tempdir1, 'test_files', 'test_fi'),
- arcname=os.path.join('test_files', 'test_fi'))
+ arcname=os.path.join('test_files', 'test_fi'))
- var = pairPrinterToVariable(directory='Books', source='en',
- target='fi', alignment_file=os.path.join(self.tempdir1,
- 'test_files', 'testlinks'),
- source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
- target_zip = os.path.join(self.tempdir1, 'test_fi.zip'))
-
- self.assertTrue(
- '\n# test_files/test_en\n'
- '# test_files/test_fi\n\n'
- '================================\n'
- '(src)="s1">test_en1 test_en2\n'
- '(trg)="s1">test_fi1 test_fi2\n'
- '================================' in var)
- self.assertTrue(
- '# test_files/test_en\n'
- '# test_files/test_fi\n\n'
- '================================' in var)
- self.assertTrue(
- '# test_files/test_en\n'
- '# test_files/test_fi\n\n'
- '================================\n'
- '(src)="s1">test_en1 test_en2\n'
- '(trg)="s1">test_fi1 test_fi2\n'
- '================================')
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='Books', source='en',
+ target='fi', alignment_file=os.path.join(self.tempdir1,
+ 'test_files', 'testlinks'),
+ source_zip = os.path.join(self.tempdir1, 'test_en.zip'),
+ target_zip = os.path.join(self.tempdir1, 'test_fi.zip')).printPairs()
+ var = output.getvalue()
+
+ self.assertIn(
+ '\n# test_files/test_en\n'
+ '# test_files/test_fi\n\n'
+ '================================\n'
+ '(src)="s1">test_en1 test_en2\n'
+ '(trg)="s1">test_fi1 test_fi2\n'
+ '================================', var)
+ self.assertIn(
+ '# test_files/test_en\n'
+ '# test_files/test_fi\n\n'
+ '================================', var)
+ self.assertIn(
+ '# test_files/test_en\n'
+ '# test_files/test_fi\n\n'
+ '================================\n'
+ '(src)="s1">test_en1 test_en2\n'
+ '(trg)="s1">test_fi1 test_fi2\n'
+ '================================', var)
def test_get_documents_by_regex(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- root_directory=self.root_directory, n='19')
- self.assertTrue('# en/1988.xml.gz' in var)
- self.assertTrue('# en/1996.xml.gz' in var)
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- root_directory=self.root_directory, n='88')
- self.assertTrue('# en/1988.xml.gz' in var)
- self.assertTrue('# en/1996.xml.gz' not in var)
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- root_directory=self.root_directory, n='96')
- self.assertTrue('# en/1988.xml.gz' not in var)
- self.assertTrue('# en/1996.xml.gz' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ root_directory=self.root_directory, n='19').printPairs()
+ var = output.getvalue()
+ self.assertIn('# en/1988.xml.gz', var)
+ self.assertIn('# en/1996.xml.gz', var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ root_directory=self.root_directory, n='88').printPairs()
+ var = output.getvalue()
+ self.assertIn('# en/1988.xml.gz', var)
+ self.assertNotIn('# en/1996.xml.gz', var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ root_directory=self.root_directory, n='96').printPairs()
+ var = output.getvalue()
+ self.assertNotIn('# en/1988.xml.gz', var)
+ self.assertIn('# en/1996.xml.gz', var)
def test_skip_documents_by_regex(self):
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- root_directory=self.root_directory, N='19')
- self.assertTrue('# en/1988.xml.gz' not in var)
- self.assertTrue('# en/1996.xml.gz' not in var)
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- root_directory=self.root_directory, N='88')
- self.assertTrue('# en/1988.xml.gz' not in var)
- self.assertTrue('# en/1996.xml.gz' in var)
- var = pairPrinterToVariable(directory='RF', source='en', target='sv',
- root_directory=self.root_directory, N='96')
- self.assertTrue('# en/1988.xml.gz' in var)
- self.assertTrue('# en/1996.xml.gz' not in var)
-
- def test_moses_preprocessing(self):
- var = preMosesToVariable(directory='RF', source='en', target='sv',
- download_dir=self.tempdir1, suppress_prompts=True,
- root_directory=self.root_directory, preprocess='moses')
- self.assertTrue('Moses files written to ' in var)
- self.assertTrue('RF.en-sv.en' in var)
- self.assertTrue('RF.en-sv.sv' in var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ root_directory=self.root_directory, N='19').printPairs()
+ var = output.getvalue()
+ self.assertNotIn('# en/1988.xml.gz', var)
+ self.assertNotIn('# en/1996.xml.gz', var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ root_directory=self.root_directory, N='88').printPairs()
+ var = output.getvalue()
+ self.assertNotIn('# en/1988.xml.gz', var)
+ self.assertIn('# en/1996.xml.gz', var)
+ with mock.patch('sys.stdout', new=io.StringIO()) as output:
+ OpusRead(directory='RF', source='en', target='sv',
+ root_directory=self.root_directory, N='96').printPairs()
+ var = output.getvalue()
+ self.assertIn('# en/1988.xml.gz', var)
+ self.assertNotIn('# en/1996.xml.gz', var)
+
+ def test_moses_preprocessing_defaultnames(self):
+ with self.assertLogs('opustools.opus_read', level=logging.INFO) as cm:
+ OpusRead(directory='RF', source='en', target='sv',
+ download_dir=self.tempdir1, suppress_prompts=True,
+ root_directory=self.root_directory, preprocess='moses').printPairs()
+ full_log = '\n'.join(cm.output)
+ logging.warning(full_log)
+ self.assertIn('Moses files written to ', full_log)
+ self.assertIn('RF.en-sv.en', full_log)
+ self.assertIn('RF.en-sv.sv', full_log)
+ logging.warning(os.listdir(self.tempdir1))
with open(os.path.join(self.tempdir1, 'RF.en-sv.en')) as moses_en:
self.assertEqual(moses_en.readlines()[0][:41], 'Statement of Government Policy by the Pri')
with open(os.path.join(self.tempdir1, 'RF.en-sv.sv')) as moses_sv:
self.assertEqual(moses_sv.readlines()[0][:41], 'REGERINGSFÖRKLARING.\n')
- var = preMosesToVariable(directory='RF', source='en', target='sv',
- download_dir=self.tempdir1, suppress_prompts=True,
- root_directory=self.root_directory, preprocess='moses',
- write=['rf.moses'])
- self.assertTrue('"moses" preprocessing requires two output file '
- 'names. Using default names.\nMoses files written to ' in var)
- self.assertTrue('RF.en-sv.en' in var)
- self.assertTrue('RF.en-sv.sv' in var)
-
- var = preMosesToVariable(directory='RF', source='en', target='sv',
- download_dir=self.tempdir1, suppress_prompts=True,
- root_directory=self.root_directory, preprocess='moses',
- write=['rf.moses.en', 'rf.moses.sv'])
- self.assertTrue('Moses files written to ' in var)
- self.assertTrue('rf.moses.en' in var)
- self.assertTrue('rf.moses.sv' in var)
-
- var = preMosesToVariable(directory='RF', source='sv', target='en',
- download_dir=self.tempdir1, suppress_prompts=True,
- root_directory=self.root_directory, preprocess='moses',
- write=['rf.moses.sv', 'rf.moses.en'])
+ def test_moses_preprocessing_singlefile(self):
+ with self.assertLogs('opustools.opus_read', level=logging.INFO) as cm:
+ OpusRead(directory='RF', source='en', target='sv',
+ download_dir=self.tempdir1, suppress_prompts=True,
+ root_directory=self.root_directory, preprocess='moses',
+ write=[os.path.join(self.tempdir1, 'rf.moses')]).printPairs()
+ full_log = '\n'.join(cm.output)
+ self.assertIn('"moses" preprocessing requires two output file '
+ 'names. Using default names.', full_log)
+ self.assertIn('Moses files written to ', full_log)
+ self.assertIn('RF.en-sv.en', full_log)
+ self.assertIn('RF.en-sv.sv', full_log)
+
+ def test_moses_preprocessing_twofiles(self):
+ with self.assertLogs('opustools.opus_read', level=logging.INFO) as cm:
+ OpusRead(directory='RF', source='en', target='sv',
+ download_dir=self.tempdir1, suppress_prompts=True,
+ root_directory=self.root_directory, preprocess='moses',
+ write=[os.path.join(self.tempdir1, 'rf.moses.en'),
+ os.path.join(self.tempdir1, 'rf.moses.sv')]).printPairs()
+ full_log = '\n'.join(cm.output)
+ self.assertIn('Moses files written to ', full_log)
+ self.assertIn('rf.moses.en', full_log)
+ self.assertIn('rf.moses.sv', full_log)
+
+ def test_moses_preprocessing_switchlangs(self):
+ OpusRead(directory='RF', source='sv', target='en',
+ download_dir=self.tempdir1, suppress_prompts=True,
+ root_directory=self.root_directory, preprocess='moses',
+ write=[os.path.join(self.tempdir1, 'rf.moses.sv'),
+ os.path.join(self.tempdir1, 'rf.moses.en')]).printPairs()
+ logging.warning(os.listdir(self.tempdir1))
+ logging.warning(os.listdir('.'))
with open(os.path.join(self.tempdir1, 'rf.moses.en')) as moses_en:
self.assertEqual(moses_en.readlines()[0][:41], 'Statement of Government Policy by the Pri')
with open(os.path.join(self.tempdir1, 'rf.moses.sv')) as moses_sv:
self.assertEqual(moses_sv.readlines()[0][:41], 'REGERINGSFÖRKLARING.\n')
+
if __name__ == '__main__':
unittest.main()
-