Skip to content

Commit

Permalink
Update the Prepare KNBC script to break chunks by specified sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
tushuhei authored Mar 1, 2023
1 parent 0324430 commit 4d300f1
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 19 deletions.
47 changes: 28 additions & 19 deletions scripts/prepare_knbc.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,18 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Loads the KNBC corpus to generate training data."""
"""Prepares a dataset from the KNBC corpus.
Before running this script, you need to download the KNBC corpus by running:
$ curl -o knbc.tar.bz2 https://nlp.ist.i.kyoto-u.ac.jp/kuntt/KNBC_v1.0_090925_utf8.tar.bz2
$ tar -xf knbc.tar.bz2
Now you should have a directory named `KNBC_v1.0_090925_utf8`.
Run the following to generate a dataset named `source_knbc.txt`.
$ python scripts/prepare_knbc.py KNBC_v1.0_090925_utf8 -o source_knbc.txt
"""

import argparse
import os
Expand Down Expand Up @@ -55,25 +66,21 @@ def handle_data(self, data: str) -> None:
self.current_word = data


def break_before_sequence(chunks: typing.List[str],
                          sequence: str) -> typing.List[str]:
  """Breaks chunks before a specified character sequence appears.

  Every chunk is scanned from left to right for non-overlapping occurrences of
  `sequence`, and a new chunk is started at each occurrence. The text is split
  directly, without joining the chunks around a separator sentinel, so chunks
  are never broken anywhere except in front of `sequence`.

  Args:
    chunks (List[str]): Chunks to break.
    sequence (str): A character sequence to break chunks before. An empty
      sequence matches nothing, so the chunks are left unbroken.

  Returns:
    Processed chunks. Empty chunks are dropped.
  """
  if not sequence:
    # There is nothing to break before; only drop the empty chunks.
    return [chunk for chunk in chunks if chunk]

  out: typing.List[str] = []
  for chunk in chunks:
    start = 0
    hit = chunk.find(sequence)
    while hit != -1:
      # A hit at the very beginning of the pending piece needs no break.
      if hit > start:
        out.append(chunk[start:hit])
      start = hit
      # Resume after the whole match so that overlapping occurrences are
      # skipped, in the same way `str.replace` would treat them.
      hit = chunk.find(sequence, hit + len(sequence))
    if start < len(chunk):
      out.append(chunk[start:])
  return out


def postprocess(chunks: typing.List[str]) -> typing.List[str]:
Expand All @@ -85,19 +92,21 @@ def postprocess(chunks: typing.List[str]) -> typing.List[str]:
Returns:
Processed chunks.
"""
chunks = break_before_open_parentheses(chunks)
chunks = break_before_sequence(chunks, '(')
chunks = break_before_sequence(chunks, 'もら')
return chunks


def parse_args(
    argv: typing.Optional[typing.Sequence[str]] = None) -> argparse.Namespace:
  """Parses the command-line arguments of this script.

  Args:
    argv (Optional[Sequence[str]]): Arguments to parse. When omitted, argparse
      falls back to the arguments of the running process.

  Returns:
    The parsed arguments, exposing `source_dir` and `outfile`.
  """
  default_out_path = 'source.txt'
  # RawTextHelpFormatter keeps the line breaks of the module docstring, which
  # lists shell commands, when it is shown as the help description.
  parser = argparse.ArgumentParser(
      description=__doc__, formatter_class=argparse.RawTextHelpFormatter)
  parser.add_argument('source_dir', help='Path to the KNBC corpus directory.')
  parser.add_argument(
      '-o',
      '--outfile',
      help=f'File path to the output dataset. (default: {default_out_path})',
      default=default_out_path)
  return parser.parse_args(argv)


Expand Down
42 changes: 42 additions & 0 deletions scripts/tests/test_prepare_knbc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests the prepare KNBC script."""

import os
import sys
import unittest

# module hack
LIB_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))

from scripts import prepare_knbc # type: ignore # noqa (module hack)


class TestBreakBeforeSequence(unittest.TestCase):
  """Checks how `prepare_knbc.break_before_sequence` splits chunks."""

  def test_standard(self) -> None:
    # A hit in the middle of a chunk splits that chunk in front of the hit.
    actual = prepare_knbc.break_before_sequence(['abcdef', 'ghi'], 'de')
    self.assertListEqual(actual, ['abc', 'def', 'ghi'])

  def test_sequence_on_top(self) -> None:
    # A hit at the head of a chunk must not produce an extra break.
    actual = prepare_knbc.break_before_sequence(['abcdef', 'ghi'], 'gh')
    self.assertListEqual(actual, ['abcdef', 'ghi'])

  def test_multiple_hit(self) -> None:
    # Every occurrence inside a single chunk starts a new chunk.
    actual = prepare_knbc.break_before_sequence(['abcabc', 'def'], 'bc')
    self.assertListEqual(actual, ['a', 'bca', 'bc', 'def'])

0 comments on commit 4d300f1

Please sign in to comment.