Skip to content

Commit

Permalink
Add YAPF for Python style check (#8)
Browse files: browse the repository at this point in the history
- Add a GitHub workflow to check the coding style.
- Fix some existing code to align with the Google open source style guide.
  • Loading branch information
tushuhei authored Nov 24, 2021
1 parent cbedadb commit dedaafc
Show file tree
Hide file tree
Showing 11 changed files with 316 additions and 265 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/style-check.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
name: Style Check
on: [push]
jobs:
python-style-check:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: '3.7'
- run: python -m pip install --upgrade yapf
- run: yapf --diff --recursive budoux tests
135 changes: 70 additions & 65 deletions budoux/feature_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import os
from .utils import SEP, Result


with open(os.path.join(os.path.dirname(__file__), 'unicode_blocks.json')) as f:
block_starts: list[int] = json.load(f)

Expand All @@ -36,8 +35,8 @@ def unicode_block_index(w: str):
return bisect.bisect_right(block_starts, ord(w))


def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str,
p1: str, p2: str, p3: str):
def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
p2: str, p3: str):
"""Generates a feature from characters around (w1-6) and past results (p1-3).
Args:
Expand All @@ -62,59 +61,69 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str,
b5 = '%03d' % (unicode_block_index(w5)) if w5 != '' else '999'
b6 = '%03d' % (unicode_block_index(w6)) if w6 != '' else '999'
raw_feature = {
'UP1': p1,
'UP2': p2,
'UP3': p3,
'BP1': p1 + p2,
'BP2': p2 + p3,
'UW1': w1,
'UW2': w2,
'UW3': w3,
'UW4': w4,
'UW5': w5,
'UW6': w6,
'BW1': w2 + w3,
'BW2': w3 + w4,
'BW3': w4 + w5,
'TW1': w1 + w2 + w3,
'TW2': w2 + w3 + w4,
'TW3': w3 + w4 + w5,
'TW4': w4 + w5 + w6,
'UB1': b1,
'UB2': b2,
'UB3': b3,
'UB4': b4,
'UB5': b5,
'UB6': b6,
'BB1': b2 + b3,
'BB2': b3 + b4,
'BB3': b4 + b5,
'TB1': b1 + b2 + b3,
'TB2': b2 + b3 + b4,
'TB3': b3 + b4 + b5,
'TB4': b4 + b5 + b6,
'UQ1': p1 + b1,
'UQ2': p2 + b2,
'UQ3': p3 + b3,
'BQ1': p2 + b2 + b3,
'BQ2': p2 + b3 + b4,
'BQ3': p3 + b2 + b3,
'BQ4': p3 + b3 + b4,
'TQ1': p2 + b1 + b2 + b3,
'TQ2': p2 + b2 + b3 + b4,
'TQ3': p3 + b1 + b2 + b3,
'TQ4': p3 + b2 + b3 + b4,
'UP1': p1,
'UP2': p2,
'UP3': p3,
'BP1': p1 + p2,
'BP2': p2 + p3,
'UW1': w1,
'UW2': w2,
'UW3': w3,
'UW4': w4,
'UW5': w5,
'UW6': w6,
'BW1': w2 + w3,
'BW2': w3 + w4,
'BW3': w4 + w5,
'TW1': w1 + w2 + w3,
'TW2': w2 + w3 + w4,
'TW3': w3 + w4 + w5,
'TW4': w4 + w5 + w6,
'UB1': b1,
'UB2': b2,
'UB3': b3,
'UB4': b4,
'UB5': b5,
'UB6': b6,
'BB1': b2 + b3,
'BB2': b3 + b4,
'BB3': b4 + b5,
'TB1': b1 + b2 + b3,
'TB2': b2 + b3 + b4,
'TB3': b3 + b4 + b5,
'TB4': b4 + b5 + b6,
'UQ1': p1 + b1,
'UQ2': p2 + b2,
'UQ3': p3 + b3,
'BQ1': p2 + b2 + b3,
'BQ2': p2 + b3 + b4,
'BQ3': p3 + b2 + b3,
'BQ4': p3 + b3 + b4,
'TQ1': p2 + b1 + b2 + b3,
'TQ2': p2 + b2 + b3 + b4,
'TQ3': p3 + b1 + b2 + b3,
'TQ4': p3 + b2 + b3 + b4,
}
if raw_feature['UW4'] == '': del raw_feature['UW4']
if raw_feature['UW5'] == '': del raw_feature['UW5']
if raw_feature['UW6'] == '': del raw_feature['UW6']
if raw_feature['BW3'] == '': del raw_feature['BW3']
if raw_feature['TW4'] == '': del raw_feature['TW4']
if raw_feature['UB4'] == '999': del raw_feature['UB4']
if raw_feature['UB5'] == '999': del raw_feature['UB5']
if raw_feature['UB6'] == '999': del raw_feature['UB6']
if raw_feature['BB3'] == '999999': del raw_feature['BB3']
if raw_feature['TB4'] == '999999999': del raw_feature['TB4']
if raw_feature['UW4'] == '':
del raw_feature['UW4']
if raw_feature['UW5'] == '':
del raw_feature['UW5']
if raw_feature['UW6'] == '':
del raw_feature['UW6']
if raw_feature['BW3'] == '':
del raw_feature['BW3']
if raw_feature['TW4'] == '':
del raw_feature['TW4']
if raw_feature['UB4'] == '999':
del raw_feature['UB4']
if raw_feature['UB5'] == '999':
del raw_feature['UB5']
if raw_feature['UB6'] == '999':
del raw_feature['UB6']
if raw_feature['BB3'] == '999999':
del raw_feature['BB3']
if raw_feature['TB4'] == '999999999':
del raw_feature['TB4']
return [f'{item[0]}:{item[1]}' for item in raw_feature.items()]


Expand All @@ -134,19 +143,16 @@ def process(source_filename: str, entries_filename: str):
chunks = row.strip().split(SEP)
chunk_lengths = [len(chunk) for chunk in chunks]
sep_indices = set(itertools.accumulate(chunk_lengths, lambda x, y: x + y))
sentence = ''.join(chunks)
sentence = ''.join(chunks)
p1 = Result.UNKNOWN.value
p2 = Result.UNKNOWN.value
p3 = Result.UNKNOWN.value
for i in range(3, len(sentence) + 1):
feature = get_feature(
sentence[i - 3],
sentence[i - 2],
sentence[i - 1],
sentence[i] if i < len(sentence) else '',
sentence[i + 1] if i + 1 < len(sentence) else '',
sentence[i + 2] if i + 2 < len(sentence) else '',
p1, p2, p3)
feature = get_feature(sentence[i - 3], sentence[i - 2], sentence[i - 1],
sentence[i] if i < len(sentence) else '',
sentence[i + 1] if i + 1 < len(sentence) else '',
sentence[i + 2] if i + 2 < len(sentence) else '',
p1, p2, p3)
positive = i in sep_indices
p = Result.POSITIVE.value if positive else Result.NEGATIVE.value
with open(entries_filename, 'a') as f:
Expand All @@ -155,4 +161,3 @@ def process(source_filename: str, entries_filename: str):
p1 = p2
p2 = p3
p3 = p

27 changes: 14 additions & 13 deletions budoux/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
with open(os.path.join(os.path.dirname(__file__), 'skip_nodes.json')) as f:
SKIP_NODES: typing.Set[str] = set(json.load(f))

HTMLAttr = typing.List[typing.Tuple[str, typing.Union[str, None]]]


class TextContentExtractor(HTMLParser):
"""An HTML parser to extract text content.
Expand Down Expand Up @@ -56,8 +59,7 @@ def __init__(self, chunks: typing.List[str]):
self.chunks_joined = SEP.join(chunks)
self.to_skip = False

def handle_starttag(self, tag: str,
attrs: typing.List[typing.Tuple[str, typing.Union[str, None]]]):
def handle_starttag(self, tag: str, attrs: HTMLAttr):
attr_pairs = []
for attr in attrs:
if attr[1] is None:
Expand All @@ -78,7 +80,7 @@ def handle_data(self, data: str):
if self.chunks_joined[0] == SEP:
self.chunks_joined = self.chunks_joined[1 + len(data):]
else:
self.chunks_joined = self.chunks_joined[len(data):]
self.chunks_joined = self.chunks_joined[len(data):]
return
for char in data:
if char == self.chunks_joined[0]:
Expand Down Expand Up @@ -117,23 +119,22 @@ def parse(self, sentence: str, thres: int = 1000):
Returns:
A list of semantic chunks (List[str]).
"""
if sentence == '': return []
if sentence == '':
return []
p1 = Result.UNKNOWN.value
p2 = Result.UNKNOWN.value
p3 = Result.UNKNOWN.value
chunks = [sentence[:3]]
for i in range(3, len(sentence)):
feature = get_feature(
sentence[i - 3],
sentence[i - 2],
sentence[i - 1],
sentence[i],
sentence[i + 1] if i + 1 < len(sentence) else '',
sentence[i + 2] if i + 2 < len(sentence) else '',
p1, p2, p3)
feature = get_feature(sentence[i - 3], sentence[i - 2], sentence[i - 1],
sentence[i],
sentence[i + 1] if i + 1 < len(sentence) else '',
sentence[i + 2] if i + 2 < len(sentence) else '',
p1, p2, p3)
score = 0
for f in feature:
if not f in self.model: continue
if not f in self.model:
continue
score += self.model[f]
if score > thres:
chunks.append(sentence[i])
Expand Down
5 changes: 2 additions & 3 deletions budoux/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,12 @@

from enum import Enum


"""The separator string to specify breakpoints."""
SEP = '▁'
"""The separator string to specify breakpoints."""


"""An enum to represent the type of inference result."""
class Result(Enum):
"""An enum to represent the type of inference result."""
UNKNOWN = 'U'
POSITIVE = 'B'
NEGATIVE = 'O'
1 change: 1 addition & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ numpy
html5lib
pytest
pytest-runner
yapf
5 changes: 4 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@ classifiers =
Programming Language :: Python :: 3.8

[aliases]
test=pytest
test = pytest

[options]
packages = find:
include_package_data = True

[yapf]
based_on_style = yapf
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@
# limitations under the License.

from setuptools import setup

setup()
3 changes: 2 additions & 1 deletion tests/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@

import os
import sys

LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
sys.path.insert(0, os.path.abspath(LIB_PATH))
from budoux import feature_extractor
from budoux import parser
from budoux import utils
from scripts import train
from scripts import train
Loading

0 comments on commit dedaafc

Please sign in to comment.