Add Yapf for Python style check (#8)

- Add a GitHub workflow to check the coding style.
- Fix some existing code to align with the Google open source style guide.
google · Nov 24, 2021 · dedaafc · dedaafc
1 parent cbedadb
commit dedaafc
Show file tree

Hide file tree

Showing 11 changed files with 316 additions and 265 deletions.
diff --git a/.github/workflows/style-check.yml b/.github/workflows/style-check.yml
@@ -0,0 +1,12 @@
+name: Style Check
+on: [push]
+jobs:
+  python-style-check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.7'
+      - run: python -m pip install --upgrade yapf
+      - run: yapf --diff --recursive budoux tests
diff --git a/budoux/feature_extractor.py b/budoux/feature_extractor.py
@@ -19,7 +19,6 @@
 import os
 from .utils import SEP, Result
 
-
 with open(os.path.join(os.path.dirname(__file__), 'unicode_blocks.json')) as f:
   block_starts: list[int] = json.load(f)
 
@@ -36,8 +35,8 @@ def unicode_block_index(w: str):
   return bisect.bisect_right(block_starts, ord(w))
 
 
-def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str,
-                p1: str, p2: str, p3: str):
+def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str, p1: str,
+                p2: str, p3: str):
   """Generates a feature from characters around (w1-6) and past results (p1-3).
 
   Args:
@@ -62,59 +61,69 @@ def get_feature(w1: str, w2: str, w3: str, w4: str, w5: str, w6: str,
   b5 = '%03d' % (unicode_block_index(w5)) if w5 != '' else '999'
   b6 = '%03d' % (unicode_block_index(w6)) if w6 != '' else '999'
   raw_feature = {
-    'UP1': p1,
-    'UP2': p2,
-    'UP3': p3,
-    'BP1': p1 + p2,
-    'BP2': p2 + p3,
-    'UW1': w1,
-    'UW2': w2,
-    'UW3': w3,
-    'UW4': w4,
-    'UW5': w5,
-    'UW6': w6,
-    'BW1': w2 + w3,
-    'BW2': w3 + w4,
-    'BW3': w4 + w5,
-    'TW1': w1 + w2 + w3,
-    'TW2': w2 + w3 + w4,
-    'TW3': w3 + w4 + w5,
-    'TW4': w4 + w5 + w6,
-    'UB1': b1,
-    'UB2': b2,
-    'UB3': b3,
-    'UB4': b4,
-    'UB5': b5,
-    'UB6': b6,
-    'BB1': b2 + b3,
-    'BB2': b3 + b4,
-    'BB3': b4 + b5,
-    'TB1': b1 + b2 + b3,
-    'TB2': b2 + b3 + b4,
-    'TB3': b3 + b4 + b5,
-    'TB4': b4 + b5 + b6,
-    'UQ1': p1 + b1,
-    'UQ2': p2 + b2,
-    'UQ3': p3 + b3,
-    'BQ1': p2 + b2 + b3,
-    'BQ2': p2 + b3 + b4,
-    'BQ3': p3 + b2 + b3,
-    'BQ4': p3 + b3 + b4,
-    'TQ1': p2 + b1 + b2 + b3,
-    'TQ2': p2 + b2 + b3 + b4,
-    'TQ3': p3 + b1 + b2 + b3,
-    'TQ4': p3 + b2 + b3 + b4,
+      'UP1': p1,
+      'UP2': p2,
+      'UP3': p3,
+      'BP1': p1 + p2,
+      'BP2': p2 + p3,
+      'UW1': w1,
+      'UW2': w2,
+      'UW3': w3,
+      'UW4': w4,
+      'UW5': w5,
+      'UW6': w6,
+      'BW1': w2 + w3,
+      'BW2': w3 + w4,
+      'BW3': w4 + w5,
+      'TW1': w1 + w2 + w3,
+      'TW2': w2 + w3 + w4,
+      'TW3': w3 + w4 + w5,
+      'TW4': w4 + w5 + w6,
+      'UB1': b1,
+      'UB2': b2,
+      'UB3': b3,
+      'UB4': b4,
+      'UB5': b5,
+      'UB6': b6,
+      'BB1': b2 + b3,
+      'BB2': b3 + b4,
+      'BB3': b4 + b5,
+      'TB1': b1 + b2 + b3,
+      'TB2': b2 + b3 + b4,
+      'TB3': b3 + b4 + b5,
+      'TB4': b4 + b5 + b6,
+      'UQ1': p1 + b1,
+      'UQ2': p2 + b2,
+      'UQ3': p3 + b3,
+      'BQ1': p2 + b2 + b3,
+      'BQ2': p2 + b3 + b4,
+      'BQ3': p3 + b2 + b3,
+      'BQ4': p3 + b3 + b4,
+      'TQ1': p2 + b1 + b2 + b3,
+      'TQ2': p2 + b2 + b3 + b4,
+      'TQ3': p3 + b1 + b2 + b3,
+      'TQ4': p3 + b2 + b3 + b4,
   }
-  if raw_feature['UW4'] == '': del raw_feature['UW4']
-  if raw_feature['UW5'] == '': del raw_feature['UW5']
-  if raw_feature['UW6'] == '': del raw_feature['UW6']
-  if raw_feature['BW3'] == '': del raw_feature['BW3']
-  if raw_feature['TW4'] == '': del raw_feature['TW4']
-  if raw_feature['UB4'] == '999': del raw_feature['UB4']
-  if raw_feature['UB5'] == '999': del raw_feature['UB5']
-  if raw_feature['UB6'] == '999': del raw_feature['UB6']
-  if raw_feature['BB3'] == '999999': del raw_feature['BB3']
-  if raw_feature['TB4'] == '999999999': del raw_feature['TB4']
+  if raw_feature['UW4'] == '':
+    del raw_feature['UW4']
+  if raw_feature['UW5'] == '':
+    del raw_feature['UW5']
+  if raw_feature['UW6'] == '':
+    del raw_feature['UW6']
+  if raw_feature['BW3'] == '':
+    del raw_feature['BW3']
+  if raw_feature['TW4'] == '':
+    del raw_feature['TW4']
+  if raw_feature['UB4'] == '999':
+    del raw_feature['UB4']
+  if raw_feature['UB5'] == '999':
+    del raw_feature['UB5']
+  if raw_feature['UB6'] == '999':
+    del raw_feature['UB6']
+  if raw_feature['BB3'] == '999999':
+    del raw_feature['BB3']
+  if raw_feature['TB4'] == '999999999':
+    del raw_feature['TB4']
   return [f'{item[0]}:{item[1]}' for item in raw_feature.items()]
 
 
@@ -134,19 +143,16 @@ def process(source_filename: str, entries_filename: str):
     chunks = row.strip().split(SEP)
     chunk_lengths = [len(chunk) for chunk in chunks]
     sep_indices = set(itertools.accumulate(chunk_lengths, lambda x, y: x + y))
-    sentence  = ''.join(chunks)
+    sentence = ''.join(chunks)
     p1 = Result.UNKNOWN.value
     p2 = Result.UNKNOWN.value
     p3 = Result.UNKNOWN.value
     for i in range(3, len(sentence) + 1):
-      feature = get_feature(
-        sentence[i - 3],
-        sentence[i - 2],
-        sentence[i - 1],
-        sentence[i]     if i     < len(sentence) else '',
-        sentence[i + 1] if i + 1 < len(sentence) else '',
-        sentence[i + 2] if i + 2 < len(sentence) else '',
-        p1, p2, p3)
+      feature = get_feature(sentence[i - 3], sentence[i - 2], sentence[i - 1],
+                            sentence[i] if i < len(sentence) else '',
+                            sentence[i + 1] if i + 1 < len(sentence) else '',
+                            sentence[i + 2] if i + 2 < len(sentence) else '',
+                            p1, p2, p3)
       positive = i in sep_indices
       p = Result.POSITIVE.value if positive else Result.NEGATIVE.value
       with open(entries_filename, 'a') as f:
@@ -155,4 +161,3 @@ def process(source_filename: str, entries_filename: str):
       p1 = p2
       p2 = p3
       p3 = p
-
diff --git a/budoux/parser.py b/budoux/parser.py
@@ -25,6 +25,9 @@
 with open(os.path.join(os.path.dirname(__file__), 'skip_nodes.json')) as f:
   SKIP_NODES: typing.Set[str] = set(json.load(f))
 
+HTMLAttr = typing.List[typing.Tuple[str, typing.Union[str, None]]]
+
+
 class TextContentExtractor(HTMLParser):
   """An HTML parser to extract text content.
 
@@ -56,8 +59,7 @@ def __init__(self, chunks: typing.List[str]):
     self.chunks_joined = SEP.join(chunks)
     self.to_skip = False
 
-  def handle_starttag(self, tag: str,
-    attrs: typing.List[typing.Tuple[str, typing.Union[str, None]]]):
+  def handle_starttag(self, tag: str, attrs: HTMLAttr):
     attr_pairs = []
     for attr in attrs:
       if attr[1] is None:
@@ -78,7 +80,7 @@ def handle_data(self, data: str):
       if self.chunks_joined[0] == SEP:
         self.chunks_joined = self.chunks_joined[1 + len(data):]
       else:
-        self.chunks_joined = self.chunks_joined[len(data):]        
+        self.chunks_joined = self.chunks_joined[len(data):]
       return
     for char in data:
       if char == self.chunks_joined[0]:
@@ -117,23 +119,22 @@ def parse(self, sentence: str, thres: int = 1000):
     Returns:
       A list of semantic chunks (List[str]).
     """
-    if sentence == '': return []
+    if sentence == '':
+      return []
     p1 = Result.UNKNOWN.value
     p2 = Result.UNKNOWN.value
     p3 = Result.UNKNOWN.value
     chunks = [sentence[:3]]
     for i in range(3, len(sentence)):
-      feature = get_feature(
-        sentence[i - 3],
-        sentence[i - 2],
-        sentence[i - 1],
-        sentence[i],
-        sentence[i + 1] if i + 1 < len(sentence) else '',
-        sentence[i + 2] if i + 2 < len(sentence) else '',
-        p1, p2, p3)
+      feature = get_feature(sentence[i - 3], sentence[i - 2], sentence[i - 1],
+                            sentence[i],
+                            sentence[i + 1] if i + 1 < len(sentence) else '',
+                            sentence[i + 2] if i + 2 < len(sentence) else '',
+                            p1, p2, p3)
       score = 0
       for f in feature:
-        if not f in self.model: continue
+        if not f in self.model:
+          continue
         score += self.model[f]
       if score > thres:
         chunks.append(sentence[i])

diff --git a/budoux/utils.py b/budoux/utils.py
@@ -15,13 +15,12 @@
 
 from enum import Enum
 
-
-"""The separator string to specify breakpoints."""
 SEP = '▁'
+"""The separator string to specify breakpoints."""
 
 
-"""An enum to represent the type of inference result."""
 class Result(Enum):
+  """An enum to represent the type of inference result."""
   UNKNOWN = 'U'
   POSITIVE = 'B'
   NEGATIVE = 'O'
diff --git a/requirements_dev.txt b/requirements_dev.txt
@@ -2,3 +2,4 @@ numpy
 html5lib
 pytest
 pytest-runner
+yapf
diff --git a/setup.cfg b/setup.cfg
@@ -15,8 +15,11 @@ classifiers =
     Programming Language :: Python :: 3.8
 
 [aliases]
-test=pytest
+test = pytest
 
 [options]
 packages = find:
 include_package_data = True
+
+[yapf]
+based_on_style = yapf
diff --git a/setup.py b/setup.py
@@ -13,4 +13,5 @@
 # limitations under the License.
 
 from setuptools import setup
+
 setup()
diff --git a/tests/context.py b/tests/context.py
@@ -14,9 +14,10 @@
 
 import os
 import sys
+
 LIB_PATH = os.path.join(os.path.dirname(__file__), '..')
 sys.path.insert(0, os.path.abspath(LIB_PATH))
 from budoux import feature_extractor
 from budoux import parser
 from budoux import utils
-from scripts import train
+from scripts import train