data61 · hardbyte · Jun 26, 2018 · May 18, 2018 · Jun 25, 2018 · nbgl
diff --git a/clkhash/__init__.py b/clkhash/__init__.py
@@ -1,10 +1,6 @@
 import pkg_resources
 
-from . import bloomfilter
-from . import field_formats
-from . import key_derivation
-from . import schema
-from . import randomnames
+from . import bloomfilter, field_formats, key_derivation, schema, randomnames
 
 try:
     __version__ = pkg_resources.get_distribution('clkhash').version

diff --git a/clkhash/backports.py b/clkhash/backports.py
@@ -1,19 +1,19 @@
 import csv
-from datetime import datetime
 import re
 import sys
 import time
+from datetime import datetime
 from typing import AnyStr, Callable, cast, Pattern, Sequence, Text
 
 from future.utils import raise_from as _raise_from
 from mypy_extensions import Arg, DefaultNamedArg, NoReturn
 
-
 try:
     int_from_bytes = int.from_bytes
 except AttributeError:
     import codecs
 
+
     def _int_from_bytes(bytes, byteorder, signed=False):
         # type: (Sequence[int], str, bool) -> int
         """ Emulate Python 3's `int.from_bytes`.
@@ -39,6 +39,7 @@ def _int_from_bytes(bytes, byteorder, signed=False):
         hex_str = codecs.encode(bytes, 'hex')  # type: ignore
         return int(hex_str, 16)
 
+
     # Make this cast since Python 2 doesn't have syntax for default
     # named arguments. Hence, must cast so Mypy thinks it matches the
     # original function.
@@ -99,10 +100,9 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
 
 
 unicode_reader = (_p2_unicode_reader  # Python 2 with hacky workarounds.
-                  if sys.version_info < (3,0)
+                  if sys.version_info < (3, 0)
                   else csv.reader)  # Py3 with native Unicode support.
 
-
 if sys.version_info > (3, 2):
     strftime = datetime.strftime
 
@@ -113,17 +113,19 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
     # even number of '%'s before the 's' because those are all escaped.
     _illegal_s = re.compile(r'((^|[^%])(%%)*%s)')
 
+
     def _findall(text, substr):
         # Also finds overlaps
         i = 0
         while True:
             j = text.find(substr, i)
             if j == -1:
                 return
-            
+
             yield j
             i = j + 1
 
+
     def strftime(dt, fmt):
         # type: (datetime, Text) -> Text
         """ strftime that support years < 1900 in Python < 3.2.
@@ -177,7 +179,6 @@ def strftime(dt, fmt):
             s = s[:site] + syear + s[site + _YEAR_LEN:]
         return s
 
-
 # Help MyPy understand that this always throws.
 raise_from = cast(Callable[[BaseException, BaseException], NoReturn],
                   _raise_from)
diff --git a/clkhash/bloomfilter.py b/clkhash/bloomfilter.py
@@ -5,21 +5,21 @@
 """
 
 import base64
-from enum import Enum
-from functools import partial
-from hashlib import md5, sha1
 import hmac
 import math
 import struct
+from enum import Enum
+from functools import partial
+from hashlib import md5, sha1
 from typing import Callable, Iterable, List, Sequence, Text, Tuple
 
 from bitarray import bitarray
 from future.builtins import range, zip
 
 from clkhash import tokenizer
 from clkhash.backports import int_from_bytes
-from clkhash.schema import Schema, GlobalHashingProperties
 from clkhash.field_formats import FieldSpec
+from clkhash.schema import Schema, GlobalHashingProperties
 
 try:
     from hashlib import blake2b
@@ -30,11 +30,11 @@
     # blake2b is already defined.
 
 
-def double_hash_encode_ngrams(ngrams,          # type: Iterable[str]
-                              keys,            # type: Sequence[bytes]
-                              k,               # type: int
-                              l,               # type: int
-                              encoding         # type: str
+def double_hash_encode_ngrams(ngrams,   # type: Iterable[str]
+                              keys,     # type: Sequence[bytes]
+                              k,        # type: int
+                              l,        # type: int
+                              encoding  # type: str
                               ):
     # type: (...) -> bitarray
     """
@@ -64,11 +64,11 @@ def double_hash_encode_ngrams(ngrams,          # type: Iterable[str]
     return bf
 
 
-def double_hash_encode_ngrams_non_singular(ngrams,          # type: Iterable[str]
-                                           keys,            # type: Sequence[bytes]
-                                           k,               # type: int
-                                           l,               # type: int
-                                           encoding         # type: str
+def double_hash_encode_ngrams_non_singular(ngrams,  # type: Iterable[str]
+                                           keys,  # type: Sequence[bytes]
+                                           k,  # type: int
+                                           l,  # type: int
+                                           encoding  # type: str
                                            ):
     # type: (...) -> bitarray.bitarray
     """
@@ -131,11 +131,11 @@ def double_hash_encode_ngrams_non_singular(ngrams,          # type: Iterable[str
     return bf
 
 
-def blake_encode_ngrams(ngrams,          # type: Iterable[str]
-                        keys,            # type: Sequence[bytes]
-                        k,               # type: int
-                        l,               # type: int
-                        encoding         # type: str
+def blake_encode_ngrams(ngrams,  # type: Iterable[str]
+                        keys,  # type: Sequence[bytes]
+                        k,  # type: int
+                        l,  # type: int
+                        encoding  # type: str
                         ):
     # type: (...) -> bitarray.bitarray
     """
@@ -190,13 +190,13 @@ def blake_encode_ngrams(ngrams,          # type: Iterable[str]
     key, = keys  # Unpack.
 
     log_l = int(math.log(l, 2))
-    if not 2**log_l == l:
+    if not 2 ** log_l == l:
         raise ValueError('parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(l))
     bf = bitarray(l)
     bf.setall(False)
     if k < 1:
         return bf
-    num_macs = (k+31) // 32
+    num_macs = (k + 31) // 32
 
     for m in ngrams:
         random_shorts = []  # type: List[int]
@@ -249,7 +249,7 @@ def from_properties(cls,
 
 
 def fold_xor(bloomfilter,  # type: bitarray
-             folds         # type: int
+             folds  # type: int
              ):
     # type: (...) -> bitarray
     """ Performs XOR folding on a Bloom filter.
@@ -279,10 +279,10 @@ def fold_xor(bloomfilter,  # type: bitarray
     return bloomfilter
 
 
-def crypto_bloom_filter(record,          # type: Sequence[Text]
-                        tokenizers,      # type: List[Callable[[Text], Iterable[Text]]]
-                        fields,          # type: Sequence[FieldSpec]
-                        keys,            # type: Sequence[Sequence[bytes]]
+def crypto_bloom_filter(record,  # type: Sequence[Text]
+                        tokenizers,  # type: List[Callable[[Text], Iterable[Text]]]
+                        fields,  # type: Sequence[FieldSpec]
+                        keys,  # type: Sequence[Sequence[bytes]]
                         hash_properties  # type: GlobalHashingProperties
                         ):
     # type: (...) -> Tuple[bitarray, Text, int]
@@ -329,8 +329,8 @@ def crypto_bloom_filter(record,          # type: Sequence[Text]
 
 
 def stream_bloom_filters(dataset,  # type: Iterable[Sequence[Text]]
-                         keys,     # type: Sequence[Sequence[bytes]]
-                         schema    # type: Schema
+                         keys,  # type: Sequence[Sequence[bytes]]
+                         schema  # type: Schema
                          ):
     # type: (...) -> Iterable[Tuple[bitarray, Text, int]]
     """

diff --git a/clkhash/cli.py b/clkhash/cli.py
@@ -12,7 +12,6 @@
 import clkhash
 from clkhash import benchmark as bench, clk, randomnames, validate_data
 
-
 DEFAULT_SERVICE_URL = 'https://es.data61.xyz'
 
 
@@ -43,7 +42,6 @@ def cli(verbose=False):
     """
 
 
-
 @cli.command('hash', short_help="generate hashes from local PII data")
 @click.argument('input', type=click.File('r'))
 @click.argument('keys', nargs=2, type=click.Tuple([str, str]))
@@ -94,7 +92,7 @@ def hash(input, keys, schema, output, quiet, no_header, check_header, validate):
 
 @cli.command('status', short_help='Get status of entity service')
 @click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
-@click.option('-o','--output', type=click.File('w'), default='-')
+@click.option('-o', '--output', type=click.File('w'), default='-')
 @click.option('-v', '--verbose', default=False, is_flag=True, help="Script is more talkative")
 def status(server, output, verbose):
     """Connect to an entity matching server and check the service status.
@@ -130,13 +128,14 @@ def status(server, output, verbose):
 
 """
 
+
 @cli.command('create', short_help="create a mapping on the entity service")
 @click.option('--type', default='permutation_unencrypted_mask',
               help='Alternative protocol/view type of the mapping. Default is unencrypted permutation and mask.')
 @click.option('--schema', type=click.File('r'), help="Schema to publicly share with participating parties.")
 @click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
-@click.option('-o','--output', type=click.File('w'), default='-')
-@click.option('-t','--threshold', type=float, default=0.95)
+@click.option('-o', '--output', type=click.File('w'), default='-')
+@click.option('-t', '--threshold', type=float, default=0.95)
 @click.option('-v', '--verbose', default=False, is_flag=True, help="Script is more talkative")
 def create(type, schema, server, output, threshold, verbose):
     """Create a new mapping on an entity matching server.
@@ -187,7 +186,7 @@ def create(type, schema, server, output, threshold, verbose):
 @click.option('--mapping', help='Server identifier of the mapping')
 @click.option('--apikey', help='Authentication API key for the server.')
 @click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
-@click.option('-o','--output', type=click.File('w'), default='-')
+@click.option('-o', '--output', type=click.File('w'), default='-')
 @click.option('-v', '--verbose', default=False, is_flag=True, help="Script is more talkative")
 def upload(input, mapping, apikey, server, output, verbose):
     """Upload CLK data to entity matching server.
@@ -221,18 +220,16 @@ def upload(input, mapping, apikey, server, output, verbose):
         log(response.text)
         log("When the other party has uploaded their CLKS, you should be able to watch for results")
 
-
     print(response.text, file=output)
 
 
-
 @cli.command('results', short_help="fetch results from entity service")
 @click.option('--mapping',
               help='Server identifier of the mapping')
 @click.option('--apikey', help='Authentication API key for the server.')
 @click.option('-w', '--watch', help='Follow/wait until results are available', is_flag=True)
 @click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
-@click.option('-o','--output', type=click.File('w'), default='-')
+@click.option('-o', '--output', type=click.File('w'), default='-')
 def results(mapping, apikey, watch, server, output):
     """
     Check to see if results are available for a particular mapping

diff --git a/clkhash/clk.py b/clkhash/clk.py
@@ -19,15 +19,14 @@
 from clkhash.validate_data import (validate_entries, validate_header,
                                    validate_row_lengths)
 
-
 log = logging.getLogger('clkhash.clk')
 
 CHUNK_SIZE = 1000
 
 
 def hash_and_serialize_chunk(chunk_pii_data,  # type: Sequence[Sequence[str]]
-                             keys,            # type: Sequence[Sequence[bytes]]
-                             schema           # type: Schema
+                             keys,  # type: Sequence[Sequence[bytes]]
+                             schema  # type: Schema
                              ):
     # type: (...) -> Tuple[List[str], Sequence[int]]
     """
@@ -49,12 +48,12 @@ def hash_and_serialize_chunk(chunk_pii_data,  # type: Sequence[Sequence[str]]
     return clk_data, clk_popcounts
 
 
-def generate_clk_from_csv(input_f,             # type: TextIO
-                          keys,                # type: Tuple[AnyStr, AnyStr]
-                          schema,              # type: Schema
-                          validate=True,       # type: bool
-                          header=True,         # type: Union[bool, AnyStr]
-                          progress_bar=True    # type: bool
+def generate_clk_from_csv(input_f,  # type: TextIO
+                          keys,  # type: Tuple[AnyStr, AnyStr]
+                          schema,  # type: Schema
+                          validate=True,  # type: bool
+                          header=True,  # type: Union[bool, AnyStr]
+                          progress_bar=True  # type: bool
                           ):
     # type: (...) -> List[str]
     """ Generate Bloom filters from CSV file, then serialise them.
@@ -127,11 +126,11 @@ def callback(tics, clk_stats):
     return results
 
 
-def generate_clks(pii_data,       # type: Sequence[Sequence[str]]
-                  schema,         # type: Schema
-                  keys,           # type: Tuple[AnyStr, AnyStr]
+def generate_clks(pii_data,  # type: Sequence[Sequence[str]]
+                  schema,  # type: Schema
+                  keys,  # type: Tuple[AnyStr, AnyStr]
                   validate=True,  # type: bool
-                  callback=None   # type: Optional[Callable[[int, Sequence[int]], None]]
+                  callback=None  # type: Optional[Callable[[int, Sequence[int]], None]]
                   ):
     # type: (...) -> List[str]
 
@@ -158,7 +157,7 @@ def generate_clks(pii_data,       # type: Sequence[Sequence[str]]
         for chunk in chunks(pii_data, chunk_size):
             future = executor.submit(
                 hash_and_serialize_chunk,
-                chunk, key_lists, schema,)
+                chunk, key_lists, schema, )
             if callback is not None:
                 unpacked_callback = cast(Callable[[int, Sequence[int]], None],
                                          callback)
@@ -175,7 +174,7 @@ def generate_clks(pii_data,       # type: Sequence[Sequence[str]]
     return results
 
 
-T = TypeVar('T')      # Declare generic type variable
+T = TypeVar('T')  # Declare generic type variable
 
 
 def chunks(seq, chunk_size):