Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automated code format update #142

Merged
merged 2 commits into from
Jun 26, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions clkhash/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
import pkg_resources

from . import bloomfilter
from . import field_formats
from . import key_derivation
from . import schema
from . import randomnames
from . import bloomfilter, field_formats, key_derivation, schema, randomnames

try:
__version__ = pkg_resources.get_distribution('clkhash').version
Expand Down
13 changes: 7 additions & 6 deletions clkhash/backports.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
import csv
from datetime import datetime
import re
import sys
import time
from datetime import datetime
from typing import AnyStr, Callable, cast, Pattern, Sequence, Text

from future.utils import raise_from as _raise_from
from mypy_extensions import Arg, DefaultNamedArg, NoReturn


try:
int_from_bytes = int.from_bytes
except AttributeError:
import codecs


def _int_from_bytes(bytes, byteorder, signed=False):
# type: (Sequence[int], str, bool) -> int
""" Emulate Python 3's `int.from_bytes`.
Expand All @@ -39,6 +39,7 @@ def _int_from_bytes(bytes, byteorder, signed=False):
hex_str = codecs.encode(bytes, 'hex') # type: ignore
return int(hex_str, 16)


# Make this cast since Python 2 doesn't have syntax for default
# named arguments. Hence, must cast so Mypy thinks it matches the
# original function.
Expand Down Expand Up @@ -99,10 +100,9 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):


unicode_reader = (_p2_unicode_reader # Python 2 with hacky workarounds.
if sys.version_info < (3,0)
if sys.version_info < (3, 0)
else csv.reader) # Py3 with native Unicode support.


if sys.version_info > (3, 2):
strftime = datetime.strftime

Expand All @@ -113,17 +113,19 @@ def _p2_unicode_reader(unicode_csv_data, dialect=csv.excel, **kwargs):
# even number of '%'s before the 's' because those are all escaped.
_illegal_s = re.compile(r'((^|[^%])(%%)*%s)')


def _findall(text, substr):
# Also finds overlaps
i = 0
while True:
j = text.find(substr, i)
if j == -1:
return

yield j
i = j + 1


def strftime(dt, fmt):
# type: (datetime, Text) -> Text
""" strftime that support years < 1900 in Python < 3.2.
Expand Down Expand Up @@ -177,7 +179,6 @@ def strftime(dt, fmt):
s = s[:site] + syear + s[site + _YEAR_LEN:]
return s


# Help MyPy understand that this always throws.
raise_from = cast(Callable[[BaseException, BaseException], NoReturn],
_raise_from)
56 changes: 28 additions & 28 deletions clkhash/bloomfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,21 @@
"""

import base64
from enum import Enum
from functools import partial
from hashlib import md5, sha1
import hmac
import math
import struct
from enum import Enum
from functools import partial
from hashlib import md5, sha1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems that it’s listing namespace imports (import foo) before from imports (from foo import bar). This behaviour doesn’t seem to be part of PEP8. Looks better though!

from typing import Callable, Iterable, List, Sequence, Text, Tuple

from bitarray import bitarray
from future.builtins import range, zip

from clkhash import tokenizer
from clkhash.backports import int_from_bytes
from clkhash.schema import Schema, GlobalHashingProperties
from clkhash.field_formats import FieldSpec
from clkhash.schema import Schema, GlobalHashingProperties

try:
from hashlib import blake2b
Expand All @@ -30,11 +30,11 @@
# blake2b is already defined.


def double_hash_encode_ngrams(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
def double_hash_encode_ngrams(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In Mypy’s Python 2 examples, the type annotations are aligned like in the original.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had a quick look and it didn't appear to be configurable 👎

I'm not particularly keen to go through and edit these manually... If you feel strongly about it, do you want to make the changes yourself (or look into other tools that might reformat more optimally)?

):
# type: (...) -> bitarray
"""
Expand Down Expand Up @@ -64,11 +64,11 @@ def double_hash_encode_ngrams(ngrams, # type: Iterable[str]
return bf


def double_hash_encode_ngrams_non_singular(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
def double_hash_encode_ngrams_non_singular(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
):
# type: (...) -> bitarray.bitarray
"""
Expand Down Expand Up @@ -131,11 +131,11 @@ def double_hash_encode_ngrams_non_singular(ngrams, # type: Iterable[str
return bf


def blake_encode_ngrams(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
def blake_encode_ngrams(ngrams, # type: Iterable[str]
keys, # type: Sequence[bytes]
k, # type: int
l, # type: int
encoding # type: str
):
# type: (...) -> bitarray.bitarray
"""
Expand Down Expand Up @@ -190,13 +190,13 @@ def blake_encode_ngrams(ngrams, # type: Iterable[str]
key, = keys # Unpack.

log_l = int(math.log(l, 2))
if not 2**log_l == l:
if not 2 ** log_l == l:
raise ValueError('parameter "l" has to be a power of two for the BLAKE2 encoding, but was: {}'.format(l))
bf = bitarray(l)
bf.setall(False)
if k < 1:
return bf
num_macs = (k+31) // 32
num_macs = (k + 31) // 32

for m in ngrams:
random_shorts = [] # type: List[int]
Expand Down Expand Up @@ -249,7 +249,7 @@ def from_properties(cls,


def fold_xor(bloomfilter, # type: bitarray
folds # type: int
folds # type: int
):
# type: (...) -> bitarray
""" Performs XOR folding on a Bloom filter.
Expand Down Expand Up @@ -279,10 +279,10 @@ def fold_xor(bloomfilter, # type: bitarray
return bloomfilter


def crypto_bloom_filter(record, # type: Sequence[Text]
tokenizers, # type: List[Callable[[Text], Iterable[Text]]]
fields, # type: Sequence[FieldSpec]
keys, # type: Sequence[Sequence[bytes]]
def crypto_bloom_filter(record, # type: Sequence[Text]
tokenizers, # type: List[Callable[[Text], Iterable[Text]]]
fields, # type: Sequence[FieldSpec]
keys, # type: Sequence[Sequence[bytes]]
hash_properties # type: GlobalHashingProperties
):
# type: (...) -> Tuple[bitarray, Text, int]
Expand Down Expand Up @@ -329,8 +329,8 @@ def crypto_bloom_filter(record, # type: Sequence[Text]


def stream_bloom_filters(dataset, # type: Iterable[Sequence[Text]]
keys, # type: Sequence[Sequence[bytes]]
schema # type: Schema
keys, # type: Sequence[Sequence[bytes]]
schema # type: Schema
):
# type: (...) -> Iterable[Tuple[bitarray, Text, int]]
"""
Expand Down
15 changes: 6 additions & 9 deletions clkhash/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import clkhash
from clkhash import benchmark as bench, clk, randomnames, validate_data


DEFAULT_SERVICE_URL = 'https://es.data61.xyz'


Expand Down Expand Up @@ -43,7 +42,6 @@ def cli(verbose=False):
"""



@cli.command('hash', short_help="generate hashes from local PII data")
@click.argument('input', type=click.File('r'))
@click.argument('keys', nargs=2, type=click.Tuple([str, str]))
Expand Down Expand Up @@ -94,7 +92,7 @@ def hash(input, keys, schema, output, quiet, no_header, check_header, validate):

@cli.command('status', short_help='Get status of entity service')
@click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option('-o','--output', type=click.File('w'), default='-')
@click.option('-o', '--output', type=click.File('w'), default='-')
@click.option('-v', '--verbose', default=False, is_flag=True, help="Script is more talkative")
def status(server, output, verbose):
"""Connect to an entity matching server and check the service status.
Expand Down Expand Up @@ -130,13 +128,14 @@ def status(server, output, verbose):

"""


@cli.command('create', short_help="create a mapping on the entity service")
@click.option('--type', default='permutation_unencrypted_mask',
help='Alternative protocol/view type of the mapping. Default is unencrypted permutation and mask.')
@click.option('--schema', type=click.File('r'), help="Schema to publicly share with participating parties.")
@click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option('-o','--output', type=click.File('w'), default='-')
@click.option('-t','--threshold', type=float, default=0.95)
@click.option('-o', '--output', type=click.File('w'), default='-')
@click.option('-t', '--threshold', type=float, default=0.95)
@click.option('-v', '--verbose', default=False, is_flag=True, help="Script is more talkative")
def create(type, schema, server, output, threshold, verbose):
"""Create a new mapping on an entity matching server.
Expand Down Expand Up @@ -187,7 +186,7 @@ def create(type, schema, server, output, threshold, verbose):
@click.option('--mapping', help='Server identifier of the mapping')
@click.option('--apikey', help='Authentication API key for the server.')
@click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option('-o','--output', type=click.File('w'), default='-')
@click.option('-o', '--output', type=click.File('w'), default='-')
@click.option('-v', '--verbose', default=False, is_flag=True, help="Script is more talkative")
def upload(input, mapping, apikey, server, output, verbose):
"""Upload CLK data to entity matching server.
Expand Down Expand Up @@ -221,18 +220,16 @@ def upload(input, mapping, apikey, server, output, verbose):
log(response.text)
log("When the other party has uploaded their CLKS, you should be able to watch for results")


print(response.text, file=output)



@cli.command('results', short_help="fetch results from entity service")
@click.option('--mapping',
help='Server identifier of the mapping')
@click.option('--apikey', help='Authentication API key for the server.')
@click.option('-w', '--watch', help='Follow/wait until results are available', is_flag=True)
@click.option('--server', type=str, default=DEFAULT_SERVICE_URL, help="Server address including protocol")
@click.option('-o','--output', type=click.File('w'), default='-')
@click.option('-o', '--output', type=click.File('w'), default='-')
def results(mapping, apikey, watch, server, output):
"""
Check to see if results are available for a particular mapping
Expand Down
29 changes: 14 additions & 15 deletions clkhash/clk.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@
from clkhash.validate_data import (validate_entries, validate_header,
validate_row_lengths)


log = logging.getLogger('clkhash.clk')

CHUNK_SIZE = 1000


def hash_and_serialize_chunk(chunk_pii_data, # type: Sequence[Sequence[str]]
keys, # type: Sequence[Sequence[bytes]]
schema # type: Schema
keys, # type: Sequence[Sequence[bytes]]
schema # type: Schema
):
# type: (...) -> Tuple[List[str], Sequence[int]]
"""
Expand All @@ -49,12 +48,12 @@ def hash_and_serialize_chunk(chunk_pii_data, # type: Sequence[Sequence[str]]
return clk_data, clk_popcounts


def generate_clk_from_csv(input_f, # type: TextIO
keys, # type: Tuple[AnyStr, AnyStr]
schema, # type: Schema
validate=True, # type: bool
header=True, # type: Union[bool, AnyStr]
progress_bar=True # type: bool
def generate_clk_from_csv(input_f, # type: TextIO
keys, # type: Tuple[AnyStr, AnyStr]
schema, # type: Schema
validate=True, # type: bool
header=True, # type: Union[bool, AnyStr]
progress_bar=True # type: bool
):
# type: (...) -> List[str]
""" Generate Bloom filters from CSV file, then serialise them.
Expand Down Expand Up @@ -127,11 +126,11 @@ def callback(tics, clk_stats):
return results


def generate_clks(pii_data, # type: Sequence[Sequence[str]]
schema, # type: Schema
keys, # type: Tuple[AnyStr, AnyStr]
def generate_clks(pii_data, # type: Sequence[Sequence[str]]
schema, # type: Schema
keys, # type: Tuple[AnyStr, AnyStr]
validate=True, # type: bool
callback=None # type: Optional[Callable[[int, Sequence[int]], None]]
callback=None # type: Optional[Callable[[int, Sequence[int]], None]]
):
# type: (...) -> List[str]

Expand All @@ -158,7 +157,7 @@ def generate_clks(pii_data, # type: Sequence[Sequence[str]]
for chunk in chunks(pii_data, chunk_size):
future = executor.submit(
hash_and_serialize_chunk,
chunk, key_lists, schema,)
chunk, key_lists, schema, )
if callback is not None:
unpacked_callback = cast(Callable[[int, Sequence[int]], None],
callback)
Expand All @@ -175,7 +174,7 @@ def generate_clks(pii_data, # type: Sequence[Sequence[str]]
return results


T = TypeVar('T') # Declare generic type variable
T = TypeVar('T') # Declare generic type variable


def chunks(seq, chunk_size):
Expand Down
Loading