Skip to content

Commit

Permalink
Implement merging of dicts with missing keys
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud committed Sep 3, 2014
1 parent 3d72df1 commit 2ee7f50
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 9 deletions.
32 changes: 23 additions & 9 deletions datashape/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@
from dateutil.parser import parse as dateparse
from datetime import datetime, date, time
from .dispatch import dispatch
from time import strptime
from toolz import compose, second

from .coretypes import (int32, int64, float64, bool_, complex128, datetime_,
Option, isdimension, var, from_numpy, Tuple, null,
Record, string, Null, DataShape, real, date_, time_,
Mono)
Record, string, Null, DataShape, real, date_, time_)
from .py2help import _strtypes, _inttypes
from .internal_utils import _toposort, groupby

Expand Down Expand Up @@ -96,21 +95,20 @@ def discover(seq):
for column in columns]
unite = do_one([unite_identical, unite_merge_dimensions, Tuple])
return len(seq) * unite(types)
except AttributeError: # no subshape available
except AttributeError: # no subshape available
pass

# [{k: v, k: v}, {k: v, k: v}]
if (all(isinstance(item, dict) for item in seq) and
len(set(frozenset(item.keys()) for item in seq)) == 1):
keys = sorted(seq[0].keys())
columns = [[item[key] for item in seq] for key in keys]
if all(isinstance(item, dict) for item in seq):
keys = sorted(seq[max(enumerate(seq), key=compose(len, second))[0]].keys())
columns = [[item.get(key, None) for item in seq] for key in keys]
try:
types = [unite([discover(dshape) for dshape in column]).subshape[0]
for column in columns]
return len(seq) * Record(list(zip(keys, types)))
except AttributeError:
pass


types = list(map(discover, seq))
return do_one([unite_identical, unite_merge_dimensions, Tuple])(types)

Expand Down Expand Up @@ -267,3 +265,19 @@ def descendents(d, x):
children -= desc
desc.update(children)
return desc


# @dispatch(list)
# def discover(lst):
# ct = Counter()

# for el in lst:
# items = discover(el)
# ct += Counter(items)

# ((_, _), m), = ct.most_common(1)
# s = [(field, datashape.Option(ftype) if count < m else ftype)
# for (field, ftype), count in ct.items()]

# dshape = len(lst) * datashape.Record(sorted(s))
# return dshape
8 changes: 8 additions & 0 deletions datashape/tests/test_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,11 @@ def test_big_discover():

def test_unite_base():
    """Check that uniting ``date_`` with ``datetime_`` gives ``2 * datetime_``."""
    united = unite_base([date_, datetime_])
    assert united == 2 * datetime_


def test_list_of_dicts():
    """Discover a record datashape from dicts that do not share every key.

    The second record has no ``'amount'`` entry; the expected datashape
    lists that field as ``?int64`` while ``name`` stays a plain ``string``.
    """
    records = [
        {'name': 'Alice', 'amount': 100},
        {'name': 'Bob'},
    ]
    assert discover(records) == dshape('2 * {amount: ?int64, name: string}')

0 comments on commit 2ee7f50

Please sign in to comment.