Merge pull request #12 from montoyjh/master
adds grouping function and test to make aggregation-based builds
dwinston authored Jan 23, 2018
2 parents 7332784 + 91f41b9 commit 47a1dd8
Showing 2 changed files with 60 additions and 5 deletions.
46 changes: 42 additions & 4 deletions maggma/stores.py
@@ -161,8 +161,8 @@ def distinct(self, key, criteria=None, all_exist=False, **kwargs):
         if all_exist:
             agg_pipeline.append(
                 {"$match": {k: {"$exists": True} for k in key}})
-        # use string ints as keys and replace later to avoid bug where periods
-        # can't be in group keys, then reconstruct after
+        # use string ints as keys and replace later to avoid bug
+        # where periods can't be in group keys, then reconstruct after
         group_op = {"$group": {
             "_id": {str(n): "${}".format(k) for n, k in enumerate(key)}}}
         agg_pipeline.append(group_op)
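The comment above refers to a MongoDB limitation: field names containing periods (dotted paths) cannot be used directly as keys inside a $group _id document, so distinct() numbers the keys and rebuilds the mapping afterwards. The snippet below is a minimal illustrative sketch of that idea; the key names and the reconstruct() helper are assumptions for illustration, not code from this commit.

# Illustrative sketch of the dotted-key workaround described in the comment
# above; the key list and the reconstruct() helper are hypothetical.
keys = ["formula", "output.band_gap"]  # a dotted path cannot be a $group _id key

# Group under string-int aliases instead of the raw key names ...
group_id = {str(n): "${}".format(k) for n, k in enumerate(keys)}
# group_id == {"0": "$formula", "1": "$output.band_gap"}

# ... then map each aggregation result's _id back to the original key names.
def reconstruct(result_id, keys):
    return {keys[int(n)]: value for n, value in result_id.items()}

print(reconstruct({"0": "Fe2O3", "1": 2.1}, keys))
# {'formula': 'Fe2O3', 'output.band_gap': 2.1}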
@@ -207,6 +207,44 @@ def update(self, docs, update_lu=True, key=None):
             bulk.find(search_doc).upsert().replace_one(d)
         bulk.execute()
 
+    def groupby(self, keys, properties=None, criteria=None,
+                allow_disk_use=True):
+        """
+        Simple grouping function that will group documents by keys.
+
+        Args:
+            keys (list or string): fields to group documents by
+            properties (list): properties to return in the grouped documents
+            criteria (dict): filter for documents to group
+            allow_disk_use (bool): whether to allow disk use in aggregation
+
+        Returns:
+            command cursor corresponding to grouped documents;
+            elements of the command cursor have the structure
+            {'_id': {"KEY_1": value_1, "KEY_2": value_2, ...},
+             'docs': [documents corresponding to the key values]}
+        """
+        pipeline = []
+        if criteria is not None:
+            pipeline.append({"$match": criteria})
+
+        if properties is not None:
+            pipeline.append({"$project": {p: 1 for p in properties}})
+
+        if isinstance(keys, str):
+            keys = [keys]
+
+        group_id = {key: "${}".format(key) for key in keys}
+        pipeline.append({"$group": {"_id": group_id,
+                                    "docs": {"$push": "$$ROOT"}}})
+
+        return self.collection.aggregate(pipeline, allowDiskUse=allow_disk_use)
+
     def close(self):
         self.collection.database.client.close()

@@ -399,8 +437,8 @@ def distinct(self, key, criteria=None, all_exist=False, **kwargs):
         if all_exist:
             agg_pipeline.append(
                 {"$match": {k: {"$exists": True} for k in key}})
-        # use string ints as keys and replace later to avoid bug where periods
-        # can't be in group keys, then reconstruct after
+        # use string ints as keys and replace later to avoid bug
+        # where periods can't be in group keys, then reconstruct after
         group_op = {"$group": {
             "_id": {str(n): "${}".format(k) for n, k in enumerate(key)}}}
         agg_pipeline.append(group_op)
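A hedged sketch of how the new groupby method might be called from a MongoStore once this change is in place; the database, collection, and field names below are assumptions for illustration, not part of the commit.

# Hypothetical usage of MongoStore.groupby(); store and field names are assumed.
from maggma.stores import MongoStore

store = MongoStore("my_db", "tasks")  # database and collection names assumed
store.connect()

# Each element of the returned command cursor has the shape
# {"_id": {"material_id": ...}, "docs": [matching documents]}.
for group in store.groupby("material_id", criteria={"state": "successful"}):
    print(group["_id"]["material_id"], len(group["docs"]))

store.close()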
19 changes: 18 additions & 1 deletion maggma/tests/test_stores.py
@@ -63,7 +63,24 @@ def test_update(self):
criteria={"d": 8, "f": 9}, properties=["e"])["e"], 7)
self.mongostore.update([{"e": 11, "d": 8, "f": 9}], key=["d","f"])
self.assertEqual(self.mongostore.query_one(
criteria={"d": 8, "f": 9}, properties=["e"])["e"], 11)
criteria={"d": 8, "f": 9}, properties=["e"])["e"], 11)

def test_groupby(self):
self.mongostore.collection.drop()
self.mongostore.update([{"e": 7, "d": 9, "f": 9},
{"e": 7, "d": 9, "f": 10},
{"e": 8, "d": 9, "f": 11},
{"e": 9, "d": 10, "f": 12}], key="f")
data = list(self.mongostore.groupby("d"))
self.assertEqual(len(data), 2)
grouped_by_9 = [g['docs'] for g in data if g['_id']['d'] == 9][0]
self.assertEqual(len(grouped_by_9), 3)
grouped_by_10 = [g['docs'] for g in data if g['_id']['d'] == 10][0]
self.assertEqual(len(grouped_by_10), 1)

data = list(self.mongostore.groupby(["e", "d"]))
self.assertEqual(len(data), 3)

def test_from_db_file(self):
ms = MongoStore.from_db_file(os.path.join(db_dir, "db.json"))

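For reference, the test's groupby("d") call builds essentially the one-stage pipeline shown in the stores.py change above. The pymongo snippet below is a sketch of the same aggregation run directly against a collection seeded with the test's four documents; the connection details and collection name are assumptions.

# Sketch of the pipeline that groupby("d") produces, run with plain pymongo.
# Database/collection names and connection defaults are assumptions.
from pymongo import MongoClient

coll = MongoClient()["maggma_test"]["test"]
coll.delete_many({})
coll.insert_many([{"e": 7, "d": 9, "f": 9},
                  {"e": 7, "d": 9, "f": 10},
                  {"e": 8, "d": 9, "f": 11},
                  {"e": 9, "d": 10, "f": 12}])

pipeline = [{"$group": {"_id": {"d": "$d"},
                        "docs": {"$push": "$$ROOT"}}}]
groups = list(coll.aggregate(pipeline, allowDiskUse=True))
# Two groups come back: _id {"d": 9} with 3 docs and _id {"d": 10} with 1 doc,
# matching the assertions in test_groupby above.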
