From e81d6a82c18393e293c1a5f82c5b21f7633eb5bb Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 5 Nov 2019 15:28:51 -0800
Subject: [PATCH 01/99] ignore vs_code

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index e8605a5db..11624d729 100644
--- a/.gitignore
+++ b/.gitignore
@@ -110,3 +110,4 @@ ENV/
 
 .idea
 .DS_Store
+.vscode

From 9cab45d04f8e462a7def016ac19af4c60bf18865 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 5 Nov 2019 15:29:06 -0800
Subject: [PATCH 02/99] refactor core components

---
 maggma/core/__init__.py  |   3 +
 maggma/core/builder.py   | 139 +++++++++++++++++
 maggma/core/store.py     | 317 +++++++++++++++++++++++++++++++++++++++
 maggma/core/validator.py |  34 +++++
 4 files changed, 493 insertions(+)
 create mode 100644 maggma/core/__init__.py
 create mode 100644 maggma/core/builder.py
 create mode 100644 maggma/core/store.py
 create mode 100644 maggma/core/validator.py

diff --git a/maggma/core/__init__.py b/maggma/core/__init__.py
new file mode 100644
index 000000000..c6d8b73f1
--- /dev/null
+++ b/maggma/core/__init__.py
@@ -0,0 +1,3 @@
+from maggma.core.store import Store, Sort, DateTimeFormat, StoreError
+from maggma.core.builder import Builder
+from maggma.core.validator import Validator
\ No newline at end of file
diff --git a/maggma/core/builder.py b/maggma/core/builder.py
new file mode 100644
index 000000000..678ac8a8d
--- /dev/null
+++ b/maggma/core/builder.py
@@ -0,0 +1,139 @@
+# coding: utf-8
+"""
+Module containing the core builder definition
+"""
+from __future__ import annotations
+
+import logging
+from abc import ABCMeta, abstractmethod
+from typing import Union, Optional, Dict, List, Iterator, Any
+
+from monty.json import MSONable, MontyDecoder
+from maggma.utils import grouper
+from maggma.core import Store
+
+
+class Builder(MSONable, metaclass=ABCMeta):
+    """
+    Base Builder class
+    At minimum this class should implement:
+    get_items - Get items from the sources
+    update_targets - Updates the sources with results
+
+    Multiprocessing and MPI processing can be used if all
+    the data processing is  limited to process_items
+    """
+
+    def __init__(
+        self,
+        sources: Union[List[Store], Store],
+        targets: Union[List[Store], Store],
+        chunk_size: int = 1000,
+        query: Optional[Dict] = None,
+    ):
+        """
+        Initialize the builder the framework.
+
+        Args:
+            sources: source Store(s)
+            targets: target Store(s)
+            chunk_size: chunk size for processing
+            query: dictionary of options to utilize on a source;
+                   Each builder has internal logic on which souce this will apply to
+        """
+        self.sources = sources if isinstance(sources, list) else [sources]
+        self.targets = targets if isinstance(targets, list) else [targets]
+        self.chunk_size = chunk_size
+        self.query = query
+        self.logger = logging.getLogger(type(self).__name__)
+        self.logger.addHandler(logging.NullHandler())
+
+    def connect(self):
+        """
+        Connect to the builder sources and targets.
+        """
+        stores = self.sources + self.targets
+        for s in stores:
+            s.connect()
+
+    @abstractmethod
+    def get_items(self) -> Iterator:
+        """
+        Returns all the items to process.
+
+        Returns:
+            generator or list of items to process
+        """
+        pass
+
+    def process_item(self, item: Any) -> Any:
+        """
+        Process an item. Should not expect DB access as this can be run MPI
+        Default behavior is to return the item.
+        Args:
+            item:
+
+        Returns:
+           item: an item to update
+        """
+        return item
+
+    @abstractmethod
+    def update_targets(self, items: List):
+        """
+        Takes a dictionary of targets and items from process item and updates them
+        Can also perform other book keeping in the process such as storing gridfs oids, etc.
+
+        Args:
+            items:
+
+        Returns:
+
+        """
+        pass
+
+    def finalize(self, cursor=None):
+        """
+        Perform any final clean up.
+        """
+        # Close any Mongo connections.
+        for store in self.sources + self.targets:
+            try:
+                store.collection.database.client.close()
+            except AttributeError:
+                continue
+        # Runner will pass iterable yielded by `self.get_items` as `cursor`. If
+        # this is a Mongo cursor with `no_cursor_timeout=True` (not the
+        # default), we must be explicitly kill it.
+        try:
+            cursor and cursor.close()
+        except AttributeError:
+            pass
+
+    def run(self):
+        """
+        Run the builder serially
+
+        Args:
+            builder_id (int): the index of the builder in the builders list
+        """
+        self.connect()
+
+        cursor = self.get_items()
+
+        for chunk in grouper(cursor, self.chunk_size):
+            self.logger.info("Processing batch of {} items".format(self.chunk_size))
+            processed_items = [
+                self.process_item(item) for item in chunk if item is not None
+            ]
+            self.update_targets(processed_items)
+
+        self.finalize(cursor)
+
+    def __getstate__(self):
+        return self.as_dict()
+
+    def __setstate__(self, d):
+        d = {k: v for k, v in d.items() if not k.startswith("@")}
+        d = MontyDecoder().process_decoded(d)
+        self.__init__(**d)
diff --git a/maggma/core/store.py b/maggma/core/store.py
new file mode 100644
index 000000000..ba356630a
--- /dev/null
+++ b/maggma/core/store.py
@@ -0,0 +1,317 @@
+# coding: utf-8
+"""
+Module containing the core Store definition
+"""
+from __future__ import annotations
+
+import logging
+
+
+from abc import ABCMeta, abstractmethod, abstractproperty
+
+from datetime import datetime
+from enum import Enum
+from typing import Union, Optional, Dict, List, Iterator, Tuple
+
+from pydash import identity
+
+from monty.dev import deprecated
+from monty.json import MSONable, MontyDecoder
+from maggma.utils import source_keys_updated, LU_KEY_ISOFORMAT
+from maggma.core import Validator
+
+
+class Sort(Enum):
+    Ascending = 1
+    Descending = 2
+
+
+class DateTimeFormat(Enum):
+    DateTime = "datetime"
+    IsoFormat = "isoformat"
+
+
+class Store(MSONable, metaclass=ABCMeta):
+    """
+    Abstract class for a data Store
+    Defines the interface for all data going in and out of a Builder
+    """
+
+    def __init__(
+        self,
+        key: str = "task_id",
+        last_updated_field: str = "last_updated",
+        last_updated_type: DateTimeFormat = "datetime",
+        validator: Optional[Validator] = None,
+    ):
+        """
+        Args:
+            key : master key to index on
+            last_updated_field : field for date/time stamping the data
+            last_updated_type : the date/time format for the last_updated_field.
+                                Can be "datetime" or "isoformat"
+            validator : Validator to validate documents going into the store
+        """
+        self.key = key
+        self.last_updated_field = last_updated_field
+        self.last_updated_type = last_updated_type
+        self._lu_func = (
+            LU_KEY_ISOFORMAT
+            if last_updated_type == DateTimeFormat.IsoFormat
+            else (identity, identity)
+        )
+        self.validator = validator
+        self.logger = logging.getLogger(type(self).__name__)
+        self.logger.addHandler(logging.NullHandler())
+
+    @abstractproperty
+    @deprecated(message="This will be removed in the future")
+    def collection(self):
+        """
+        Returns a handle to the pymongo collection object
+        Not guaranteed to exist in the future
+        """
+        pass
+
+    @abstractmethod
+    def connect(self, force_reset: bool = False):
+        """
+        Connect to the source data
+        """
+        pass
+
+    @abstractmethod
+    def close(self):
+        """
+        Closes any connections
+        """
+        pass
+
+    @abstractmethod
+    def query(
+        self,
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Dict]:
+        """
+        Queries the Store for a set of documents
+
+        Args:
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
+        """
+        pass
+
+    def query_one(self, criteria=None, properties=None, **kwargs):
+        """
+        Function that gets a single document from GridFS. This store
+        ignores all property projections as its designed for whole
+        document access
+
+        Args:
+            criteria (dict): filter for query, matches documents
+                against key-value pairs
+            properties (list or dict): This will be ignored by the GridFS
+                Store
+            **kwargs (kwargs): further kwargs to Collection.find
+        """
+        return next(self.query(criteria=criteria, **kwargs), None)
+
+    def distinct(
+        self,
+        field: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        all_exist: bool = False,
+    ) -> List:
+        """
+        Get all distinct values for a key
+
+        Args:
+            field: the field(s) to get distinct values for
+            criteria : PyMongo filter for documents to search in
+            all_exist : ensure all fields exist for the distinct set
+        """
+        field = field if isinstance(field, list) else [field]
+
+        criteria = criteria or {}
+
+        if all_exist:
+            criteria.update({f: {"$exists": 1} for f in field if f not in criteria})
+        results = [
+            key for key, _ in self.groupby(field, properties=field, criteria=criteria)
+        ]
+        return results
+
+    @abstractmethod
+    def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
+        """
+        Update documents into the Store
+
+        Args:
+            docs: the document or list of documents to update
+            key: field name(s) to determine uniqueness for a
+                 document, can be a list of multiple fields,
+                 a single field, or None if the Store's key
+                 field is to be used
+        """
+        pass
+
+    @abstractmethod
+    def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
+        """
+        Tries to create an index and return true if it suceeded
+        Args:
+            key: single key to index
+            unique: Whether or not this index contains only unique keys
+
+        Returns:
+            bool indicating if the index exists/was created
+        """
+        pass
+
+    @abstractmethod
+    def groupby(
+        self,
+        keys: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Tuple[Dict, List[Dict]]]:
+        """
+        Simple grouping function that will group documents
+        by keys.
+
+        Args:
+            keys: fields to group documents
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
+
+        Returns:
+            generator returning tuples of (key, list of docs)
+        """
+        pass
+
+    @property
+    def last_updated(self):
+        """
+        Provides the most recent last_updated date time stamp from
+        the documents in this Store
+        """
+        doc = next(
+            self.query(
+                properties=[self.last_updated_field],
+                sort={self.last_updated_field: Sort.Descending},
+                limit=1,
+            ),
+            None,
+        )
+        if doc and self.last_updated_field not in doc:
+            raise StoreError(
+                f"No field '{self.last_updated_field}' in store document. Please ensure Store.last_updated_field "
+                "is a datetime field in your store that represents the time of "
+                "last update to each document."
+            )
+        # Handle when collection has docs but `NoneType` last_updated_field.
+        return (
+            self._lu_func[0](doc[self.last_updated_field])
+            if (doc and doc[self.last_updated_field])
+            else datetime.min
+        )
+
+    def newer_in(
+        self,
+        target: Store,
+        key: Union[str, None] = None,
+        criteria: Optional[Dict] = None,
+        exhaustive: bool = False,
+    ) -> List[str]:
+        """
+        Returns the keys of documents that are newer in the target
+        Store than this Store.
+
+        Args:
+            key: a single key field to return, defaults to Store.key
+            criteria : PyMongo filter for documents to search in
+            exhaustive: triggers an item-by-item check vs. checking
+                        the last_updated of the target Store and using
+                        that to filter out new items in
+        """
+        self.ensure_index(self.key)
+        self.ensure_index(self.last_updated_field)
+        if exhaustive:
+            return source_keys_updated(target, self, query=criteria)
+        else:
+            key = key if key is not None else self.key  # Default value
+            criteria = {
+                self.last_updated_field: {"$gt": self._lu_func[1](self.last_updated)}
+            }
+            return target.distinct(field=key, criteria=criteria)
+
+    @deprecated(message="Please use Store.newer_in")
+    def lu_filter(self, targets):
+        """Creates a MongoDB filter for new documents.
+
+        By "new", we mean documents in this Store that were last updated later
+        than any document in targets.
+
+        Args:
+            targets (list): A list of Stores
+
+        """
+        if isinstance(targets, Store):
+            targets = [targets]
+
+        lu_list = [t.last_updated for t in targets]
+        return {self.last_updated_field: {"$gt": self._lu_func[1](max(lu_list))}}
+
+    @deprecated(message="Use Store.newer_in")
+    def updated_keys(self, target, criteria=None):
+        """
+        Returns keys for docs that are newer in the target store in comparison
+        with this store when comparing the last updated field (last_updated_field)
+
+        Args:
+            target (Store): store to look for updated documents
+            criteria (dict): mongo query to limit scope
+
+        Returns:
+            list of keys that have been updated in target store
+        """
+        self.ensure_index(self.key)
+        self.ensure_index(self.last_updated_field)
+
+        return source_keys_updated(target, self, query=criteria)
+
+    def __eq__(self, other):
+        return hash(self) == hash(other)
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        return hash((self.last_updated_field,))
+
+    def __getstate__(self):
+        return self.as_dict()
+
+    def __setstate__(self, d):
+        d = {k: v for k, v in d.items() if not k.startswith("@")}
+        d = MontyDecoder().process_decoded(d)
+        self.__init__(**d)
+
+
+class StoreError(Exception):
+    """General Store-related error."""
+
+    pass
diff --git a/maggma/core/validator.py b/maggma/core/validator.py
new file mode 100644
index 000000000..f5d763882
--- /dev/null
+++ b/maggma/core/validator.py
@@ -0,0 +1,34 @@
+# coding: utf-8
+"""
+Validator class for document-level validation on Stores. Attach an instance
+of a Validator subclass to a Store .schema variable to enable validation on
+that Store.
+"""
+
+from abc import ABCMeta, abstractmethod
+from monty.json import MSONable
+from typing import Dict
+
+
+class Validator(MSONable, metaclass=ABCMeta):
+    """
+    A generic class to perform document-level validation on Stores.
+    Attach a Validator to a Store during initialization, any all documents
+    added to the Store will call .validate_doc() before being added.
+    """
+
+    @abstractmethod
+    def is_valid(self, doc: Dict) -> bool:
+        """
+        Returns (bool): True if document valid, False if document
+        invalid
+        """
+        return NotImplementedError
+
+    @abstractmethod
+    def validation_errors(self, doc: Dict) -> bool:
+        """
+        Returns (bool): if document is not valid, provide a list of
+        strings to display for why validation has failed
+        """
+        return NotImplementedError

From 099dfb96278aad6f03ce899a44cc425b230e20b8 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 5 Nov 2019 15:29:13 -0800
Subject: [PATCH 03/99] module for stores

---
 maggma/stores/__init__.py        |   3 +
 maggma/stores/advanced_stores.py | 325 ++++++++++++++++++++++++++
 maggma/stores/aws.py             | 240 +++++++++++++++++++
 maggma/stores/gridfs.py          | 307 ++++++++++++++++++++++++
 maggma/stores/mongolike.py       | 388 +++++++++++++++++++++++++++++++
 5 files changed, 1263 insertions(+)
 create mode 100644 maggma/stores/__init__.py
 create mode 100644 maggma/stores/advanced_stores.py
 create mode 100644 maggma/stores/aws.py
 create mode 100644 maggma/stores/gridfs.py
 create mode 100644 maggma/stores/mongolike.py

diff --git a/maggma/stores/__init__.py b/maggma/stores/__init__.py
new file mode 100644
index 000000000..19addd98e
--- /dev/null
+++ b/maggma/stores/__init__.py
@@ -0,0 +1,3 @@
+from maggma.stores.mongolike import MongoStore, JSONStore, MemoryStore
+from maggma.stores.gridfs import GridFSStore
+from maggma.stores.aws import AmazonS3Store
\ No newline at end of file
diff --git a/maggma/stores/advanced_stores.py b/maggma/stores/advanced_stores.py
new file mode 100644
index 000000000..961aca13d
--- /dev/null
+++ b/maggma/stores/advanced_stores.py
@@ -0,0 +1,325 @@
+# coding: utf-8
+"""
+Advanced Stores for behavior outside normal access patterns
+"""
+import os
+import hvac
+import json
+from typing import Union, Optional, Dict, List, Iterator
+
+from maggma.core import Store, StoreError, Sort
+from maggma.stores.mongolike import MongoStore
+from maggma.utils import lazy_substitute, substitute
+from mongogrant import Client
+from mongogrant.client import check
+from mongogrant.config import Config
+
+
+class MongograntStore(MongoStore):
+    """Initialize a Store with a mongogrant "<role>:<host>/<db>." spec.
+
+    This class does not subclass MongoStore, though it aims to reproduce
+    relevant functionality through method delegation, e.g. groupby.
+
+    It does not subclass MongoStore because some class methods of
+    MongoStore, e.g. from_db_file and from_collection, are not supported.
+
+    mongogrant documentation: https://github.com/materialsproject/mongogrant
+    """
+
+    def __init__(
+        self,
+        mongogrant_spec: str,
+        collection_name: str,
+        mgclient_config_path: Optional[str] = None,
+        **kwargs
+    ):
+        """
+
+        Args:
+            mongogrant_spec (str): of the form <role>:<host>/<db>, where
+                role is one of {"read", "readWrite"} or aliases {"ro", "rw"};
+                host is a db host (w/ optional port) or alias; and db is a db
+                on that host, or alias. See mongogrant documentation.
+            collection_name (str): name of mongo collection
+            mgclient_config_path (str): Path to mongogrant client config file,
+               or None if default path (`mongogrant.client.path`).
+        """
+        self.mongogrant_spec = mongogrant_spec
+        self.collection_name = collection_name
+        self.mgclient_config_path = mgclient_config_path
+        self._collection = None
+        if set(("username", "password", "database", "host")) & set(kwargs):
+            raise StoreError(
+                "MongograntStore does not accept "
+                "username, password, database, or host "
+                "arguments. Use `mongogrant_spec`."
+            )
+        self.kwargs = kwargs
+        super().__init__(**kwargs)
+
+    def connect(self, force_reset: bool = False):
+        """
+        Connect to the mongogrant source
+        Args:
+            force_reset: forces the connection to reset rather than just
+                         ensuring the connection is present
+        """
+        if not self._collection or force_reset:
+            if self.mgclient_config_path:
+                config = Config(check=check, path=self.mgclient_config_path)
+                client = Client(config)
+            else:
+                client = Client()
+            db = client.db(self.mongogrant_spec)
+            self._collection = db[self.collection_name]
+
+    def __hash__(self):
+        return hash((self.mongogrant_spec, self.collection_name, self.lu_field))
+
+
+class VaultStore(MongoStore):
+    """
+    Extends MongoStore to read credentials out of Vault server
+    and uses these values to initialize MongoStore instance
+    """
+
+    def __init__(self, collection_name: str, vault_secret_path: str):
+        """
+        collection (string): name of mongo collection
+        vault_secret_path (string): path on vault server with mongo creds object
+
+        Environment (must be set prior to invocation):
+        VAULT_ADDR - URL of vault server (eg. https://matgen8.lbl.gov:8200)
+        VAULT_TOKEN or GITHUB_TOKEN - token used to authenticate to vault
+        """
+        # TODO: Switch this over to Pydantic ConfigSettings
+        vault_addr = os.getenv("VAULT_ADDR")
+
+        if not vault_addr:
+            raise RuntimeError("VAULT_ADDR not set")
+
+        client = hvac.Client(vault_addr)
+
+        # If we have a vault token use this
+        token = os.getenv("VAULT_TOKEN")
+
+        # Look for a github token instead
+        if not token:
+            github_token = os.getenv("GITHUB_TOKEN")
+
+            if github_token:
+                client.auth_github(github_token)
+            else:
+                raise RuntimeError("VAULT_TOKEN or GITHUB_TOKEN not set")
+        else:
+            client.token = token
+            if not client.is_authenticated():
+                raise RuntimeError("Bad token")
+
+        # Read the vault secret
+        json_db_creds = client.read(vault_secret_path)
+        db_creds = json.loads(json_db_creds["data"]["value"])
+
+        database = db_creds.get("db")
+        host = db_creds.get("host", "localhost")
+        port = db_creds.get("port", 27017)
+        username = db_creds.get("username", "")
+        password = db_creds.get("password", "")
+
+        super(VaultStore, self).__init__(
+            database, collection_name, host, port, username, password
+        )
+
+
+class AliasingStore(Store):
+    """
+    Special Store that aliases for the primary accessors
+    """
+
+    def __init__(self, store: Store, aliases: Dict, **kwargs):
+        """
+        Args:
+            store: the store to wrap around
+            aliases: dict of aliases of the form external key: internal key
+        """
+        self.store = store
+        # Given an external key tells what the internal key is
+        self.aliases = aliases
+        # Given the internal key tells us what the external key is
+        self.reverse_aliases = {v: k for k, v in aliases.items()}
+        self.kwargs = kwargs
+
+        kwargs.update({"lu_field": store.lu_field, "lu_type": store.lu_type})
+        super(AliasingStore, self).__init__(**kwargs)
+
+    def query(
+        self,
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Dict]:
+        """
+        Queries the Store for a set of documents
+
+        Args:
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
+        """
+
+        if isinstance(properties, list):
+            properties = {p: 1 for p in properties}
+
+        criteria = criteria if criteria else {}
+        substitute(properties, self.reverse_aliases)
+        lazy_substitute(criteria, self.reverse_aliases)
+        for d in self.store.query(
+            properties=properties, criteria=criteria, sort=sort, limit=limit, skip=skip
+        ):
+            substitute(d, self.aliases)
+            yield d
+
+    def distinct(
+        self, field: Union[List[str], str], criteria: Optional[Dict] = None, all_exist: bool = False
+    ) -> List:
+        """
+        Get all distinct values for a key
+
+        Args:
+            field: the field(s) to get distinct values for
+            criteria : PyMongo filter for documents to search in
+            all_exist : ensure all fields exist for the distinct set
+        """
+        criteria = criteria if criteria else {}
+        lazy_substitute(criteria, self.reverse_aliases)
+        field = field if isinstance(field, list) else [field]
+        # substitute forward
+        field = [self.aliases[f] for f in field]
+        return self.store.distinct(field, criteria=criteria)
+
+    def groupby(self, keys, criteria=None, properties=None, **kwargs):
+        # Convert to a list
+        keys = keys if isinstance(keys, list) else [keys]
+
+        # Make the aliasing transformations on keys
+        keys = [self.aliases[k] if k in self.aliases else k for k in keys]
+
+        # Update criteria and properties based on aliases
+        criteria = criteria if criteria else {}
+        substitute(properties, self.reverse_aliases)
+        lazy_substitute(criteria, self.reverse_aliases)
+
+        return self.store.groupby(
+            keys=keys, properties=properties, criteria=criteria, **kwargs
+        )
+
+    def update(self, docs, update_lu=True, key=None):
+        key = key if key else self.key
+
+        for d in docs:
+            substitute(d, self.reverse_aliases)
+
+        if key in self.aliases:
+            key = self.aliases[key]
+
+        self.store.update(docs, update_lu=update_lu, key=key)
+
+    def ensure_index(self, key, unique=False, **kwargs):
+        if key in self.aliases:
+            key = self.aliases
+        return self.store.ensure_index(key, unique, **kwargs)
+
+    def close(self):
+        self.store.close()
+
+    @property
+    def collection(self):
+        return self.store.collection
+
+    def connect(self, force_reset=False):
+        self.store.connect(force_reset=force_reset)
+
+
+class SandboxStore(Store):
+    """
+    Provides a sandboxed view to another store
+    """
+
+    def __init__(self, store, sandbox, exclusive=False):
+        """
+        store (Store): store to wrap sandboxing around
+        sandbox (string): the corresponding sandbox
+        exclusive (bool): whether to be exclusively in this sandbox or include global items
+        """
+        self.store = store
+        self.sandbox = sandbox
+        self.exclusive = exclusive
+        super().__init__(
+            key=self.store.key,
+            lu_field=self.store.lu_field,
+            lu_type=self.store.lu_type,
+            validator=self.store.validator,
+        )
+
+    @property
+    def sbx_criteria(self):
+        if self.exclusive:
+            return {"sbxn": self.sandbox}
+        else:
+            return {
+                "$or": [{"sbxn": {"$in": [self.sandbox]}}, {"sbxn": {"$exists": False}}]
+            }
+
+    def query(self, criteria=None, properties=None, **kwargs):
+        criteria = (
+            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
+        )
+        return self.store.query(properties=properties, criteria=criteria, **kwargs)
+
+    def query_one(self, criteria=None, properties=None, **kwargs):
+        criteria = (
+            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
+        )
+        return self.store.query_one(properties=properties, criteria=criteria, **kwargs)
+
+    def distinct(self, key, criteria=None, **kwargs):
+        criteria = (
+            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
+        )
+        return self.store.distinct(key=key, criteria=criteria, **kwargs)
+
+    def groupby(self, keys, criteria=None, properties=None, **kwargs):
+        criteria = (
+            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
+        )
+
+        return self.store.groupby(
+            keys=keys, properties=properties, criteria=criteria, **kwargs
+        )
+
+    def update(self, docs, update_lu=True, key=None):
+        for d in docs:
+            if "sbxn" in d:
+                d["sbxn"] = list(set(d["sbxn"] + [self.sandbox]))
+            else:
+                d["sbxn"] = [self.sandbox]
+
+        self.store.update(docs, update_lu=update_lu, key=key)
+
+    def ensure_index(self, key, unique=False, **kwargs):
+        return self.store.ensure_index(key, unique, **kwargs)
+
+    def close(self):
+        self.store.close()
+
+    @property
+    def collection(self):
+        return self.store.collection
+
+    def connect(self, force_reset=False):
+        self.store.connect(force_reset=force_reset)
diff --git a/maggma/stores/aws.py b/maggma/stores/aws.py
new file mode 100644
index 000000000..432d0b792
--- /dev/null
+++ b/maggma/stores/aws.py
@@ -0,0 +1,240 @@
+# coding: utf-8
+"""
+Advanced Stores for behavior outside normal access patterns
+"""
+
+import json
+import zlib
+from datetime import datetime
+
+from maggma.stores import Store
+from monty.json import jsanitize
+
+try:
+    import boto3
+    import botocore
+
+    boto_import = True
+except ImportError:
+    boto_import = False
+
+
+class AmazonS3Store(Store):
+    """
+    GridFS like storage using Amazon S3 and a regular store for indexing
+    Assumes Amazon AWS key and secret key are set in environment or default config file
+    """
+
+    def __init__(self, index, bucket, **kwargs):
+        """
+        Initializes an S3 Store
+        Args:
+            index (Store): a store to use to index the S3 Bucket
+            bucket (str) : name of the bucket
+        """
+        if not boto_import:
+            raise ValueError(
+                "boto not available, please install boto3 to " "use AmazonS3Store"
+            )
+        self.index = index
+        self.bucket = bucket
+        self.s3 = None
+        self.s3_bucket = None
+        # Force the key to be the same as the index
+        kwargs["key"] = index.key
+        super(AmazonS3Store, self).__init__(**kwargs)
+
+    def connect(self, force_reset=False):
+        self.index.connect(force_reset=force_reset)
+        if not self.s3:
+            self.s3 = boto3.resource("s3")
+            # TODO: Provide configuration variable to create bucket if not present
+            if self.bucket not in self.s3.list_buckets():
+                raise Exception("Bucket not present on AWS: {}".format(self.bucket))
+            self.s3_bucket = self.s3.Bucket(self.bucket)
+
+    def close(self):
+        self.index.close()
+
+    @property
+    def collection(self):
+        # For now returns the index collection since that is what we would "search" on
+        return self.index
+
+    def query(self, criteria=None, properties=None, **kwargs):
+        """
+        Function that gets data from Amazon S3. This store ignores all
+        property projections as its designed for whole document access
+
+        Args:
+            properties (list or dict): This will be ignored by the S3
+                Store
+            criteria (dict): filter for query, matches documents
+                against key-value pairs
+            **kwargs (kwargs): further kwargs to Collection.find
+        """
+        for f in self.index.query(criteria=criteria, **kwargs):
+            try:
+                data = self.s3_bucket.Object(f[self.key]).get()
+            except botocore.exceptions.ClientError as e:
+                # If a client error is thrown, then check that it was a 404 error.
+                # If it was a 404 error, then the object does not exist.
+                error_code = int(e.response["Error"]["Code"])
+                if error_code == 404:
+                    self.logger.error("Could not find S3 object {}".format(f[self.key]))
+                    break
+
+            if f.get("compression", "") != "zlib":
+                data = zlib.decompress(data)
+
+            yield json.loads(data)
+
+    def query_one(self, criteria=None, properties=None, **kwargs):
+        """
+        Function that gets a single document from Amazon S3. This store
+        ignores all property projections as its designed for whole
+        document access
+
+        Args:
+            properties (list or dict): This will be ignored by the S3
+                Store
+            criteria (dict): filter for query, matches documents
+                against key-value pairs
+            **kwargs (kwargs): further kwargs to Collection.find
+        """
+        f = self.index.query_one(criteria=criteria, **kwargs)
+        if f:
+            try:
+                data = self.s3_bucket.Object(f[self.key]).get()
+            except botocore.exceptions.ClientError as e:
+                # If a client error is thrown, then check that it was a 404 error.
+                # If it was a 404 error, then the object does not exist.
+                error_code = int(e.response["Error"]["Code"])
+                if error_code == 404:
+                    self.logger.error("Could not find S3 object {}".format(f[self.key]))
+                    return None
+
+            if f.get("compression", "") != "zlib":
+                data = zlib.decompress(data)
+
+            return json.loads(data)
+        else:
+            return None
+
+    def distinct(self, key, criteria=None, all_exist=False, **kwargs):
+        """
+        Function get to get all distinct values of a certain key in the
+        AmazonS3 Store. This searches the index collection for this data
+
+        Args:
+            key (mongolike key or list of mongolike keys): key or keys
+                for which to find distinct values or sets of values.
+            criteria (filter criteria): criteria for filter
+            all_exist (bool): whether to ensure all keys in list exist
+                in each document, defaults to False
+            **kwargs (kwargs): kwargs corresponding to collection.distinct
+        """
+        # Index is a store so it should have its own distinct function
+        return self.index.distinct(key, filter=criteria, **kwargs)
+
+    def groupby(self, keys, criteria=None, properties=None, **kwargs):
+        """
+        Simple grouping function that will group documents
+        by keys. Only searches the index collection
+
+        Args:
+            keys (list or string): fields to group documents
+            criteria (dict): filter for documents to group
+            properties (list): properties to return in grouped documents
+            allow_disk_use (bool): whether to allow disk use in aggregation
+
+        Returns:
+            command cursor corresponding to grouped documents
+
+            elements of the command cursor have the structure:
+            {'_id': {"KEY_1": value_1, "KEY_2": value_2 ...,
+             'docs': [list_of_documents corresponding to key values]}
+
+        """
+        self.index.groupby(keys, properties, criteria, **kwargs)
+
+    def ensure_index(self, key, unique=False):
+        """
+        Wrapper for pymongo.Collection.ensure_index for the files collection
+        """
+        return self.index.ensure_index(key, unique=unique, background=True)
+
+    def update(self, docs, update_lu=True, key=None, compress=False):
+        """
+        Function to update associated MongoStore collection.
+
+        Args:
+            docs ([dict]): list of documents
+            key ([str] or str): keys to use to build search doc
+            compress (bool): compress the document or not
+        """
+        now = datetime.now()
+        search_docs = []
+        for d in docs:
+            if isinstance(key, list):
+                search_doc = {k: d[k] for k in key}
+            elif key:
+                search_doc = {key: d[key]}
+            else:
+                search_doc = {}
+
+            # Always include our main key
+            search_doc[self.key] = d[self.key]
+
+            # Remove MongoDB _id from search
+            if "_id" in search_doc:
+                del search_doc["_id"]
+
+            # Add a timestamp
+            if update_lu:
+                search_doc[self.lu_field] = now
+                d[self.lu_field] = now
+
+            data = json.dumps(jsanitize(d)).encode()
+
+            # Compress with zlib if chosen
+            if compress:
+                search_doc["compression"] = "zlib"
+                data = zlib.compress(data)
+
+            self.s3_bucket.put_object(Key=d[self.key], Body=data, Metadata=search_doc)
+            search_docs.append(search_doc)
+
+        # Use store's update to remove key clashes
+        self.index.update(search_docs)
+
+    @property
+    def last_updated(self):
+        return self.index.last_updated
+
+    def lu_filter(self, targets):
+        """Creates a MongoDB filter for new documents.
+
+        By "new", we mean documents in this Store that were last updated later
+        than any document in targets.
+
+        Args:
+            targets (list): A list of Stores
+
+        """
+        self.index.lu_filter(targets)
+
+    def __hash__(self):
+        return hash((self.index.__hash__, self.bucket))
+
+    def rebuild_index_from_s3_data(self):
+        """
+        Rebuilds the index Store from the data in S3
+        Relies on the index document being stores as the metadata for the file
+        """
+        index_docs = []
+        for file in self.s3_bucket.objects.all():
+            # TODO: Transform the data back from strings and remove AWS S3 specific keys
+            index_docs.append(file.metadata)
+
+        self.index.update(index_docs)
diff --git a/maggma/stores/gridfs.py b/maggma/stores/gridfs.py
new file mode 100644
index 000000000..02a19259f
--- /dev/null
+++ b/maggma/stores/gridfs.py
@@ -0,0 +1,307 @@
+# coding: utf-8
+"""
+Module containing various definitions of Stores.
+Stores are a default access pattern to data and provide
+various utillities
+"""
+from __future__ import annotations
+import copy
+from datetime import datetime
+import json
+import zlib
+import pymongo
+import gridfs
+
+from pymongo import MongoClient
+from monty.json import jsanitize
+from maggma.utils import confirm_field_index
+from maggma.core import Store
+
+
+class GridFSStore(Store):
+    """
+    A Store for GrdiFS backend. Provides a common access method consistent with other stores
+    """
+
+    # https://github.com/mongodb/specifications/
+    #   blob/master/source/gridfs/gridfs-spec.rst#terms
+    #   (Under "Files collection document")
+    files_collection_fields = (
+        "_id",
+        "length",
+        "chunkSize",
+        "uploadDate",
+        "md5",
+        "filename",
+        "contentType",
+        "aliases",
+        "metadata",
+    )
+
+    def __init__(
+        self,
+        database: str,
+        collection_name: str,
+        host: str = "localhost",
+        port: int = 27017,
+        username: str = "",
+        password: str = "",
+        compression: bool = False,
+        **kwargs,
+    ):
+        """
+        Initializes a GrdiFS Store for binary data
+        Args:
+            database: database name
+            collection_name: The name of the collection.
+                This is the string portion before the GridFS extensions
+            host: hostname for the database
+            port: port to connec to
+            username: username to connect as
+            password: password to authenticate as
+        """
+
+        self.database = database
+        self.collection_name = collection_name
+        self.host = host
+        self.port = port
+        self.username = username
+        self.password = password
+        self._collection = None
+        self.compression = compression
+        self.kwargs = kwargs
+        self.meta_keys = set()
+
+        if "key" not in kwargs:
+            kwargs["key"] = "_id"
+
+        kwargs["last_updated_field"] = "uploadDate"
+
+        super().__init__(**kwargs)
+
+    def connect(self, force_reset=False):
+        conn = MongoClient(self.host, self.port)
+        if not self._collection or force_reset:
+            db = conn[self.database]
+            if self.username != "":
+                db.authenticate(self.username, self.password)
+
+            self._collection = gridfs.GridFS(db, self.collection_name)
+            self._files_collection = db["{}.files".format(self.collection_name)]
+            self._chunks_collection = db["{}.chunks".format(self.collection_name)]
+
+    @property
+    def collection(self):
+        # TODO: Should this return the real MongoCollection or the GridFS
+        return self._collection
+
+    @property
+    def last_updated(self):
+        doc = next(
+            self._files_collection.find(projection=[self.last_updated_field])
+            .sort([(self.last_updated_field, pymongo.DESCENDING)])
+            .limit(1),
+            None,
+        )
+        if doc and self.last_updated_field not in doc:
+            raise StoreError(
+                "No field '{}' in store document. Please ensure Store.last_updated_field "
+                "is a datetime field in your store that represents the time of "
+                "last update to each document.".format(self.last_updated_field)
+            )
+        # Handle when collection has docs but `NoneType` last_updated_field.
+        return (
+            self._lu_func[0](doc[self.last_updated_field])
+            if (doc and doc[self.last_updated_field])
+            else datetime.min
+        )
+
+    @classmethod
+    def transform_criteria(cls, criteria):
+        """
+        Allow client to not need to prepend 'metadata.' to query fields.
+        Args:
+            criteria (dict): Query criteria
+        """
+        for field in criteria:
+            if field not in cls.files_collection_fields and not field.startswith(
+                "metadata."
+            ):
+                criteria["metadata." + field] = copy.copy(criteria[field])
+                del criteria[field]
+
+    def query(self, criteria=None, properties=None, **kwargs):
+        """
+        Function that gets data from GridFS. This store ignores all
+        property projections as its designed for whole document access
+
+        Args:
+            criteria (dict): filter for query, matches documents
+                against key-value pairs
+            properties (list or dict): This will be ignored by the GridFS
+                Store
+            **kwargs (kwargs): further kwargs to Collection.find
+        """
+        if isinstance(criteria, dict):
+            self.transform_criteria(criteria)
+        for f in self.collection.find(filter=criteria, **kwargs):
+            data = f.read()
+
+            metadata = f.metadata
+            if metadata.get("compression", "") == "zlib":
+                data = zlib.decompress(data).decode("UTF-8")
+
+            try:
+                data = json.loads(data)
+            except Exception:
+                pass
+            yield data
+
+    def distinct(self, key, criteria=None, all_exist=False, **kwargs):
+        """
+        Function get to get all distinct values of a certain key in
+        a mongolike store.  May take a single key or a list of keys
+
+        Args:
+            key (mongolike key or list of mongolike keys): key or keys
+                for which to find distinct values or sets of values.
+            criteria (filter criteria): criteria for filter
+            all_exist (bool): whether to ensure all keys in list exist
+                in each document, defaults to False
+            **kwargs (kwargs): kwargs corresponding to collection.distinct
+        """
+        if isinstance(key, list):
+            criteria = criteria if criteria else {}
+            # Update to ensure keys are there
+            if all_exist:
+                criteria.update(
+                    {k: {"$exists": True} for k in key if k not in criteria}
+                )
+
+            results = []
+            for d in self.groupby(key, properties=key, criteria=criteria):
+                results.append(d["_id"])
+            return results
+
+        else:
+            if criteria:
+                self.transform_criteria(criteria)
+            # Transfor to metadata subfield if not supposed to be in gridfs main fields
+            if key not in self.files_collection_fields:
+                key = "metadata.{}".format(key)
+
+            return self._files_collection.distinct(key, filter=criteria, **kwargs)
+
+    def groupby(
+        self, keys, criteria=None, properties=None, allow_disk_use=True, **kwargs
+    ):
+        """
+        Simple grouping function that will group documents
+        by keys.
+
+        Args:
+            keys (list or string): fields to group documents
+            criteria (dict): filter for documents to group
+            properties (list): properties to return in grouped documents
+            allow_disk_use (bool): whether to allow disk use in aggregation
+
+        Returns:
+            command cursor corresponding to grouped documents
+
+            elements of the command cursor have the structure:
+            {'_id': {"KEY_1": value_1, "KEY_2": value_2 ...,
+             'docs': [list_of_documents corresponding to key values]}
+
+        """
+        pipeline = []
+        if criteria is not None:
+            self.transform_criteria(criteria)
+            pipeline.append({"$match": criteria})
+
+        if properties is not None:
+            properties = [
+                p if p in self.files_collection_fields else "metadata.{}".format(p)
+                for p in properties
+            ]
+            pipeline.append({"$project": {p: 1 for p in properties}})
+
+        if isinstance(keys, str):
+            keys = [keys]
+
+        # ensure propper naming for keys in and outside of metadata
+        keys = [
+            k if k in self.files_collection_fields else "metadata.{}".format(k)
+            for k in keys
+        ]
+
+        group_id = {key: "${}".format(key) for key in keys}
+        pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
+
+        return self.collection.aggregate(pipeline, allowDiskUse=allow_disk_use)
+
+    def ensure_index(self, key, unique=False):
+        """
+        Wrapper for pymongo.Collection.ensure_index for the files collection
+        """
+        # Transform key for gridfs first
+        if key not in self.files_collection_fields:
+            key = "metadata.{}".format(key)
+
+        if confirm_field_index(self.collection, key):
+            return True
+        else:
+            try:
+                self.collection.create_index(key, unique=unique, background=True)
+                return True
+            except Exception:
+                return False
+
+    def update(self, docs, update_lu=True, key=None):
+        """
+        Function to update associated MongoStore collection.
+
+        Args:
+            docs ([dict]): list of documents
+            update_lu (bool) : Updat the last_updated field or not
+            key (list or str): list or str of important parameters
+        """
+        if isinstance(key, str):
+            key = [key]
+        elif not key:
+            key = [self.key]
+
+        key = list(set(key) | self.meta_keys - set(self.files_collection_fields))
+
+        for d in docs:
+
+            search_doc = {k: d[k] for k in key}
+            if update_lu:
+                d[self.last_updated_field] = datetime.utcnow()
+
+            metadata = {self.last_updated_field: d[self.last_updated_field]}
+            metadata.update(search_doc)
+
+            data = json.dumps(jsanitize(d)).encode("UTF-8")
+            if self.compression:
+                data = zlib.compress(data)
+                metadata["compression"] = "zlib"
+
+            self.collection.put(data, metadata=metadata)
+            self.transform_criteria(search_doc)
+
+            # Cleans up old gridfs entries
+            for fdoc in (
+                self._files_collection.find(search_doc, ["_id"])
+                .sort("uploadDate", -1)
+                .skip(1)
+            ):
+                self.collection.delete(fdoc["_id"])
+
+    def close(self):
+        self.collection.database.client.close()
+
+
+class StoreError(Exception):
+    """General Store-related error."""
+
+    pass
diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
new file mode 100644
index 000000000..e174fe306
--- /dev/null
+++ b/maggma/stores/mongolike.py
@@ -0,0 +1,388 @@
+# coding: utf-8
+"""
+Module containing various definitions of Stores.
+Stores are a default access pattern to data and provide
+various utillities
+"""
+from __future__ import annotations
+
+import json
+
+from typing import Union, Optional, Dict, List, Iterator, Tuple
+
+import mongomock
+
+from itertools import groupby
+from operator import itemgetter
+from pymongo import MongoClient
+from pydash import set_
+
+from pymongo import ReplaceOne
+
+from monty.json import jsanitize
+from monty.io import zopen
+from monty.serialization import loadfn
+from monty.dev import deprecated
+from maggma.utils import confirm_field_index
+
+from maggma.core import Store, Sort, StoreError
+
+
+class MongoStore(Store):
+    """
+    A Store that connects to a Mongo collection
+    """
+
+    def __init__(
+        self,
+        database: str,
+        collection_name: str,
+        host: str = "localhost",
+        port: int = 27017,
+        username: str = "",
+        password: str = "",
+        **kwargs,
+    ):
+        """
+        Args:
+            database: The database name
+            collection: The collection name
+            host: Hostname for the database
+            port: TCP port to connect to
+            username: Username for the collection
+            password: Password to connect with
+        """
+        self.database = database
+        self.collection_name = collection_name
+        self.host = host
+        self.port = port
+        self.username = username
+        self.password = password
+        self._collection = None
+        self.kwargs = kwargs
+        super().__init__(**kwargs)
+
+    def connect(self, force_reset: bool = False):
+        """
+        Connect to the source data
+        """
+        if not self._collection or force_reset:
+            conn = MongoClient(self.host, self.port)
+            db = conn[self.database]
+            if self.username != "":
+                db.authenticate(self.username, self.password)
+            self._collection = db[self.collection_name]
+
+    def __hash__(self):
+        return hash((self.database, self.collection_name, self.last_updated_field))
+
+    @classmethod
+    def from_db_file(cls, filename: str):
+        """
+        Convenience method to construct MongoStore from db_file
+        from old QueryEngine format
+        """
+        kwargs = loadfn(filename)
+        if "collection" in kwargs:
+            kwargs["collection_name"] = kwargs.pop("collection")
+        # Get rid of aliases from traditional query engine db docs
+        kwargs.pop("aliases", None)
+        return cls(**kwargs)
+
+    def groupby(
+        self,
+        keys: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Tuple[Dict, List[Dict]]]:
+        """
+        Simple grouping function that will group documents
+        by keys.
+
+        Args:
+            keys: fields to group documents
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
+
+        Returns:
+            generator returning tuples of (key, list of docs)
+        """
+        pipeline = []
+        if criteria is not None:
+            pipeline.append({"$match": criteria})
+
+        if properties is not None:
+            pipeline.append({"$project": {p: 1 for p in properties}})
+
+        if isinstance(keys, str):
+            keys = [keys]
+
+        group_id = {}
+        for key in keys:
+            set_(group_id, key, "${}".format(key))
+        pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
+
+        for d in self.collection.aggregate(pipeline, allowDiskUse=True):
+            yield (d["_id"], d["docs"])
+
+    @classmethod
+    def from_collection(cls, collection):
+        """
+        Generates a MongoStore from a pymongo collection object
+        This is not a fully safe operation as it gives dummy information to the MongoStore
+        As a result, this will not serialize and can not reset its connection
+        """
+        # TODO: How do we make this safer?
+        coll_name = collection.name
+        db_name = collection.database.name
+
+        store = cls(db_name, coll_name)
+        store._collection = collection
+        return store
+
+    @property
+    @deprecated(message="This will be removed in the future")
+    def collection(self):
+        if self._collection is None:
+            raise StoreError("Must connect Mongo-like store before attemping to use it")
+        return self._collection
+
+    def query(
+        self,
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Dict]:
+        """
+        Queries the Store for a set of documents
+
+        Args:
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
+        """
+        if isinstance(properties, list):
+            properties = {p: 1 for p in properties}
+        for d in self.collection.find(
+            filter=criteria, projection=properties, skip=skip, limit=limit
+        ):
+            yield d
+
+    def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
+        """
+        Tries to create an index and return true if it suceeded
+        Args:
+            key: single key to index
+            unique: Whether or not this index contains only unique keys
+
+        Returns:
+            bool indicating if the index exists/was created
+        """
+
+        if confirm_field_index(self.collection, key):
+            return True
+        else:
+            try:
+                self.collection.create_index(key, unique=unique, background=True)
+                return True
+            except Exception:
+                return False
+
+    def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
+        """
+        Update documents into the Store
+
+        Args:
+            docs: the document or list of documents to update
+            key: field name(s) to determine uniqueness for a
+                 document, can be a list of multiple fields,
+                 a single field, or None if the Store's key
+                 field is to be used
+        """
+
+        requests = []
+
+        if not isinstance(docs, list):
+            docs = [docs]
+
+        for d in docs:
+
+            d = jsanitize(d, allow_bson=True)
+
+            # document-level validation is optional
+            validates = True
+            if self.validator:
+                validates = self.validator.is_valid(d)
+                if not validates:
+                    if self.validator.strict:
+                        raise ValueError(self.validator.validation_errors(d))
+                    else:
+                        self.logger.error(self.validator.validation_errors(d))
+
+            if validates:
+                key = key or self.key
+                if isinstance(key, list):
+                    search_doc = {k: d[k] for k in key}
+                else:
+                    search_doc = {key: d[key]}
+
+                requests.append(ReplaceOne(search_doc, d, upsert=True))
+
+        self.collection.bulk_write(requests, ordered=False)
+
+    def close(self):
+        self.collection.database.client.close()
+
+
+class MemoryStore(MongoStore):
+    """
+    An in-memory Store that functions similarly
+    to a MongoStore
+    """
+
+    def __init__(self, name: str = "memory_db", **kwargs):
+        self.name = name
+        self._collection = None
+        self.kwargs = kwargs
+        super().__init__(**kwargs)
+
+    def connect(self, force_reset: bool = False):
+        """
+        Connect to the source data
+        """
+        if not self._collection or force_reset:
+            self._collection = mongomock.MongoClient().db[self.name]
+
+    def __hash__(self):
+        return hash((self.name, self.last_updated_field))
+
+    def groupby(
+        self,
+        keys: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Tuple[Dict, List[Dict]]]:
+        """
+        Simple grouping function that will group documents
+        by keys.
+
+        Args:
+            keys: fields to group documents
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
+
+        Returns:
+            generator returning tuples of (key, list of elemnts)
+        """
+        keys = keys if isinstance(keys, list) else [keys]
+
+        input_data = list(self.query(properties=keys, criteria=criteria))
+
+        if len(keys) > 1:
+            grouper = itemgetter(*keys)
+            for key, grp in groupby(sorted(input_data, key=grouper), grouper):
+                temp_dict = {"_id": zip(keys, key), "docs": list(grp)}
+                yield temp_dict
+        else:
+            grouper = itemgetter(*keys)
+            for key, group in groupby(sorted(input_data, key=grouper), grouper):
+                yield (key, list(group))
+
+    def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
+        """
+        Update documents into the Store
+
+        Args:
+            docs: the document or list of documents to update
+            key: field name(s) to determine uniqueness for a
+                 document, can be a list of multiple fields,
+                 a single field, or None if the Store's key
+                 field is to be used
+        """
+
+        for d in docs:
+
+            d = jsanitize(d, allow_bson=True)
+
+            # document-level validation is optional
+            validates = True
+            if self.validator:
+                validates = self.validator.is_valid(d)
+                if not validates:
+                    if self.validator.strict:
+                        raise ValueError(self.validator.validation_errors(d))
+                    else:
+                        self.logger.error(self.validator.validation_errors(d))
+
+            if validates:
+                if isinstance(key, list):
+                    search_doc = {k: d[k] for k in key}
+                elif key:
+                    search_doc = {key: d[key]}
+                else:
+                    search_doc = {self.key: d[self.key]}
+
+                self.collection.update_one(d, criteria=search_doc)
+
+
+class JSONStore(MemoryStore):
+    """
+    A Store for access to a single or multiple JSON files
+    """
+
+    def __init__(self, paths, **kwargs):
+        """
+        Args:
+            paths (str or list): paths for json files to
+                turn into a Store
+        """
+        paths = paths if isinstance(paths, (list, tuple)) else [paths]
+        self.paths = paths
+        self.kwargs = kwargs
+        super().__init__("collection", **kwargs)
+
+    def connect(self, force_reset=False):
+        super().connect(force_reset=force_reset)
+        for path in self.paths:
+            with zopen(path) as f:
+                data = f.read()
+                data = data.decode() if isinstance(data, bytes) else data
+                objects = json.loads(data)
+                objects = [objects] if not isinstance(objects, list) else objects
+                self.update(objects)
+
+    def __hash__(self):
+        return hash((*self.paths, self.last_updated_field))
+
+
+class DatetimeStore(MemoryStore):
+    """Utility store intended for use with `Store.lu_filter`."""
+
+    def __init__(self, dt, **kwargs):
+        """
+        Args:
+            dt (Datetime): Datetime to set
+        """
+        self.__dt = dt
+        self.kwargs = kwargs
+        super().__init__("date", **kwargs)
+
+    def connect(self, force_reset=False):
+        super().connect(force_reset)
+        self.collection.insert_one({self.last_updated_field: self.__dt})

From e3fea5b84c2b1b72ad1b0e2b93536a0551fb3c55 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 5 Nov 2019 15:29:50 -0800
Subject: [PATCH 04/99] finish core refactoring

---
 maggma/advanced_stores.py              | 854 ------------------------
 maggma/builder.py                      |   9 -
 maggma/builders.py                     | 123 +---
 maggma/stores.py                       | 882 -------------------------
 maggma/{validator.py => validators.py} |  39 +-
 5 files changed, 9 insertions(+), 1898 deletions(-)
 delete mode 100644 maggma/advanced_stores.py
 delete mode 100644 maggma/builder.py
 delete mode 100644 maggma/stores.py
 rename maggma/{validator.py => validators.py} (75%)

diff --git a/maggma/advanced_stores.py b/maggma/advanced_stores.py
deleted file mode 100644
index 767d7ccb8..000000000
--- a/maggma/advanced_stores.py
+++ /dev/null
@@ -1,854 +0,0 @@
-# coding: utf-8
-"""
-Advanced Stores for behavior outside normal access patterns
-"""
-import os
-import hvac
-import json
-import zlib
-from datetime import datetime
-from itertools import groupby
-
-from pydash import get, set_
-from maggma.stores import Store, MongoStore, StoreError, Mongolike
-from maggma.utils import lazy_substitute, substitute
-from mongogrant import Client
-from mongogrant.client import check
-from mongogrant.config import Config
-from monty.json import jsanitize
-from monty.functools import lru_cache
-from pymongo import MongoClient
-
-try:
-    import boto3
-    import botocore
-
-    boto_import = True
-except ImportError:
-    boto_import = False
-
-
-class MongograntStore(Mongolike, Store):
-    """Initialize a Store with a mongogrant "<role>:<host>/<db>." spec.
-
-    This class does not subclass MongoStore, though it aims to reproduce
-    relevant functionality through method delegation, e.g. groupby.
-
-    It does not subclass MongoStore because some class methods of
-    MongoStore, e.g. from_db_file and from_collection, are not supported.
-
-    mongogrant documentation: https://github.com/materialsproject/mongogrant
-    """
-
-    def __init__(
-        self, mongogrant_spec, collection_name, mgclient_config_path=None, **kwargs
-    ):
-        """
-
-        Args:
-            mongogrant_spec (str): of the form <role>:<host>/<db>, where
-                role is one of {"read", "readWrite"} or aliases {"ro", "rw"};
-                host is a db host (w/ optional port) or alias; and db is a db
-                on that host, or alias. See mongogrant documentation.
-            collection_name (str): name of mongo collection
-            mgclient_config_path (str): Path to mongogrant client config file,
-               or None if default path (`mongogrant.client.path`).
-        """
-        self.mongogrant_spec = mongogrant_spec
-        self.collection_name = collection_name
-        self.mgclient_config_path = mgclient_config_path
-        self._collection = None
-        if set(("username", "password", "database", "host")) & set(kwargs):
-            raise StoreError(
-                "MongograntStore does not accept "
-                "username, password, database, or host "
-                "arguments. Use `mongogrant_spec`."
-            )
-        self.kwargs = kwargs
-        super().__init__(**kwargs)
-
-    def connect(self, force_reset=False):
-        if not self._collection or force_reset:
-            if self.mgclient_config_path:
-                config = Config(check=check, path=self.mgclient_config_path)
-                client = Client(config)
-            else:
-                client = Client()
-            db = client.db(self.mongogrant_spec)
-            self._collection = db[self.collection_name]
-
-    def __hash__(self):
-        return hash((self.mongogrant_spec, self.collection_name, self.lu_field))
-
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
-        return MongoStore.groupby(self, keys, criteria=None, properties=None, **kwargs)
-
-
-class VaultStore(MongoStore):
-    """
-    Extends MongoStore to read credentials out of Vault server
-    and uses these values to initialize MongoStore instance
-    """
-
-    def __init__(self, collection_name, vault_secret_path):
-        """
-        collection (string): name of mongo collection
-        vault_secret_path (string): path on vault server with mongo creds object
-
-        Environment (must be set prior to invocation):
-        VAULT_ADDR - URL of vault server (eg. https://matgen8.lbl.gov:8200)
-        VAULT_TOKEN or GITHUB_TOKEN - token used to authenticate to vault
-        """
-        vault_addr = os.getenv("VAULT_ADDR")
-
-        if not vault_addr:
-            raise RuntimeError("VAULT_ADDR not set")
-
-        client = hvac.Client(vault_addr)
-
-        # If we have a vault token use this
-        token = os.getenv("VAULT_TOKEN")
-
-        # Look for a github token instead
-        if not token:
-            github_token = os.getenv("GITHUB_TOKEN")
-
-            if github_token:
-                client.auth_github(github_token)
-            else:
-                raise RuntimeError("VAULT_TOKEN or GITHUB_TOKEN not set")
-        else:
-            client.token = token
-            if not client.is_authenticated():
-                raise RuntimeError("Bad token")
-
-        # Read the vault secret
-        json_db_creds = client.read(vault_secret_path)
-        db_creds = json.loads(json_db_creds["data"]["value"])
-
-        database = db_creds.get("db")
-        host = db_creds.get("host", "localhost")
-        port = db_creds.get("port", 27017)
-        username = db_creds.get("username", "")
-        password = db_creds.get("password", "")
-
-        super(VaultStore, self).__init__(
-            database, collection_name, host, port, username, password
-        )
-
-
-class AliasingStore(Store):
-    """
-    Special Store that aliases for the primary accessors
-    """
-
-    def __init__(self, store, aliases, **kwargs):
-        """
-        store (Store): the store to wrap around
-        aliases (dict): dict of aliases of the form external key: internal key
-        """
-        self.store = store
-        # Given an external key tells what the internal key is
-        self.aliases = aliases
-        # Given the internal key tells us what the external key is
-        self.reverse_aliases = {v: k for k, v in aliases.items()}
-        self.kwargs = kwargs
-
-        kwargs.update({"lu_field": store.lu_field, "lu_type": store.lu_type})
-        super(AliasingStore, self).__init__(**kwargs)
-
-    def query(self, criteria=None, properties=None, **kwargs):
-
-        if isinstance(properties, list):
-            properties = {p: 1 for p in properties}
-
-        criteria = criteria if criteria else {}
-        substitute(properties, self.reverse_aliases)
-        lazy_substitute(criteria, self.reverse_aliases)
-        for d in self.store.query(properties=properties, criteria=criteria, **kwargs):
-            substitute(d, self.aliases)
-            yield d
-
-    def query_one(self, criteria=None, properties=None, **kwargs):
-
-        if isinstance(properties, list):
-            properties = {p: 1 for p in properties}
-
-        criteria = criteria if criteria else {}
-        substitute(properties, self.reverse_aliases)
-        lazy_substitute(criteria, self.reverse_aliases)
-        d = self.store.query_one(properties=properties, criteria=criteria, **kwargs)
-        substitute(d, self.aliases)
-        return d
-
-    def distinct(self, key, criteria=None, all_exist=True, **kwargs):
-        if isinstance(key, list):
-            criteria = criteria if criteria else {}
-            # Update to ensure keys are there
-            if all_exist:
-                criteria.update(
-                    {k: {"$exists": True} for k in key if k not in criteria}
-                )
-
-            results = []
-            for d in self.groupby(key, properties=key, criteria=criteria):
-                results.append(d["_id"])
-            return results
-
-        else:
-            criteria = criteria if criteria else {}
-            lazy_substitute(criteria, self.reverse_aliases)
-            key = self.aliases[key] if key in self.aliases else key
-            return self.collection.distinct(key, filter=criteria, **kwargs)
-
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
-        # Convert to a list
-        keys = keys if isinstance(keys, list) else [keys]
-
-        # Make the aliasing transformations on keys
-        keys = [self.aliases[k] if k in self.aliases else k for k in keys]
-
-        # Update criteria and properties based on aliases
-        criteria = criteria if criteria else {}
-        substitute(properties, self.reverse_aliases)
-        lazy_substitute(criteria, self.reverse_aliases)
-
-        return self.store.groupby(
-            keys=keys, properties=properties, criteria=criteria, **kwargs
-        )
-
-    def update(self, docs, update_lu=True, key=None):
-        key = key if key else self.key
-
-        for d in docs:
-            substitute(d, self.reverse_aliases)
-
-        if key in self.aliases:
-            key = self.aliases[key]
-
-        self.store.update(docs, update_lu=update_lu, key=key)
-
-    def ensure_index(self, key, unique=False, **kwargs):
-        if key in self.aliases:
-            key = self.aliases
-        return self.store.ensure_index(key, unique, **kwargs)
-
-    def close(self):
-        self.store.close()
-
-    @property
-    def collection(self):
-        return self.store.collection
-
-    def connect(self, force_reset=False):
-        self.store.connect(force_reset=force_reset)
-
-
-class SandboxStore(Store):
-    """
-    Provides a sandboxed view to another store
-    """
-
-    def __init__(self, store, sandbox, exclusive=False):
-        """
-        store (Store): store to wrap sandboxing around
-        sandbox (string): the corresponding sandbox
-        exclusive (bool): whether to be exclusively in this sandbox or include global items
-        """
-        self.store = store
-        self.sandbox = sandbox
-        self.exclusive = exclusive
-        super().__init__(
-            key=self.store.key,
-            lu_field=self.store.lu_field,
-            lu_type=self.store.lu_type,
-            validator=self.store.validator,
-        )
-
-    @property
-    def sbx_criteria(self):
-        if self.exclusive:
-            return {"sbxn": self.sandbox}
-        else:
-            return {
-                "$or": [{"sbxn": {"$in": [self.sandbox]}}, {"sbxn": {"$exists": False}}]
-            }
-
-    def query(self, criteria=None, properties=None, **kwargs):
-        criteria = (
-            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
-        )
-        return self.store.query(properties=properties, criteria=criteria, **kwargs)
-
-    def query_one(self, criteria=None, properties=None, **kwargs):
-        criteria = (
-            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
-        )
-        return self.store.query_one(properties=properties, criteria=criteria, **kwargs)
-
-    def distinct(self, key, criteria=None, **kwargs):
-        criteria = (
-            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
-        )
-        return self.store.distinct(key=key, criteria=criteria, **kwargs)
-
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
-        criteria = (
-            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
-        )
-
-        return self.store.groupby(
-            keys=keys, properties=properties, criteria=criteria, **kwargs
-        )
-
-    def update(self, docs, update_lu=True, key=None):
-        for d in docs:
-            if "sbxn" in d:
-                d["sbxn"] = list(set(d["sbxn"] + [self.sandbox]))
-            else:
-                d["sbxn"] = [self.sandbox]
-
-        self.store.update(docs, update_lu=update_lu, key=key)
-
-    def ensure_index(self, key, unique=False, **kwargs):
-        return self.store.ensure_index(key, unique, **kwargs)
-
-    def close(self):
-        self.store.close()
-
-    @property
-    def collection(self):
-        return self.store.collection
-
-    def connect(self, force_reset=False):
-        self.store.connect(force_reset=force_reset)
-
-
-class AmazonS3Store(Store):
-    """
-    GridFS like storage using Amazon S3 and a regular store for indexing
-    Assumes Amazon AWS key and secret key are set in environment or default config file
-    """
-
-    def __init__(self, index, bucket, **kwargs):
-        """
-        Initializes an S3 Store
-        Args:
-            index (Store): a store to use to index the S3 Bucket
-            bucket (str) : name of the bucket
-        """
-        if not boto_import:
-            raise ValueError(
-                "boto not available, please install boto3 to " "use AmazonS3Store"
-            )
-        self.index = index
-        self.bucket = bucket
-        self.s3 = None
-        self.s3_bucket = None
-        # Force the key to be the same as the index
-        kwargs["key"] = index.key
-        super(AmazonS3Store, self).__init__(**kwargs)
-
-    def connect(self, force_reset=False):
-        self.index.connect(force_reset=force_reset)
-        if not self.s3:
-            self.s3 = boto3.resource("s3")
-            # TODO: Provide configuration variable to create bucket if not present
-            if self.bucket not in self.s3.list_buckets():
-                raise Exception("Bucket not present on AWS: {}".format(self.bucket))
-            self.s3_bucket = self.s3.Bucket(self.bucket)
-
-    def close(self):
-        self.index.close()
-
-    @property
-    def collection(self):
-        # For now returns the index collection since that is what we would "search" on
-        return self.index
-
-    def query(self, criteria=None, properties=None, **kwargs):
-        """
-        Function that gets data from Amazon S3. This store ignores all
-        property projections as its designed for whole document access
-
-        Args:
-            properties (list or dict): This will be ignored by the S3
-                Store
-            criteria (dict): filter for query, matches documents
-                against key-value pairs
-            **kwargs (kwargs): further kwargs to Collection.find
-        """
-        for f in self.index.query(criteria=criteria, **kwargs):
-            try:
-                data = self.s3_bucket.Object(f[self.key]).get()
-            except botocore.exceptions.ClientError as e:
-                # If a client error is thrown, then check that it was a 404 error.
-                # If it was a 404 error, then the object does not exist.
-                error_code = int(e.response["Error"]["Code"])
-                if error_code == 404:
-                    self.logger.error("Could not find S3 object {}".format(f[self.key]))
-                    break
-
-            if f.get("compression", "") is "zlib":
-                data = zlib.decompress(data)
-
-            yield json.loads(data)
-
-    def query_one(self, criteria=None, properties=None, **kwargs):
-        """
-        Function that gets a single document from Amazon S3. This store
-        ignores all property projections as its designed for whole
-        document access
-
-        Args:
-            properties (list or dict): This will be ignored by the S3
-                Store
-            criteria (dict): filter for query, matches documents
-                against key-value pairs
-            **kwargs (kwargs): further kwargs to Collection.find
-        """
-        f = self.index.query_one(criteria=criteria, **kwargs)
-        if f:
-            try:
-                data = self.s3_bucket.Object(f[self.key]).get()
-            except botocore.exceptions.ClientError as e:
-                # If a client error is thrown, then check that it was a 404 error.
-                # If it was a 404 error, then the object does not exist.
-                error_code = int(e.response["Error"]["Code"])
-                if error_code == 404:
-                    self.logger.error("Could not find S3 object {}".format(f[self.key]))
-                    return None
-
-            if f.get("compression", "") is "zlib":
-                data = zlib.decompress(data)
-
-            return json.loads(data)
-        else:
-            return None
-
-    def distinct(self, key, criteria=None, all_exist=False, **kwargs):
-        """
-        Function get to get all distinct values of a certain key in the
-        AmazonS3 Store. This searches the index collection for this data
-
-        Args:
-            key (mongolike key or list of mongolike keys): key or keys
-                for which to find distinct values or sets of values.
-            criteria (filter criteria): criteria for filter
-            all_exist (bool): whether to ensure all keys in list exist
-                in each document, defaults to False
-            **kwargs (kwargs): kwargs corresponding to collection.distinct
-        """
-        # Index is a store so it should have its own distinct function
-        return self.index.distinct(key, filter=criteria, **kwargs)
-
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
-        """
-        Simple grouping function that will group documents
-        by keys. Only searches the index collection
-
-        Args:
-            keys (list or string): fields to group documents
-            criteria (dict): filter for documents to group
-            properties (list): properties to return in grouped documents
-            allow_disk_use (bool): whether to allow disk use in aggregation
-
-        Returns:
-            command cursor corresponding to grouped documents
-
-            elements of the command cursor have the structure:
-            {'_id': {"KEY_1": value_1, "KEY_2": value_2 ...,
-             'docs': [list_of_documents corresponding to key values]}
-
-        """
-        self.index.groupby(keys, properties, criteria, **kwargs)
-
-    def ensure_index(self, key, unique=False):
-        """
-        Wrapper for pymongo.Collection.ensure_index for the files collection
-        """
-        return self.index.ensure_index(key, unique=unique, background=True)
-
-    def update(self, docs, update_lu=True, key=None, compress=False):
-        """
-        Function to update associated MongoStore collection.
-
-        Args:
-            docs ([dict]): list of documents
-            key ([str] or str): keys to use to build search doc
-            compress (bool): compress the document or not
-        """
-        now = datetime.now()
-        search_docs = []
-        for d in docs:
-            if isinstance(key, list):
-                search_doc = {k: d[k] for k in key}
-            elif key:
-                search_doc = {key: d[key]}
-            else:
-                search_doc = {}
-
-            # Always include our main key
-            search_doc[self.key] = d[self.key]
-
-            # Remove MongoDB _id from search
-            if "_id" in search_doc:
-                del search_doc["_id"]
-
-            # Add a timestamp
-            if update_lu:
-                search_doc[self.lu_field] = now
-                d[self.lu_field] = now
-
-            data = json.dumps(jsanitize(d)).encode()
-
-            # Compress with zlib if chosen
-            if compress:
-                search_doc["compression"] = "zlib"
-                data = zlib.compress(data)
-
-            self.s3_bucket.put_object(Key=d[self.key], Body=data, Metadata=search_doc)
-            search_docs.append(search_doc)
-
-        # Use store's update to remove key clashes
-        self.index.update(search_docs)
-
-    @property
-    def last_updated(self):
-        return self.index.last_updated
-
-    def lu_filter(self, targets):
-        """Creates a MongoDB filter for new documents.
-
-        By "new", we mean documents in this Store that were last updated later
-        than any document in targets.
-
-        Args:
-            targets (list): A list of Stores
-
-        """
-        self.index.lu_filter(targets)
-
-    def __hash__(self):
-        return hash((self.index.__hash__, self.bucket))
-
-    def rebuild_index_from_s3_data(self):
-        """
-        Rebuilds the index Store from the data in S3
-        Relies on the index document being stores as the metadata for the file
-        """
-        index_docs = []
-        for file in self.s3_bucket.objects.all():
-            # TODO: Transform the data back from strings and remove AWS S3 specific keys
-            index_docs.append(file.metadata)
-
-        self.index.update(index_docs)
-
-
-class JointStore(Store):
-    """Store corresponding to multiple collections, uses lookup to join"""
-
-    def __init__(
-        self,
-        database,
-        collection_names,
-        host="localhost",
-        port=27017,
-        username="",
-        password="",
-        master=None,
-        merge_at_root=False,
-        **kwargs
-    ):
-        self.database = database
-        self.collection_names = collection_names
-        self.host = host
-        self.port = port
-        self.username = username
-        self.password = password
-        self._collection = None
-        self.master = master or collection_names[0]
-        self.merge_at_root = merge_at_root
-        self.kwargs = kwargs
-        super(JointStore, self).__init__(**kwargs)
-
-    def connect(self, force_reset=False):
-        conn = MongoClient(self.host, self.port)
-        db = conn[self.database]
-        if self.username is not "":
-            db.authenticate(self.username, self.password)
-        self._collection = db[self.master]
-        self._has_merge_objects = (
-            self._collection.database.client.server_info()["version"] > "3.6"
-        )
-
-    def close(self):
-        self.collection.database.client.close()
-
-    @property
-    def collection(self):
-        return self._collection
-
-    @property
-    def nonmaster_names(self):
-        return list(set(self.collection_names) - {self.master})
-
-    @property
-    def last_updated(self):
-        lus = []
-        for cname in self.collection_names:
-            lu = MongoStore.from_collection(
-                self.collection.database[cname], lu_field=self.lu_field
-            ).last_updated
-            lus.append(lu)
-        return max(lus)
-
-    # TODO: implement update?
-    def update(self, docs, update_lu=True, key=None, **kwargs):
-        raise NotImplementedError("No update method for JointStore")
-
-    def _get_store_by_name(self, name):
-        return MongoStore.from_collection(self.collection.database[name])
-
-    def distinct(self, key, criteria=None, all_exist=True, **kwargs):
-        g_key = key if isinstance(key, list) else [key]
-        if all_exist:
-            criteria = criteria or {}
-            criteria.update({k: {"$exists": True} for k in g_key if k not in criteria})
-        cursor = self.groupby(g_key, criteria=criteria, **kwargs)
-        if isinstance(key, list):
-            return [d["_id"] for d in cursor]
-        else:
-            return [get(d["_id"], key) for d in cursor]
-
-    def ensure_index(self, key, unique=False, **kwargs):
-        raise NotImplementedError("No ensure_index method for JointStore")
-
-    def _get_pipeline(self, criteria=None, properties=None):
-        """
-        Gets the aggregation pipeline for query and query_one
-
-        Args:
-            properties: properties to be returned
-            criteria: criteria to filter by
-
-        Returns:
-            list of aggregation operators
-        """
-        pipeline = []
-        for cname in self.collection_names:
-            if cname is not self.master:
-                pipeline.append(
-                    {
-                        "$lookup": {
-                            "from": cname,
-                            "localField": self.key,
-                            "foreignField": self.key,
-                            "as": cname,
-                        }
-                    }
-                )
-
-                if self.merge_at_root:
-                    if not self._has_merge_objects:
-                        raise Exception(
-                            "MongoDB server version too low to use $mergeObjects."
-                        )
-
-                    pipeline.append(
-                        {
-                            "$replaceRoot": {
-                                "newRoot": {
-                                    "$mergeObjects": [
-                                        {"$arrayElemAt": ["${}".format(cname), 0]},
-                                        "$$ROOT",
-                                    ]
-                                }
-                            }
-                        }
-                    )
-                else:
-                    pipeline.append(
-                        {
-                            "$unwind": {
-                                "path": "${}".format(cname),
-                                "preserveNullAndEmptyArrays": True,
-                            }
-                        }
-                    )
-
-        # Do projection for max last_updated
-        lu_max_fields = ["${}".format(self.lu_field)]
-        lu_max_fields.extend(
-            ["${}.{}".format(cname, self.lu_field) for cname in self.collection_names]
-        )
-        lu_proj = {self.lu_field: {"$max": lu_max_fields}}
-        pipeline.append({"$addFields": lu_proj})
-
-        if criteria:
-            pipeline.append({"$match": criteria})
-        if isinstance(properties, list):
-            properties = {k: 1 for k in properties}
-        if properties:
-            pipeline.append({"$project": properties})
-
-        return pipeline
-
-    def query(self, criteria=None, properties=None, **kwargs):
-        pipeline = self._get_pipeline(criteria=criteria, properties=properties)
-        agg = self.collection.aggregate(pipeline, **kwargs)
-        return agg
-
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
-        pipeline = self._get_pipeline(criteria=criteria, properties=properties)
-        if not isinstance(keys, list):
-            keys = [keys]
-        group_id = {}
-        for key in keys:
-            set_(group_id, key, "${}".format(key))
-        pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
-
-        agg = self.collection.aggregate(pipeline, **kwargs)
-
-        return agg
-
-    def query_one(self, criteria=None, properties=None, **kwargs):
-        """
-        Get one document
-
-        Args:
-            properties([str] or {}): properties to return in query
-            criteria ({}): filter for matching
-            **kwargs: kwargs for collection.aggregate
-
-        Returns:
-            single document
-        """
-        # TODO: maybe adding explicit limit in agg pipeline is better as below?
-        # pipeline = self._get_pipeline(properties, criteria)
-        # pipeline.append({"$limit": 1})
-        query = self.query(criteria=criteria, properties=properties, **kwargs)
-        try:
-            doc = next(query)
-            return doc
-        except StopIteration:
-            return None
-
-
-class ConcatStore(Store):
-    """Store concatting multiple stores"""
-
-    def __init__(self, *stores, **kwargs):
-        """
-        Initialize a ConcatStore that concatenates multiple stores together
-        to appear as one store
-        """
-        self.stores = stores
-        super(ConcatStore, self).__init__(**kwargs)
-
-    def connect(self, force_reset=False):
-        """
-        Connect all stores in this ConcatStore
-
-        Args:
-            force_reset (bool): Whether to forcibly reset the connection for
-            all stores
-        """
-        for store in self.stores:
-            store.connect(force_reset)
-
-    def close(self):
-        """
-        Close all connections in this ConcatStore
-        """
-        for store in self.stores:
-            store.close()
-
-    @property
-    def collection(self):
-        raise NotImplementedError("No collection property for ConcatStore")
-
-    @property
-    def last_updated(self):
-        """
-        Finds the most recent last_updated across all the stores.
-        This might not be the most usefull way to do this for this type of Store
-        since it could very easily over-estimate the last_updated based on what stores
-        are used
-        """
-        lus = []
-        for store in self.stores:
-            lu = store.last_updated
-            lus.append(lu)
-        return max(lus)
-
-    # TODO: implement update?
-    def update(self, docs, update_lu=True, key=None, **kwargs):
-        raise NotImplementedError("No update method for JointStore")
-
-    def distinct(self, key, criteria=None, all_exist=True, **kwargs):
-        """
-        Return all distinct values for a key within the stores
-
-        Args:
-            key (str): key to find distinct values
-            criteria (dict): criteria dictionary to reduce the documents to search on
-            all_exist (bool): ensure the key exists in the doc or not
-        """
-        distincts = []
-        for store in self.stores:
-            distincts.extend(store.distinct(key, criteria, all_exist, **kwargs))
-        return list(set(distincts))
-
-    def ensure_index(self, key, unique=False, **kwargs):
-        """
-        Ensure an index is properly set. Returns whether all stores support this index or not
-
-        Args:
-            key (str or [str]): single key or list of keys to group by
-        """
-        return all([store.ensure_index(key, unique, **kwargs) for store in self.stores])
-
-    def query(self, criteria=None, properties=None, **kwargs):
-        """
-        Queries across all the stores.
-
-        Args:
-            criteria (dict): mongo style query to reduce the docs to group
-            properties (str or [str]): properties to project
-        """
-        for store in self.stores:
-            for d in store.query(criteria=criteria, properties=properties, **kwargs):
-                yield d
-
-    def query_one(self, criteria=None, properties=None, **kwargs):
-        return next(self.query(criteria=criteria, properties=properties, **kwargs))
-
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
-        """
-        Group documents by a key. This version is highly inefficient since it performs
-        post-grouping in python across all of its stores
-
-        Args:
-            keys (str or [str]): single key or list of keys to group by
-            criteria (dict): mongo style query to reduce the docs to group
-            properties (str or [str]): properties to project
-        """
-        if isinstance(keys, str):
-            keys = [keys]
-
-        docs = []
-        for store in self.stores:
-            temp_docs = list(
-                store.groupby(keys, criteria=criteria, properties=properties, **kwargs)
-            )
-            for group in temp_docs:
-                docs.extend(group["docs"])
-
-        def key_set(d):
-            "index function based on passed in keys"
-            test_d = tuple(d.get(k, "") for k in keys)
-            return test_d
-
-        for k, group in groupby(docs, key=key_set):
-            yield list(group)
diff --git a/maggma/builder.py b/maggma/builder.py
deleted file mode 100644
index 3247824a1..000000000
--- a/maggma/builder.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# coding: utf-8
-"""
-Base Builder class to define how builders need to be defined
-"""
-from maggma.builders import *
-
-import warnings
-
-warnings.warn("maggma.examples.builder is now deprecated.")
diff --git a/maggma/builders.py b/maggma/builders.py
index 42689cdc9..711f090e7 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -3,130 +3,11 @@
 Base Builder class to define how builders need to be defined
 """
 from abc import ABCMeta, abstractmethod
-import logging
 import traceback
 from datetime import datetime
-from monty.json import MSONable, MontyDecoder
 from maggma.utils import source_keys_updated, grouper, Timeout
 from time import time
-
-
-class Builder(MSONable, metaclass=ABCMeta):
-    """
-    Base Builder class
-    At minimum this class should implement:
-    get_items - Get items from the sources
-    update_targets - Updates the sources with results
-
-    Multiprocessing and MPI processing can be used if all
-    the data processing is  limited to process_items
-    """
-
-    def __init__(self, sources, targets, chunk_size=1000):
-        """
-        Initialize the builder the framework.
-
-        Args:
-            sources([Store]): list of source stores
-            targets([Store]): list of target stores
-            chunk_size(int): chunk size for processing
-        """
-        self.sources = sources
-        self.targets = targets
-        self.chunk_size = chunk_size
-
-        self.logger = logging.getLogger(type(self).__name__)
-        self.logger.addHandler(logging.NullHandler())
-
-    def connect(self):
-        """
-        Connect to the builder sources and targets.
-        """
-        stores = self.sources + self.targets
-        for s in stores:
-            s.connect()
-
-    @abstractmethod
-    def get_items(self):
-        """
-        Returns all the items to process.
-
-        Returns:
-            generator or list of items to process
-        """
-        pass
-
-    def process_item(self, item):
-        """
-        Process an item. Should not expect DB access as this can be run MPI
-        Default behavior is to return the item.
-        Args:
-            item:
-
-        Returns:
-           item: an item to update
-        """
-        return item
-
-    @abstractmethod
-    def update_targets(self, items):
-        """
-        Takes a dictionary of targets and items from process item and updates them
-        Can also perform other book keeping in the process such as storing gridfs oids, etc.
-
-        Args:
-            items:
-
-        Returns:
-
-        """
-        pass
-
-    def finalize(self, cursor=None):
-        """
-        Perform any final clean up.
-        """
-        # Close any Mongo connections.
-        for store in self.sources + self.targets:
-            try:
-                store.collection.database.client.close()
-            except AttributeError:
-                continue
-        # Runner will pass iterable yielded by `self.get_items` as `cursor`. If
-        # this is a Mongo cursor with `no_cursor_timeout=True` (not the
-        # default), we must be explicitly kill it.
-        try:
-            cursor and cursor.close()
-        except AttributeError:
-            pass
-
-    def run(self):
-        """
-        Run the builder serially
-
-        Args:
-            builder_id (int): the index of the builder in the builders list
-        """
-        self.connect()
-
-        cursor = self.get_items()
-
-        for chunk in grouper(cursor, self.chunk_size):
-            self.logger.info("Processing batch of {} items".format(self.chunk_size))
-            processed_items = [
-                self.process_item(item) for item in chunk if item is not None
-            ]
-            self.update_targets(processed_items)
-
-        self.finalize(cursor)
-
-    def __getstate__(self):
-        return self.as_dict()
-
-    def __setstate__(self, d):
-        d = {k: v for k, v in d.items() if not k.startswith("@")}
-        d = MontyDecoder().process_decoded(d)
-        self.__init__(**d)
+from maggma.core import Builder
 
 
 class MapBuilder(Builder, metaclass=ABCMeta):
@@ -258,7 +139,7 @@ def process_item(self, item):
 
         out = {
             self.target.key: item[key],
-            self.target.lu_field: self.source.lu_func[0](item[self.source.lu_field]),
+            self.target.lu_field: self.source.lu_func[0](item[lu_field]),
         }
         if self.store_process_time:
             out["_process_time"] = time_end - time_start
diff --git a/maggma/stores.py b/maggma/stores.py
deleted file mode 100644
index 6150e83d6..000000000
--- a/maggma/stores.py
+++ /dev/null
@@ -1,882 +0,0 @@
-# coding: utf-8
-"""
-Module containing various definitions of Stores.
-Stores are a default access pattern to data and provide
-various utillities
-"""
-from abc import ABCMeta, abstractmethod
-import copy
-from datetime import datetime
-import json
-import zlib
-import logging
-
-import mongomock
-import pymongo
-import gridfs
-from itertools import groupby
-from operator import itemgetter
-from pymongo import MongoClient
-from pydash import identity, set_
-
-from pymongo import ReplaceOne
-
-from monty.json import MSONable, jsanitize, MontyDecoder
-from monty.io import zopen
-from monty.serialization import loadfn
-from maggma.utils import LU_KEY_ISOFORMAT, confirm_field_index, source_keys_updated
-
-
-class Store(MSONable, metaclass=ABCMeta):
-    """
-    Abstract class for a data Store
-    Defines the interface for all data going in and out of a Builder
-    """
-
-    def __init__(
-        self, key="task_id", lu_field="last_updated", lu_type="datetime", validator=None
-    ):
-        """
-        Args:
-            key (str): master key to index on
-            lu_field (str): 'last updated' field name
-            lu_type (tuple): the date/time format for the lu_field. Can be "datetime" or "isoformat"
-        """
-        self.key = key
-        self.lu_field = lu_field
-        self.lu_type = lu_type
-        self.lu_func = (
-            LU_KEY_ISOFORMAT if lu_type == "isoformat" else (identity, identity)
-        )
-        self.validator = validator
-        self.logger = logging.getLogger(type(self).__name__)
-        self.logger.addHandler(logging.NullHandler())
-
-    @property
-    @abstractmethod
-    def collection(self):
-        """
-        Returns a handle to the pymongo collection object
-        Not guaranteed to exist in the future
-        """
-        pass
-
-    @abstractmethod
-    def connect(self, force_reset=False):
-        """
-        Connect to the source data
-        """
-        pass
-
-    @abstractmethod
-    def close(self):
-        """
-        Closes any connections
-        """
-        pass
-
-    @abstractmethod
-    def query(self, criteria=None, properties=None, **kwargs):
-        """
-        Queries the Store for a set of properties
-        """
-        pass
-
-    @abstractmethod
-    def query_one(self, criteria=None, properties=None, **kwargs):
-        """
-        Get one property from the store
-        """
-        pass
-
-    @abstractmethod
-    def distinct(self, key, criteria=None, **kwargs):
-        """
-        Get all distinct values for a key
-        """
-        pass
-
-    @abstractmethod
-    def update(self, docs, update_lu=True, key=None, **kwargs):
-        """
-        Update docs into the store
-        """
-        pass
-
-    @abstractmethod
-    def ensure_index(self, key, unique=False, **kwargs):
-        """
-        Tries to create and index
-        Args:
-            key (string): single key to index
-            unique (bool): Whether or not this index contains only unique keys
-
-        Returns:
-            bool indicating if the index exists/was created
-        """
-        pass
-
-    @abstractmethod
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
-        """
-        Simple grouping function that will group documents
-        by keys.
-
-        Args:
-            keys (list or string): fields to group documents
-            criteria (dict): filter for documents to group
-            properties (list): properties to return in grouped documents
-
-        Returns:
-            command cursor corresponding to grouped documents
-
-            elements of the command cursor have the structure:
-            {'_id': {"KEY_1": value_1, "KEY_2": value_2 ...,
-             'docs': [list_of_documents corresponding to key values]}
-
-        """
-        pass
-
-    @property
-    def last_updated(self):
-        doc = next(
-            self.query(properties=[self.lu_field])
-            .sort([(self.lu_field, pymongo.DESCENDING)])
-            .limit(1),
-            None,
-        )
-        if doc and self.lu_field not in doc:
-            raise StoreError(
-                "No field '{}' in store document. Please ensure Store.lu_field "
-                "is a datetime field in your store that represents the time of "
-                "last update to each document.".format(self.lu_field)
-            )
-        # Handle when collection has docs but `NoneType` lu_field.
-        return (
-            self.lu_func[0](doc[self.lu_field])
-            if (doc and doc[self.lu_field])
-            else datetime.min
-        )
-
-    def lu_filter(self, targets):
-        """Creates a MongoDB filter for new documents.
-
-        By "new", we mean documents in this Store that were last updated later
-        than any document in targets.
-
-        Args:
-            targets (list): A list of Stores
-
-        """
-        if isinstance(targets, Store):
-            targets = [targets]
-
-        lu_list = [t.last_updated for t in targets]
-        return {self.lu_field: {"$gt": self.lu_func[1](max(lu_list))}}
-
-    def updated_keys(self, target, criteria=None):
-        """
-        Returns keys for docs that are newer in the target store in comparison
-        with this store when comparing the last updated field (lu_field)
-
-        Args:
-            target (Store): store to look for updated documents
-            criteria (dict): mongo query to limit scope
-
-        Returns:
-            list of keys that have been updated in target store
-        """
-        self.ensure_index(self.key)
-        self.ensure_index(self.lu_field)
-
-        return source_keys_updated(target, self, query=criteria)
-
-    def __eq__(self, other):
-        return hash(self) == hash(other)
-
-    def __ne__(self, other):
-        return not self == other
-
-    def __hash__(self):
-        return hash((self.lu_field,))
-
-    def __getstate__(self):
-        return self.as_dict()
-
-    def __setstate__(self, d):
-        d = {k: v for k, v in d.items() if not k.startswith("@")}
-        d = MontyDecoder().process_decoded(d)
-        self.__init__(**d)
-
-
-class Mongolike(object):
-    """
-    Mixin class that allows for basic mongo functionality
-    """
-
-    @property
-    def collection(self):
-        if self._collection is None:
-            raise StoreError("Must connect Mongo-like store before attemping to use it")
-        return self._collection
-
-    def query(self, criteria=None, properties=None, **kwargs):
-        """
-        Function that gets data from MongoStore with property focus.
-
-        Args:
-            criteria (dict): filter for query, matches documents
-                against key-value pairs
-            properties (list or dict): list of properties to return
-                or dictionary with {"property": 1} type structure
-                from standard mongo Collection.find syntax
-            **kwargs (kwargs): further kwargs to Collection.find
-        """
-        if isinstance(properties, list):
-            properties = {p: 1 for p in properties}
-        return self.collection.find(filter=criteria, projection=properties, **kwargs)
-
-    def query_one(self, criteria=None, properties=None, **kwargs):
-        """
-        Function that gets a single from MongoStore with property focus.
-        Returns None if nothing matches
-
-        Args:
-            criteria (dict): filter for query, matches documents
-                against key-value pairs
-            properties (list or dict): list of properties to return
-                or dictionary with {"property": 1} type structure
-                from standard mongo Collection.find syntax
-            **kwargs (kwargs): further kwargs to Collection.find_one
-        """
-        if isinstance(properties, list):
-            properties = {p: 1 for p in properties}
-        return self.collection.find_one(
-            filter=criteria, projection=properties, **kwargs
-        )
-
-    def ensure_index(self, key, unique=False, **kwargs):
-        """
-        Wrapper for pymongo.Collection.ensure_index
-        """
-        if "background" not in kwargs:
-            kwargs["background"] = True
-
-        if confirm_field_index(self.collection, key):
-            return True
-        else:
-            try:
-                self.collection.create_index(key, unique=unique, **kwargs)
-                return True
-            except:
-                return False
-
-    def update(self, docs, update_lu=True, key=None, ordered=True, **kwargs):
-        """
-        Function to update associated MongoStore collection.
-
-        Args:
-            docs: list of documents
-        """
-
-        requests = []
-
-        for d in docs:
-
-            d = jsanitize(d, allow_bson=True)
-
-            # document-level validation is optional
-            validates = True
-            if self.validator:
-                validates = self.validator.is_valid(d)
-                if not validates:
-                    if self.validator.strict:
-                        raise ValueError(self.validator.validation_errors(d))
-                    else:
-                        self.logger.error(self.validator.validation_errors(d))
-
-            if validates:
-                key = key if key else self.key
-                if isinstance(key, list):
-                    search_doc = {k: d[k] for k in key}
-                else:
-                    search_doc = {key: d[key]}
-                if update_lu:
-                    d[self.lu_field] = datetime.utcnow()
-
-                requests.append(ReplaceOne(search_doc, d, upsert=True))
-
-        self.collection.bulk_write(requests, ordered=ordered)
-
-    def distinct(self, key, criteria=None, all_exist=False, **kwargs):
-        """
-        Function get to get all distinct values of a certain key in
-        a mongolike store.  May take a single key or a list of keys
-
-        Args:
-            key (mongolike key or list of mongolike keys): key or keys
-                for which to find distinct values or sets of values.
-            criteria (filter criteria): criteria for filter
-            all_exist (bool): whether to ensure all keys in list exist
-                in each document, defaults to False
-            **kwargs (kwargs): kwargs corresponding to collection.distinct
-        """
-        if isinstance(key, list):
-            criteria = criteria if criteria else {}
-            # Update to ensure keys are there
-            if all_exist:
-                criteria.update(
-                    {k: {"$exists": True} for k in key if k not in criteria}
-                )
-
-            results = []
-            for d in self.groupby(key, properties=key, criteria=criteria):
-                results.append(d["_id"])
-            return results
-
-        else:
-            return self.collection.distinct(key, filter=criteria, **kwargs)
-
-    def close(self):
-        self.collection.database.client.close()
-
-
-class MongoStore(Mongolike, Store):
-    """
-    A Store that connects to a Mongo collection
-    """
-
-    def __init__(
-        self,
-        database,
-        collection_name,
-        host="localhost",
-        port=27017,
-        username="",
-        password="",
-        **kwargs
-    ):
-        """
-        Args:
-            database (str): database name
-            collection (str): collection name
-            host (str): hostname for mongo db
-            port (int): tcp port for mongo db
-            username (str): username for mongo db
-            password (str): password for mongo db
-        """
-        self.database = database
-        self.collection_name = collection_name
-        self.host = host
-        self.port = port
-        self.username = username
-        self.password = password
-        self._collection = None
-        self.kwargs = kwargs
-        super(MongoStore, self).__init__(**kwargs)
-
-    def connect(self, force_reset=False):
-        if not self._collection or force_reset:
-            conn = MongoClient(self.host, self.port)
-            db = conn[self.database]
-            if self.username is not "":
-                db.authenticate(self.username, self.password)
-            self._collection = db[self.collection_name]
-
-    def __hash__(self):
-        return hash((self.database, self.collection_name, self.lu_field))
-
-    @classmethod
-    def from_db_file(cls, filename):
-        """
-        Convenience method to construct MongoStore from db_file
-        """
-        kwargs = loadfn(filename)
-        if "collection" in kwargs:
-            kwargs["collection_name"] = kwargs.pop("collection")
-        # Get rid of aliases from traditional query engine db docs
-        kwargs.pop("aliases", None)
-        return cls(**kwargs)
-
-    def groupby(
-        self, keys, criteria=None, properties=None, allow_disk_use=True, **kwargs
-    ):
-        """
-        Simple grouping function that will group documents
-        by keys.
-
-        Args:
-            keys (list or string): fields to group documents
-            criteria (dict): filter for documents to group
-            properties (list): properties to return in grouped documents
-            allow_disk_use (bool): whether to allow disk use in aggregation
-
-        Returns:
-            command cursor corresponding to grouped documents
-
-            elements of the command cursor have the structure:
-            {'_id': {"KEY_1": value_1, "KEY_2": value_2 ...,
-             'docs': [list_of_documents corresponding to key values]}
-
-        """
-        pipeline = []
-        if criteria is not None:
-            pipeline.append({"$match": criteria})
-
-        if properties is not None:
-            pipeline.append({"$project": {p: 1 for p in properties}})
-
-        if isinstance(keys, str):
-            keys = [keys]
-
-        group_id = {}
-        for key in keys:
-            set_(group_id, key, "${}".format(key))
-        pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
-
-        return self.collection.aggregate(pipeline, allowDiskUse=allow_disk_use)
-
-    @classmethod
-    def from_collection(cls, collection, **kwargs):
-        """
-        Generates a MongoStore from a pymongo collection object
-        This is not a fully safe operation as it gives dummy information to the MongoStore
-        As a result, this will not serialize and can not reset its connection
-        """
-        # TODO: How do we make this safer?
-        coll_name = collection.name
-        db_name = collection.database.name
-
-        store = cls(db_name, coll_name, **kwargs)
-        store._collection = collection
-        return store
-
-
-class MemoryStore(Mongolike, Store):
-    """
-    An in-memory Store that functions similarly
-    to a MongoStore
-    """
-
-    def __init__(self, name="memory_db", **kwargs):
-        self.name = name
-        self._collection = None
-        self.kwargs = kwargs
-        super(MemoryStore, self).__init__(**kwargs)
-
-    def connect(self, force_reset=False):
-        if not self._collection or force_reset:
-            self._collection = mongomock.MongoClient().db[self.name]
-
-    def __hash__(self):
-        return hash((self.name, self.lu_field))
-
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
-        """
-        Simple grouping function that will group documents
-        by keys.
-
-        Args:
-            keys (list or string): fields to group documents
-            criteria (dict): filter for documents to group
-            properties (list): properties to return in grouped documents
-            allow_disk_use (bool): whether to allow disk use in aggregation
-
-        Returns:
-            command cursor corresponding to grouped documents
-
-            elements of the command cursor have the structure:
-            {'_id': {"KEY_1": value_1, "KEY_2": value_2 ...,
-             'docs': [list_of_documents corresponding to key values]}
-
-        """
-        keys = keys if isinstance(keys, list) else [keys]
-
-        input_data = list(self.query(properties=keys, criteria=criteria))
-
-        if len(keys) > 1:
-            grouper = itemgetter(*keys)
-            for key, grp in groupby(sorted(input_data, key=grouper), grouper):
-                temp_dict = {"_id": zip(keys, key), "docs": list(grp)}
-                yield temp_dict
-        else:
-            grouper = itemgetter(*keys)
-            for key, grp in groupby(sorted(input_data, key=grouper), grouper):
-                temp_dict = {"_id": {keys[0]: key}, "docs": list(grp)}
-                yield temp_dict
-
-    def update(self, docs, update_lu=True, key=None, **kwargs):
-        """
-        Function to update associated MongoStore collection.
-
-        Args:
-            docs: list of documents
-        """
-
-        for d in docs:
-
-            d = jsanitize(d, allow_bson=True)
-
-            # document-level validation is optional
-            validates = True
-            if self.validator:
-                validates = self.validator.is_valid(d)
-                if not validates:
-                    if self.validator.strict:
-                        raise ValueError(self.validator.validation_errors(d))
-                    else:
-                        self.logger.error(self.validator.validation_errors(d))
-
-            if validates:
-                if isinstance(key, list):
-                    search_doc = {k: d[k] for k in key}
-                elif key:
-                    search_doc = {key: d[key]}
-                else:
-                    search_doc = {self.key: d[self.key]}
-                if update_lu:
-                    d[self.lu_field] = datetime.utcnow()
-                self.collection.insert_one(d)
-
-
-class JSONStore(MemoryStore):
-    """
-    A Store for access to a single or multiple JSON files
-    """
-
-    def __init__(self, paths, **kwargs):
-        """
-        Args:
-            paths (str or list): paths for json files to
-                turn into a Store
-        """
-        paths = paths if isinstance(paths, (list, tuple)) else [paths]
-        self.paths = paths
-        self.kwargs = kwargs
-        super(JSONStore, self).__init__("collection", **kwargs)
-
-    def connect(self, force_reset=False):
-        super(JSONStore, self).connect(force_reset=force_reset)
-        for path in self.paths:
-            with zopen(path) as f:
-                data = f.read()
-                data = data.decode() if isinstance(data, bytes) else data
-                objects = json.loads(data)
-                objects = [objects] if not isinstance(objects, list) else objects
-                self.update(objects)
-
-    def __hash__(self):
-        return hash((*self.paths, self.lu_field))
-
-
-class DatetimeStore(MemoryStore):
-    """Utility store intended for use with `Store.lu_filter`."""
-
-    def __init__(self, dt, **kwargs):
-        """
-        Args:
-            dt (Datetime): Datetime to set
-        """
-        self.__dt = dt
-        self.kwargs = kwargs
-        super(DatetimeStore, self).__init__("date", **kwargs)
-
-    def connect(self, force_reset=False):
-        super(DatetimeStore, self).connect(force_reset)
-        self.collection.insert_one({self.lu_field: self.__dt})
-
-
-class GridFSStore(Store):
-    """
-    A Store for GrdiFS backend. Provides a common access method consistent with other stores
-    """
-
-    # https://github.com/mongodb/specifications/
-    #   blob/master/source/gridfs/gridfs-spec.rst#terms
-    #   (Under "Files collection document")
-    files_collection_fields = (
-        "_id",
-        "length",
-        "chunkSize",
-        "uploadDate",
-        "md5",
-        "filename",
-        "contentType",
-        "aliases",
-        "metadata",
-    )
-
-    def __init__(
-        self,
-        database,
-        collection_name,
-        host="localhost",
-        port=27017,
-        username="",
-        password="",
-        compression=False,
-        **kwargs
-    ):
-
-        self.database = database
-        self.collection_name = collection_name
-        self.host = host
-        self.port = port
-        self.username = username
-        self.password = password
-        self._collection = None
-        self.compression = compression
-        self.kwargs = kwargs
-        self.meta_keys = set()
-
-        if "key" not in kwargs:
-            kwargs["key"] = "_id"
-
-        kwargs["lu_field"] = "uploadDate"
-
-        super(GridFSStore, self).__init__(**kwargs)
-
-    def connect(self, force_reset=False):
-        conn = MongoClient(self.host, self.port)
-        if not self._collection or force_reset:
-            db = conn[self.database]
-            if self.username is not "":
-                db.authenticate(self.username, self.password)
-
-            self._collection = gridfs.GridFS(db, self.collection_name)
-            self._files_collection = db["{}.files".format(self.collection_name)]
-            self._chunks_collection = db["{}.chunks".format(self.collection_name)]
-
-    @property
-    def collection(self):
-        # TODO: Should this return the real MongoCollection or the GridFS
-        return self._collection
-
-    @property
-    def last_updated(self):
-        doc = next(
-            self._files_collection.find(projection=[self.lu_field])
-            .sort([(self.lu_field, pymongo.DESCENDING)])
-            .limit(1),
-            None,
-        )
-        if doc and self.lu_field not in doc:
-            raise StoreError(
-                "No field '{}' in store document. Please ensure Store.lu_field "
-                "is a datetime field in your store that represents the time of "
-                "last update to each document.".format(self.lu_field)
-            )
-        # Handle when collection has docs but `NoneType` lu_field.
-        return (
-            self.lu_func[0](doc[self.lu_field])
-            if (doc and doc[self.lu_field])
-            else datetime.min
-        )
-
-    @classmethod
-    def transform_criteria(cls, criteria):
-        """
-        Allow client to not need to prepend 'metadata.' to query fields.
-        Args:
-            criteria (dict): Query criteria
-        """
-        for field in criteria:
-            if field not in cls.files_collection_fields and not field.startswith(
-                "metadata."
-            ):
-                criteria["metadata." + field] = copy.copy(criteria[field])
-                del criteria[field]
-
-    def query(self, criteria=None, properties=None, **kwargs):
-        """
-        Function that gets data from GridFS. This store ignores all
-        property projections as its designed for whole document access
-
-        Args:
-            criteria (dict): filter for query, matches documents
-                against key-value pairs
-            properties (list or dict): This will be ignored by the GridFS
-                Store
-            **kwargs (kwargs): further kwargs to Collection.find
-        """
-        if isinstance(criteria, dict):
-            self.transform_criteria(criteria)
-        for f in self.collection.find(filter=criteria, **kwargs):
-            data = f.read()
-
-            metadata = f.metadata
-            if metadata.get("compression", "") == "zlib":
-                data = zlib.decompress(data).decode("UTF-8")
-
-            try:
-                data = json.loads(data)
-            except:
-                pass
-            yield data
-
-    def query_one(self, criteria=None, properties=None, **kwargs):
-        """
-        Function that gets a single document from GridFS. This store
-        ignores all property projections as its designed for whole
-        document access
-
-        Args:
-            criteria (dict): filter for query, matches documents
-                against key-value pairs
-            properties (list or dict): This will be ignored by the GridFS
-                Store
-            **kwargs (kwargs): further kwargs to Collection.find
-        """
-        return next(self.query(criteria=criteria, **kwargs), None)
-
-    def distinct(self, key, criteria=None, all_exist=False, **kwargs):
-        """
-        Function get to get all distinct values of a certain key in
-        a mongolike store.  May take a single key or a list of keys
-
-        Args:
-            key (mongolike key or list of mongolike keys): key or keys
-                for which to find distinct values or sets of values.
-            criteria (filter criteria): criteria for filter
-            all_exist (bool): whether to ensure all keys in list exist
-                in each document, defaults to False
-            **kwargs (kwargs): kwargs corresponding to collection.distinct
-        """
-        if isinstance(key, list):
-            criteria = criteria if criteria else {}
-            # Update to ensure keys are there
-            if all_exist:
-                criteria.update(
-                    {k: {"$exists": True} for k in key if k not in criteria}
-                )
-
-            results = []
-            for d in self.groupby(key, properties=key, criteria=criteria):
-                results.append(d["_id"])
-            return results
-
-        else:
-            if criteria:
-                self.transform_criteria(criteria)
-            # Transfor to metadata subfield if not supposed to be in gridfs main fields
-            if key not in self.files_collection_fields:
-                key = "metadata.{}".format(key)
-
-            return self._files_collection.distinct(key, filter=criteria, **kwargs)
-
-    def groupby(
-        self, keys, criteria=None, properties=None, allow_disk_use=True, **kwargs
-    ):
-        """
-        Simple grouping function that will group documents
-        by keys.
-
-        Args:
-            keys (list or string): fields to group documents
-            criteria (dict): filter for documents to group
-            properties (list): properties to return in grouped documents
-            allow_disk_use (bool): whether to allow disk use in aggregation
-
-        Returns:
-            command cursor corresponding to grouped documents
-
-            elements of the command cursor have the structure:
-            {'_id': {"KEY_1": value_1, "KEY_2": value_2 ...,
-             'docs': [list_of_documents corresponding to key values]}
-
-        """
-        pipeline = []
-        if criteria is not None:
-            self.transform_criteria(criteria)
-            pipeline.append({"$match": criteria})
-
-        if properties is not None:
-            properties = [
-                p if p in self.files_collection_fields else "metadata.{}".format(p)
-                for p in properties
-            ]
-            pipeline.append({"$project": {p: 1 for p in properties}})
-
-        if isinstance(keys, str):
-            keys = [keys]
-
-        # ensure propper naming for keys in and outside of metadata
-        keys = [
-            k if k in self.files_collection_fields else "metadata.{}".format(k)
-            for k in key
-        ]
-
-        group_id = {key: "${}".format(key) for key in keys}
-        pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
-
-        return self.collection.aggregate(pipeline, allowDiskUse=allow_disk_use)
-
-    def ensure_index(self, key, unique=False):
-        """
-        Wrapper for pymongo.Collection.ensure_index for the files collection
-        """
-        # Transform key for gridfs first
-        if key not in self.files_collection_fields:
-            key = "metadata.{}".format(key)
-
-        if "background" not in kwargs:
-            kwargs["background"] = True
-
-        if confirm_field_index(self.collection, key):
-            return True
-        else:
-            try:
-                self.collection.create_index(key, unique=unique, **kwargs)
-                return True
-            except:
-                return False
-
-    def update(self, docs, update_lu=True, key=None):
-        """
-        Function to update associated MongoStore collection.
-
-        Args:
-            docs ([dict]): list of documents
-            update_lu (bool) : Updat the last_updated field or not
-            key (list or str): list or str of important parameters
-        """
-        if isinstance(key, str):
-            key = [key]
-        elif not key:
-            key = [self.key]
-
-        key = list(set(key) | self.meta_keys - set(self.files_collection_fields))
-
-        for d in docs:
-
-            search_doc = {k: d[k] for k in key}
-            if update_lu:
-                d[self.lu_field] = datetime.utcnow()
-
-            metadata = {self.lu_field: d[self.lu_field]}
-            metadata.update(search_doc)
-
-            data = json.dumps(jsanitize(d)).encode("UTF-8")
-            if self.compression:
-                data = zlib.compress(data)
-                metadata["compression"] = "zlib"
-
-            self.collection.put(data, metadata=metadata)
-            self.transform_criteria(search_doc)
-
-            # Cleans up old gridfs entries
-            for fdoc in (
-                self._files_collection.find(search_doc, ["_id"])
-                .sort("uploadDate", -1)
-                .skip(1)
-            ):
-                self.collection.delete(fdoc["_id"])
-
-    def close(self):
-        self.collection.database.client.close()
-
-
-class StoreError(Exception):
-    """General Store-related error."""
-
-    pass
diff --git a/maggma/validator.py b/maggma/validators.py
similarity index 75%
rename from maggma/validator.py
rename to maggma/validators.py
index b2ec59473..10ee98718 100644
--- a/maggma/validator.py
+++ b/maggma/validators.py
@@ -5,34 +5,9 @@
 that Store.
 """
 
-from abc import ABC, abstractmethod
 from jsonschema import validate, ValidationError
 from jsonschema.validators import validator_for
-import pydash
-
-
-class Validator(ABC):
-    """
-    A generic class to perform document-level validation on Stores.
-    Attach a Validator to a Store during initialization, any all documents
-    added to the Store will call .validate_doc() before being added.
-    """
-
-    @abstractmethod
-    def is_valid(self, doc):
-        """
-        Returns (bool): True if document valid, False if document
-        invalid
-        """
-        return NotImplementedError
-
-    @abstractmethod
-    def validation_errors(self, doc):
-        """
-        Returns (bool): if document is not valid, provide a list of
-        strings to display for why validation has failed
-        """
-        return NotImplementedError
+from maggma.core import Validator
 
 
 class JSONSchemaValidator(Validator):
@@ -108,14 +83,14 @@ def validation_errors(self, doc):
             return []
 
         validator = validator_for(self.schema)(self.schema)
-        errors = ["{}: {}".format(".".join(error.absolute_path),
-                                  error.message)
-                  for error in validator.iter_errors(doc)]
+        errors = [
+            "{}: {}".format(".".join(error.absolute_path), error.message)
+            for error in validator.iter_errors(doc)
+        ]
 
         return errors
 
 
-
 def msonable_schema(cls):
     """
     Convenience function to return a JSON Schema for any MSONable class.
@@ -125,6 +100,6 @@ def msonable_schema(cls):
         "required": ["@class", "@module"],
         "properties": {
             "@class": {"const": cls.__name__},
-            "@module": {"const": cls.__module__}
-        }
+            "@module": {"const": cls.__module__},
+        },
     }

From d49c627e2d6ba1807d0511bcf2b288ba604ec60c Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 5 Nov 2019 15:29:57 -0800
Subject: [PATCH 05/99] update python

---
 setup.py | 47 +++++++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/setup.py b/setup.py
index 327379d12..765530a60 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
     setup(
         name="maggma",
         use_scm_version=True,
-        setup_requires=['setuptools_scm'],
+        setup_requires=["setuptools_scm"],
         description="MongoDB aggregation machine",
         long_description=open(os.path.join(module_dir, "README.md")).read(),
         long_description_content_type="text/markdown",
@@ -22,29 +22,32 @@
         package_data={},
         zip_safe=False,
         install_requires=[
-            "pymongo>=3.6", "mongomock>=3.10.0", "monty>=1.0.2",
-            "smoqe>=0.1.3", "PyYAML>=3.12", "pydash>=4.1.0", "tqdm>=4.19.6",
-            "mongogrant>=0.2.2", "hvac>=0.3.0", "boto3>=1.6.9",
+            "pymongo>=3.6",
+            "mongomock>=3.10.0",
+            "monty>=1.0.2",
+            "smoqe>=0.1.3",
+            "PyYAML>=3.12",
+            "pydash>=4.1.0",
+            "tqdm>=4.19.6",
+            "mongogrant>=0.2.2",
+            "hvac>=0.3.0",
+            "boto3>=1.6.9",
         ],
         extras_require={"mpi": ["mpi4py>=2.0.0"]},
-        classifiers=["Programming Language :: Python :: 3",
-                     "Programming Language :: Python :: 3.6",
-                     "Development Status :: 2 - Pre-Alpha",
-                     "Intended Audience :: Science/Research",
-                     "Intended Audience :: System Administrators",
-                     "Intended Audience :: Information Technology",
-                     "Operating System :: OS Independent",
-                     "Topic :: Other/Nonlisted Topic",
-                     "Topic :: Database :: Front-Ends",
-                     "Topic :: Scientific/Engineering"],
-
-
-        entry_points={
-            "console_scripts": [
-                "mrun = maggma.cli.mrun:main"
-                ]
-            },
+        classifiers=[
+            "Programming Language :: Python :: 3",
+            "Programming Language :: Python :: 3.6",
+            "Development Status :: 2 - Pre-Alpha",
+            "Intended Audience :: Science/Research",
+            "Intended Audience :: System Administrators",
+            "Intended Audience :: Information Technology",
+            "Operating System :: OS Independent",
+            "Topic :: Other/Nonlisted Topic",
+            "Topic :: Database :: Front-Ends",
+            "Topic :: Scientific/Engineering",
+        ],
+        entry_points={"console_scripts": ["mrun = maggma.cli.mrun:main"]},
         test_suite="nose.collector",
         tests_require=["nose"],
-        python_requires='>=3.6',
+        python_requires=">=3.8",
     )

From 7245d19c9c7e42c5f48696525668f99ec24842e8 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 5 Nov 2019 15:31:16 -0800
Subject: [PATCH 06/99] remove unused helpers

---
 maggma/helpers.py | 98 -----------------------------------------------
 1 file changed, 98 deletions(-)
 delete mode 100644 maggma/helpers.py

diff --git a/maggma/helpers.py b/maggma/helpers.py
deleted file mode 100644
index 6a0eb4491..000000000
--- a/maggma/helpers.py
+++ /dev/null
@@ -1,98 +0,0 @@
-# coding: utf-8
-"""
-More utilities for maggma. Do we need this still?
-"""
-import json
-
-from pymongo import MongoClient
-
-
-def get_database(cred, **mongo_client_kwargs):
-    """Connect to a database given a credential dict.
-
-    Args:
-        cred (dict): {database, [host, port, username, password]}
-
-    Returns:
-        pymongo.database.Database: The database object.
-    """
-    # respect potential multiprocessing fork
-    mc_kwargs = dict(connect=False)
-    mc_kwargs.update(mongo_client_kwargs)
-    conn = MongoClient(
-        cred.get('host', 'localhost'),
-        cred.get('port', 27017),
-        **mc_kwargs)
-    db = conn[cred['database']]
-    if cred.get('username'):
-        db.authenticate(cred['username'], cred['password'])
-    return db
-
-
-def get_collection(config):
-    """
-    Returns collection from config file
-
-    Args:
-        config(str): path to the collection config file
-
-    Returns:
-        pymongo.collection
-    """
-    with open(config, "r") as f:
-        settings = json.load(f)
-    settings["aliases_config"] = {"aliases": {}, "defaults": {}}
-    db = get_database(cred=settings)
-    return db[config]
-
-
-class CredentialManager:
-
-    roles = ['read', 'write', 'admin']
-
-    def __init__(self, filepath):
-        """
-        Args:
-            filepath (str): path to the file
-        """
-        with open(filepath) as f:
-            self.creds = json.load(f)
-            self.filepath = filepath
-
-    def get_cred(self, spec):
-        """Get DB credential dict.
-
-        Args:
-            spec (str): "<role>:<host[:port]>/<database>", where <role> is
-                "read", "write", or "admin".
-
-        Returns:
-            dict: {host,port,database,username,password}
-
-        """
-        pass
-
-    def add_cred(self, cred, role):
-        """
-        Add DB credential dict to `self.filepath`.
-
-        Args:
-            cred
-            role
-        """
-        assert role in self.roles
-        pass
-
-    def ensure_cred(self, spec):
-        """
-        Attempt to ensure credentials as per spec.
-
-        Generates user/pass if no existing spec match.
-        Fails if host requires user/pass and cred file has neither
-        an admin cred for the spec database nor a cred for the
-        spec host admin db.
-
-        Args:
-            spec
-        """
-        pass

From 9ef5317a89ec8a23bc73531a787f851c596e43f7 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 5 Nov 2019 15:39:41 -0800
Subject: [PATCH 07/99] rename docs directory

---
 {source => docs}/apidoc/index.rst       | 0
 {source => docs}/conf.py                | 0
 {source => docs}/index.rst              | 0
 {source => docs}/usage/installation.rst | 0
 {source => docs}/usage/introduction.rst | 0
 {source => docs}/usage/quickstart.rst   | 0
 6 files changed, 0 insertions(+), 0 deletions(-)
 rename {source => docs}/apidoc/index.rst (100%)
 rename {source => docs}/conf.py (100%)
 rename {source => docs}/index.rst (100%)
 rename {source => docs}/usage/installation.rst (100%)
 rename {source => docs}/usage/introduction.rst (100%)
 rename {source => docs}/usage/quickstart.rst (100%)

diff --git a/source/apidoc/index.rst b/docs/apidoc/index.rst
similarity index 100%
rename from source/apidoc/index.rst
rename to docs/apidoc/index.rst
diff --git a/source/conf.py b/docs/conf.py
similarity index 100%
rename from source/conf.py
rename to docs/conf.py
diff --git a/source/index.rst b/docs/index.rst
similarity index 100%
rename from source/index.rst
rename to docs/index.rst
diff --git a/source/usage/installation.rst b/docs/usage/installation.rst
similarity index 100%
rename from source/usage/installation.rst
rename to docs/usage/installation.rst
diff --git a/source/usage/introduction.rst b/docs/usage/introduction.rst
similarity index 100%
rename from source/usage/introduction.rst
rename to docs/usage/introduction.rst
diff --git a/source/usage/quickstart.rst b/docs/usage/quickstart.rst
similarity index 100%
rename from source/usage/quickstart.rst
rename to docs/usage/quickstart.rst

From af467ac3d8959cba64bc7070ed0db7d7ce1e8977 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 09:15:13 -0800
Subject: [PATCH 08/99] Add pytest runner

---
 setup.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 765530a60..6cc2ee523 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
     setup(
         name="maggma",
         use_scm_version=True,
-        setup_requires=["setuptools_scm"],
+        setup_requires=["setuptools_scm", "pytest-runner"],
         description="MongoDB aggregation machine",
         long_description=open(os.path.join(module_dir, "README.md")).read(),
         long_description_content_type="text/markdown",
@@ -47,7 +47,6 @@
             "Topic :: Scientific/Engineering",
         ],
         entry_points={"console_scripts": ["mrun = maggma.cli.mrun:main"]},
-        test_suite="nose.collector",
-        tests_require=["nose"],
-        python_requires=">=3.8",
+        tests_require=["pytest"],
+        python_requires=">=3.7",
     )

From 0ef5bafbc20670055e92fa8714514d318b55e923 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 09:16:04 -0800
Subject: [PATCH 09/99] update gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 11624d729..686b20743 100644
--- a/.gitignore
+++ b/.gitignore
@@ -111,3 +111,4 @@ ENV/
 .idea
 .DS_Store
 .vscode
+.pytest_cache
\ No newline at end of file

From fad2e850f975711d30b1d581a0e34da7ec585e63 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 09:16:32 -0800
Subject: [PATCH 10/99] Bug fixes

---
 maggma/core/__init__.py |  2 +-
 maggma/core/store.py    | 13 +++++++++----
 maggma/stores/aws.py    |  2 +-
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/maggma/core/__init__.py b/maggma/core/__init__.py
index c6d8b73f1..b20bd3cbc 100644
--- a/maggma/core/__init__.py
+++ b/maggma/core/__init__.py
@@ -1,3 +1,3 @@
+from maggma.core.validator import Validator
 from maggma.core.store import Store, Sort, DateTimeFormat, StoreError
 from maggma.core.builder import Builder
-from maggma.core.validator import Validator
\ No newline at end of file
diff --git a/maggma/core/store.py b/maggma/core/store.py
index ba356630a..bc000667c 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -13,7 +13,7 @@
 from enum import Enum
 from typing import Union, Optional, Dict, List, Iterator, Tuple
 
-from pydash import identity
+from pydash import identity, get
 
 from monty.dev import deprecated
 from monty.json import MSONable, MontyDecoder
@@ -128,9 +128,11 @@ def distinct(
         field: Union[List[str], str],
         criteria: Optional[Dict] = None,
         all_exist: bool = False,
-    ) -> List:
+    ) -> Union[List[Dict], List]:
         """
-        Get all distinct values for a key
+        Get all distinct values for a field(s)
+        For a single field, this returns a list of values
+        For multiple fields, this return a list of of dictionaries for each unique combination
 
         Args:
             field: the field(s) to get distinct values for
@@ -146,6 +148,9 @@ def distinct(
         results = [
             key for key, _ in self.groupby(field, properties=field, criteria=criteria)
         ]
+        # Flatten out results if searching for a single field
+        if len(field) == 1:
+            results = [get(r, field[0]) for r in results]
         return results
 
     @abstractmethod
@@ -198,7 +203,7 @@ def groupby(
             limit: limit on total number of documents returned
 
         Returns:
-            generator returning tuples of (key, list of docs)
+            generator returning tuples of (dict, list of docs)
         """
         pass
 
diff --git a/maggma/stores/aws.py b/maggma/stores/aws.py
index 432d0b792..39e47c4f8 100644
--- a/maggma/stores/aws.py
+++ b/maggma/stores/aws.py
@@ -7,7 +7,7 @@
 import zlib
 from datetime import datetime
 
-from maggma.stores import Store
+from maggma.core import Store
 from monty.json import jsanitize
 
 try:

From aaa879f32ace021edeaead95a99d10443100d5ba Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 09:16:38 -0800
Subject: [PATCH 11/99] use _collection

---
 maggma/stores/mongolike.py | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index e174fe306..4e55051ea 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -53,7 +53,7 @@ def __init__(
             password: Password to connect with
         """
         self.database = database
-        self.collection_name = collection_name
+        self._collection_name = collection_name
         self.host = host
         self.port = port
         self.username = username
@@ -71,10 +71,10 @@ def connect(self, force_reset: bool = False):
             db = conn[self.database]
             if self.username != "":
                 db.authenticate(self.username, self.password)
-            self._collection = db[self.collection_name]
+            self._collection = db[self._collection_name]
 
     def __hash__(self):
-        return hash((self.database, self.collection_name, self.last_updated_field))
+        return hash((self.database, self._collection_name, self.last_updated_field))
 
     @classmethod
     def from_db_file(cls, filename: str):
@@ -128,7 +128,7 @@ def groupby(
             set_(group_id, key, "${}".format(key))
         pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
 
-        for d in self.collection.aggregate(pipeline, allowDiskUse=True):
+        for d in self._collection.aggregate(pipeline, allowDiskUse=True):
             yield (d["_id"], d["docs"])
 
     @classmethod
@@ -173,7 +173,7 @@ def query(
         """
         if isinstance(properties, list):
             properties = {p: 1 for p in properties}
-        for d in self.collection.find(
+        for d in self._collection.find(
             filter=criteria, projection=properties, skip=skip, limit=limit
         ):
             yield d
@@ -189,11 +189,11 @@ def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
             bool indicating if the index exists/was created
         """
 
-        if confirm_field_index(self.collection, key):
+        if confirm_field_index(self._collection, key):
             return True
         else:
             try:
-                self.collection.create_index(key, unique=unique, background=True)
+                self._collection.create_index(key, unique=unique, background=True)
                 return True
             except Exception:
                 return False
@@ -238,10 +238,10 @@ def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = No
 
                 requests.append(ReplaceOne(search_doc, d, upsert=True))
 
-        self.collection.bulk_write(requests, ordered=False)
+        self._collection.bulk_write(requests, ordered=False)
 
     def close(self):
-        self.collection.database.client.close()
+        self._collection.database.client.close()
 
 
 class MemoryStore(MongoStore):
@@ -296,13 +296,12 @@ def groupby(
 
         if len(keys) > 1:
             grouper = itemgetter(*keys)
-            for key, grp in groupby(sorted(input_data, key=grouper), grouper):
-                temp_dict = {"_id": zip(keys, key), "docs": list(grp)}
-                yield temp_dict
+            for vals, grp in groupby(sorted(input_data, key=grouper), grouper):
+                yield {k: v for k, v in zip(keys, vals)}, list(grp)
         else:
             grouper = itemgetter(*keys)
-            for key, group in groupby(sorted(input_data, key=grouper), grouper):
-                yield (key, list(group))
+            for val, group in groupby(sorted(input_data, key=grouper), grouper):
+                yield {keys[0]: val}, list(group)
 
     def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
         """
@@ -338,7 +337,7 @@ def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = No
                 else:
                     search_doc = {self.key: d[self.key]}
 
-                self.collection.update_one(d, criteria=search_doc)
+                self._collection.update_one(d, criteria=search_doc)
 
 
 class JSONStore(MemoryStore):
@@ -385,4 +384,4 @@ def __init__(self, dt, **kwargs):
 
     def connect(self, force_reset=False):
         super().connect(force_reset)
-        self.collection.insert_one({self.last_updated_field: self.__dt})
+        self._collection.insert_one({self.last_updated_field: self.__dt})

From b236d8d5dd1a6ea4c7fb330f898debfc14c5301e Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 09:17:20 -0800
Subject: [PATCH 12/99] mongolike tests

---
 maggma/stores/tests/conftest.py       |  10 ++
 maggma/stores/tests/test_mongolike.py | 146 ++++++++++++++++++++++++++
 2 files changed, 156 insertions(+)
 create mode 100644 maggma/stores/tests/conftest.py
 create mode 100644 maggma/stores/tests/test_mongolike.py

diff --git a/maggma/stores/tests/conftest.py b/maggma/stores/tests/conftest.py
new file mode 100644
index 000000000..6a4779361
--- /dev/null
+++ b/maggma/stores/tests/conftest.py
@@ -0,0 +1,10 @@
+from pathlib import Path
+import pytest
+
+
+@pytest.fixture("session")
+def db_json():
+    module_dir = Path(__file__).resolve().parent
+    db_dir = module_dir / ".." / ".." / ".." / "test_files" / "settings_files"
+    db_json = db_dir / "db.json"
+    return db_json.resolve()
diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
new file mode 100644
index 000000000..0a50ad089
--- /dev/null
+++ b/maggma/stores/tests/test_mongolike.py
@@ -0,0 +1,146 @@
+import pytest
+import numpy as np
+import mongomock.collection
+import pymongo.collection
+from datetime import datetime
+import numpy.testing.utils as nptu
+from maggma.core import StoreError
+from maggma.stores import MongoStore, MemoryStore, JSONStore
+
+
+@pytest.fixture
+def mongostore():
+    store = MongoStore("maggma_test", "test")
+    store.connect()
+    yield store
+    store._collection.drop()
+
+
+def test_connect():
+    mongostore = MongoStore("maggma_test", "test")
+    assert mongostore._collection is None
+    mongostore.connect()
+    assert isinstance(mongostore._collection, pymongo.collection.Collection)
+
+
+def test_query(mongostore):
+    mongostore._collection.insert({"a": 1, "b": 2, "c": 3})
+    assert mongostore.query_one(properties=["a"])["a"] == 1
+    assert mongostore.query_one(properties=["a"])["a"] == 1
+    assert mongostore.query_one(properties=["b"])["b"] == 2
+    assert mongostore.query_one(properties=["c"])["c"] == 3
+
+
+def test_distinct(mongostore):
+    mongostore._collection.insert({"a": 1, "b": 2, "c": 3})
+    mongostore._collection.insert({"a": 4, "d": 5, "e": 6, "g": {"h": 1}})
+    assert set(mongostore.distinct("a")) == {1, 4}
+
+    # Test list distinct functionality
+    mongostore._collection.insert({"a": 4, "d": 6, "e": 7})
+    mongostore._collection.insert({"a": 4, "d": 6, "g": {"h": 2}})
+    ad_distinct = mongostore.distinct(["a", "d"])
+    assert len(ad_distinct) == 3
+    assert {"a": 4, "d": 6} in ad_distinct
+    assert {"a": 1} in ad_distinct
+    assert len(mongostore.distinct(["d", "e"], {"a": 4})) == 3
+    all_exist = mongostore.distinct(["a", "b"], all_exist=True)
+    assert len(all_exist) == 1
+    all_exist2 = mongostore.distinct(["a", "e"], all_exist=True, criteria={"d": 6})
+    assert len(all_exist2) == 1
+
+    # Test distinct subdocument functionality
+    ghs = mongostore.distinct("g.h")
+    assert set(ghs), {1 == 2}
+    ghs_ds = mongostore.distinct(["d", "g.h"], all_exist=True)
+    assert {s["g"]["h"] for s in ghs_ds}, {1 == 2}
+    assert {s["d"] for s in ghs_ds}, {5 == 6}
+
+
+def test_update(mongostore):
+    mongostore.update([{"e": 6, "d": 4}], key="e")
+    assert (
+        mongostore.query_one(criteria={"d": {"$exists": 1}}, properties=["d"])["d"] == 4
+    )
+
+    mongostore.update([{"e": 7, "d": 8, "f": 9}], key=["d", "f"])
+    assert mongostore.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"] == 7
+
+    mongostore.update([{"e": 11, "d": 8, "f": 9}], key=["d", "f"])
+    assert mongostore.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"] == 11
+
+
+def test_groupby(mongostore):
+    mongostore._collection.drop()
+    mongostore.update(
+        [
+            {"e": 7, "d": 9, "f": 9},
+            {"e": 7, "d": 9, "f": 10},
+            {"e": 8, "d": 9, "f": 11},
+            {"e": 9, "d": 10, "f": 12},
+        ],
+        key="f",
+    )
+    data = list(mongostore.groupby("d"))
+    assert len(data) == 2
+    grouped_by_9 = [g[1] for g in data if g[0]["d"] == 9][0]
+    assert len(grouped_by_9) == 3
+    grouped_by_10 = [g[1] for g in data if g[0]["d"] == 10][0]
+    assert len(grouped_by_10) == 1
+
+    data = list(mongostore.groupby(["e", "d"]))
+    assert len(data) == 3
+
+
+def test_from_db_file(mongostore, db_json):
+    ms = MongoStore.from_db_file(db_json)
+    assert ms._collection_name == "tmp"
+
+
+def test_from_collection(mongostore, db_json):
+    ms = MongoStore.from_db_file(db_json)
+    ms.connect()
+
+    other_ms = MongoStore.from_collection(ms._collection)
+    assert ms._collection_name == other_ms._collection_name
+    assert ms.database == other_ms.database
+
+
+def test_last_updated(mongostore):
+    assert mongostore.last_updated == datetime.min
+    start_time = datetime.now()
+    mongostore._collection.insert_one({mongostore.key: 1, "a": 1})
+    with pytest.raises(StoreError) as cm:
+        mongostore.last_updated
+    assert cm.match(mongostore.last_updated_field)
+    mongostore.update(
+        [{mongostore.key: 1, "a": 1, mongostore.last_updated_field: datetime.now()}]
+    )
+    assert mongostore.last_updated > start_time
+
+
+def test_newer_in(mongostore):
+    target = MongoStore("maggma_test", "test_target")
+    target.connect()
+
+    # make sure docs are newer in mongostore then target and check updated_keys
+
+    target.update(
+        [
+            {mongostore.key: i, mongostore.last_updated_field: datetime.now()}
+            for i in range(10)
+        ]
+    )
+
+    # Update docs in source
+    mongostore.update(
+        [
+            {mongostore.key: i, mongostore.last_updated_field: datetime.now()}
+            for i in range(10)
+        ]
+    )
+
+    assert len(target.newer_in(mongostore)) == 10
+    assert len(mongostore.newer_in(target)) == 0
+
+    target._collection.drop()

From 4c417b0fe4cfc05a5713a3fa0235229e7475a507 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 09:17:33 -0800
Subject: [PATCH 13/99] fix utils

---
 maggma/utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/maggma/utils.py b/maggma/utils.py
index 32019587a..07910391d 100644
--- a/maggma/utils.py
+++ b/maggma/utils.py
@@ -245,16 +245,16 @@ def source_keys_updated(source, target, query=None):
 
     keys_updated = set()  # Handle non-unique keys, e.g. for GroupBuilder.
 
-    props = {target.key: 1, target.lu_field: 1, "_id": 0}
+    props = {target.key: 1, target.last_updated_field: 1, "_id": 0}
     target_dates = {
-        d[target.key]: target.lu_func[0](d[target.lu_field])
+        d[target.key]: target._lu_func[0](d[target.last_updated_field])
         for d in target.query(properties=props)
     }
 
-    props = {source.key: 1, source.lu_field: 1, "_id": 0}
+    props = {source.key: 1, source.last_updated_field: 1, "_id": 0}
     cursor_source = source.query(criteria=query, properties=props)
     for sdoc in cursor_source:
-        key, lu = sdoc[source.key], source.lu_func[0](sdoc[source.lu_field])
+        key, lu = sdoc[source.key], source._lu_func[0](sdoc[source.last_updated_field])
         if key not in target_dates:
             keys_updated.add(key)
         elif lu > target_dates[key]:

From d8f04e171bf3cedfbba68accbbc4d7863627b5b6 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 09:38:09 -0800
Subject: [PATCH 14/99] update tests and fix buges

---
 maggma/stores/mongolike.py            |  6 ++-
 maggma/stores/tests/conftest.py       | 11 +++--
 maggma/stores/tests/test_mongolike.py | 71 ++++++++++++++++++++++-----
 3 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index 4e55051ea..8d07e1380 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -254,7 +254,7 @@ def __init__(self, name: str = "memory_db", **kwargs):
         self.name = name
         self._collection = None
         self.kwargs = kwargs
-        super().__init__(**kwargs)
+        super(MongoStore, self).__init__(**kwargs)
 
     def connect(self, force_reset: bool = False):
         """
@@ -337,7 +337,9 @@ def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = No
                 else:
                     search_doc = {self.key: d[self.key]}
 
-                self._collection.update_one(d, criteria=search_doc)
+                self._collection.replace_one(
+                    filter=search_doc, replacement=d, upsert=True
+                )
 
 
 class JSONStore(MemoryStore):
diff --git a/maggma/stores/tests/conftest.py b/maggma/stores/tests/conftest.py
index 6a4779361..de60dcfb7 100644
--- a/maggma/stores/tests/conftest.py
+++ b/maggma/stores/tests/conftest.py
@@ -2,9 +2,14 @@
 import pytest
 
 
-@pytest.fixture("session")
-def db_json():
+@pytest.fixture
+def test_dir():
     module_dir = Path(__file__).resolve().parent
-    db_dir = module_dir / ".." / ".." / ".." / "test_files" / "settings_files"
+    test_dir = module_dir / ".." / ".." / ".." / "test_files" 
+    return test_dir.resolve()
+
+@pytest.fixture
+def db_json(test_dir):
+    db_dir = test_dir / "settings_files"
     db_json = db_dir / "db.json"
     return db_json.resolve()
diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
index 0a50ad089..897ad499a 100644
--- a/maggma/stores/tests/test_mongolike.py
+++ b/maggma/stores/tests/test_mongolike.py
@@ -1,9 +1,7 @@
 import pytest
-import numpy as np
 import mongomock.collection
 import pymongo.collection
 from datetime import datetime
-import numpy.testing.utils as nptu
 from maggma.core import StoreError
 from maggma.stores import MongoStore, MemoryStore, JSONStore
 
@@ -16,14 +14,21 @@ def mongostore():
     store._collection.drop()
 
 
-def test_connect():
+@pytest.fixture
+def memorystore():
+    store = MemoryStore()
+    store.connect()
+    return store
+
+
+def test_mongostore_connect():
     mongostore = MongoStore("maggma_test", "test")
     assert mongostore._collection is None
     mongostore.connect()
     assert isinstance(mongostore._collection, pymongo.collection.Collection)
 
 
-def test_query(mongostore):
+def test_mongostore_query(mongostore):
     mongostore._collection.insert({"a": 1, "b": 2, "c": 3})
     assert mongostore.query_one(properties=["a"])["a"] == 1
     assert mongostore.query_one(properties=["a"])["a"] == 1
@@ -31,7 +36,7 @@ def test_query(mongostore):
     assert mongostore.query_one(properties=["c"])["c"] == 3
 
 
-def test_distinct(mongostore):
+def test_mongostore_distinct(mongostore):
     mongostore._collection.insert({"a": 1, "b": 2, "c": 3})
     mongostore._collection.insert({"a": 4, "d": 5, "e": 6, "g": {"h": 1}})
     assert set(mongostore.distinct("a")) == {1, 4}
@@ -57,7 +62,7 @@ def test_distinct(mongostore):
     assert {s["d"] for s in ghs_ds}, {5 == 6}
 
 
-def test_update(mongostore):
+def test_mongostore_update(mongostore):
     mongostore.update([{"e": 6, "d": 4}], key="e")
     assert (
         mongostore.query_one(criteria={"d": {"$exists": 1}}, properties=["d"])["d"] == 4
@@ -70,7 +75,7 @@ def test_update(mongostore):
     assert mongostore.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"] == 11
 
 
-def test_groupby(mongostore):
+def test_mongostore_groupby(mongostore):
     mongostore._collection.drop()
     mongostore.update(
         [
@@ -92,12 +97,12 @@ def test_groupby(mongostore):
     assert len(data) == 3
 
 
-def test_from_db_file(mongostore, db_json):
+def test_mongostore_from_db_file(mongostore, db_json):
     ms = MongoStore.from_db_file(db_json)
     assert ms._collection_name == "tmp"
 
 
-def test_from_collection(mongostore, db_json):
+def test_mongostore_from_collection(mongostore, db_json):
     ms = MongoStore.from_db_file(db_json)
     ms.connect()
 
@@ -106,7 +111,7 @@ def test_from_collection(mongostore, db_json):
     assert ms.database == other_ms.database
 
 
-def test_last_updated(mongostore):
+def test_mongostore_last_updated(mongostore):
     assert mongostore.last_updated == datetime.min
     start_time = datetime.now()
     mongostore._collection.insert_one({mongostore.key: 1, "a": 1})
@@ -119,7 +124,7 @@ def test_last_updated(mongostore):
     assert mongostore.last_updated > start_time
 
 
-def test_newer_in(mongostore):
+def test_mongostore_newer_in(mongostore):
     target = MongoStore("maggma_test", "test_target")
     target.connect()
 
@@ -144,3 +149,47 @@ def test_newer_in(mongostore):
     assert len(mongostore.newer_in(target)) == 0
 
     target._collection.drop()
+
+
+# Memory store tests
+def test_memory_store_connect():
+    memorystore = MemoryStore()
+    with pytest.raises(Exception):
+        memorystore.collection
+    memorystore.connect()
+    assert isinstance(memorystore.collection, mongomock.collection.Collection)
+
+
+def test_groupby(memorystore):
+    memorystore.update(
+        [
+            {"e": 7, "d": 9, "f": 9},
+            {"e": 7, "d": 9, "f": 10},
+            {"e": 8, "d": 9, "f": 11},
+            {"e": 9, "d": 10, "f": 12},
+        ],
+        key="f",
+    )
+    data = list(memorystore.groupby("d"))
+    assert len(data) == 2
+    grouped_by_9 = [g[1] for g in data if g[0]["d"] == 9][0]
+    assert len(grouped_by_9) == 3
+    grouped_by_10 = [g[1] for g in data if g[0]["d"] == 10][0]
+    assert len(grouped_by_10) == 1
+
+    data = list(memorystore.groupby(["e", "d"]))
+    assert len(data) == 3
+
+
+def test_json_store_load(test_dir):
+    files = []
+    for f in ["a.json", "b.json"]:
+        files.append(test_dir / "test_set" / f)
+
+    jsonstore = JSONStore(files)
+    jsonstore.connect()
+    assert len(list(jsonstore.query())) == 20
+
+    jsonstore = JSONStore(test_dir / "test_set" /"c.json.gz")
+    jsonstore.connect()
+    assert len(list(jsonstore.query())) == 20

From 07d66084b2fbb27341bd0bfd4b21f58793991064 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 09:53:34 -0800
Subject: [PATCH 15/99] more cleanup of tests

---
 maggma/stores/tests/test_mongolike.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
index 897ad499a..faaf17b57 100644
--- a/maggma/stores/tests/test_mongolike.py
+++ b/maggma/stores/tests/test_mongolike.py
@@ -29,7 +29,7 @@ def test_mongostore_connect():
 
 
 def test_mongostore_query(mongostore):
-    mongostore._collection.insert({"a": 1, "b": 2, "c": 3})
+    mongostore._collection.insert_one({"a": 1, "b": 2, "c": 3})
     assert mongostore.query_one(properties=["a"])["a"] == 1
     assert mongostore.query_one(properties=["a"])["a"] == 1
     assert mongostore.query_one(properties=["b"])["b"] == 2
@@ -37,13 +37,13 @@ def test_mongostore_query(mongostore):
 
 
 def test_mongostore_distinct(mongostore):
-    mongostore._collection.insert({"a": 1, "b": 2, "c": 3})
-    mongostore._collection.insert({"a": 4, "d": 5, "e": 6, "g": {"h": 1}})
+    mongostore._collection.insert_one({"a": 1, "b": 2, "c": 3})
+    mongostore._collection.insert_one({"a": 4, "d": 5, "e": 6, "g": {"h": 1}})
     assert set(mongostore.distinct("a")) == {1, 4}
 
     # Test list distinct functionality
-    mongostore._collection.insert({"a": 4, "d": 6, "e": 7})
-    mongostore._collection.insert({"a": 4, "d": 6, "g": {"h": 2}})
+    mongostore._collection.insert_one({"a": 4, "d": 6, "e": 7})
+    mongostore._collection.insert_one({"a": 4, "d": 6, "g": {"h": 2}})
     ad_distinct = mongostore.distinct(["a", "d"])
     assert len(ad_distinct) == 3
     assert {"a": 4, "d": 6} in ad_distinct
@@ -154,10 +154,9 @@ def test_mongostore_newer_in(mongostore):
 # Memory store tests
 def test_memory_store_connect():
     memorystore = MemoryStore()
-    with pytest.raises(Exception):
-        memorystore.collection
+    assert memorystore._collection is None
     memorystore.connect()
-    assert isinstance(memorystore.collection, mongomock.collection.Collection)
+    assert isinstance(memorystore._collection, mongomock.collection.Collection)
 
 
 def test_groupby(memorystore):
@@ -190,6 +189,6 @@ def test_json_store_load(test_dir):
     jsonstore.connect()
     assert len(list(jsonstore.query())) == 20
 
-    jsonstore = JSONStore(test_dir / "test_set" /"c.json.gz")
+    jsonstore = JSONStore(test_dir / "test_set" / "c.json.gz")
     jsonstore.connect()
     assert len(list(jsonstore.query())) == 20

From a5fee2539ec6493837c6e8008e8a7a867d2f463d Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 09:53:44 -0800
Subject: [PATCH 16/99] add gridfs store tests

---
 maggma/stores/tests/test_gridfs.py | 70 ++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 maggma/stores/tests/test_gridfs.py

diff --git a/maggma/stores/tests/test_gridfs.py b/maggma/stores/tests/test_gridfs.py
new file mode 100644
index 000000000..8d7523f01
--- /dev/null
+++ b/maggma/stores/tests/test_gridfs.py
@@ -0,0 +1,70 @@
+import pytest
+import numpy as np
+import numpy.testing.utils as nptu
+from datetime import datetime
+from maggma.stores import GridFSStore
+
+
+@pytest.fixture
+def gridfsstore():
+    store = GridFSStore("maggma_test", "test", key="task_id")
+    store.connect()
+    yield store
+    store._files_collection.drop()
+    store._chunks_collection.drop()
+
+
+def test_update(gridfsstore):
+    data1 = np.random.rand(256)
+    data2 = np.random.rand(256)
+    # Test metadata storage
+    gridfsstore.update([{"task_id": "mp-1", "data": data1}])
+    assert (
+        gridfsstore._files_collection.find_one({"metadata.task_id": "mp-1"}) is not None
+    )
+
+    # Test storing data
+    gridfsstore.update([{"task_id": "mp-1", "data": data2}])
+    assert len(list(gridfsstore.query({"task_id": "mp-1"}))) == 1
+    assert "task_id" in gridfsstore.query_one({"task_id": "mp-1"})
+    nptu.assert_almost_equal(
+        gridfsstore.query_one({"task_id": "mp-1"})["data"], data2, 7
+    )
+
+    # Test storing compressed data
+    gridfsstore = GridFSStore("maggma_test", "test", key="task_id", compression=True)
+    gridfsstore.connect()
+    gridfsstore.update([{"task_id": "mp-1", "data": data1}])
+    assert (
+        gridfsstore._files_collection.find_one({"metadata.compression": "zlib"})
+        is not None
+    )
+
+    nptu.assert_almost_equal(
+        gridfsstore.query_one({"task_id": "mp-1"})["data"], data1, 7
+    )
+
+
+def test_query(gridfsstore):
+    data1 = np.random.rand(256)
+    data2 = np.random.rand(256)
+    tic = datetime(2018, 4, 12, 16)
+    gridfsstore.update([{"task_id": "mp-1", "data": data1}])
+    gridfsstore.update(
+        [{"task_id": "mp-2", "data": data2, gridfsstore.last_updated_field: tic}], update_lu=False
+    )
+
+    doc = gridfsstore.query_one(criteria={"task_id": "mp-1"})
+    nptu.assert_almost_equal(doc["data"], data1, 7)
+
+    doc = gridfsstore.query_one(criteria={"task_id": "mp-2"})
+    nptu.assert_almost_equal(doc["data"], data2, 7)
+    assert gridfsstore.last_updated_field in doc
+
+    assert gridfsstore.query_one(criteria={"task_id": "mp-3"}) is None
+
+
+@pytest.mark.skip("Not Done")
+def test_distinct(gridfsstore):
+    # TODO
+    pass

From 5317e6a11b88d57acc1bd7a1b867c977f544248c Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 13:38:22 -0800
Subject: [PATCH 17/99] add more stores into main module

---
 maggma/stores/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/maggma/stores/__init__.py b/maggma/stores/__init__.py
index 19addd98e..1e3ec0a77 100644
--- a/maggma/stores/__init__.py
+++ b/maggma/stores/__init__.py
@@ -1,3 +1,5 @@
 from maggma.stores.mongolike import MongoStore, JSONStore, MemoryStore
 from maggma.stores.gridfs import GridFSStore
-from maggma.stores.aws import AmazonS3Store
\ No newline at end of file
+from maggma.stores.advanced_stores import MongograntStore, VaultStore, AliasingStore, SandboxStore
+from maggma.stores.aws import AmazonS3Store
+from maggma.stores.compound_stores import JointStore
\ No newline at end of file

From 32714aef723e2ff7b94d9df4097f6e48a3d0ece3 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 6 Nov 2019 13:38:35 -0800
Subject: [PATCH 18/99] update advanced stores

---
 maggma/stores/advanced_stores.py            | 142 ++++++++--
 maggma/stores/tests/test_advanced_stores.py | 280 ++++++++++++++++++++
 2 files changed, 393 insertions(+), 29 deletions(-)
 create mode 100644 maggma/stores/tests/test_advanced_stores.py

diff --git a/maggma/stores/advanced_stores.py b/maggma/stores/advanced_stores.py
index 961aca13d..281579e40 100644
--- a/maggma/stores/advanced_stores.py
+++ b/maggma/stores/advanced_stores.py
@@ -5,7 +5,7 @@
 import os
 import hvac
 import json
-from typing import Union, Optional, Dict, List, Iterator
+from typing import Union, Optional, Dict, List, Iterator, Tuple
 
 from maggma.core import Store, StoreError, Sort
 from maggma.stores.mongolike import MongoStore
@@ -56,7 +56,7 @@ def __init__(
                 "arguments. Use `mongogrant_spec`."
             )
         self.kwargs = kwargs
-        super().__init__(**kwargs)
+        super(MongoStore, self).__init__(**kwargs)
 
     def connect(self, force_reset: bool = False):
         """
@@ -75,7 +75,7 @@ def connect(self, force_reset: bool = False):
             self._collection = db[self.collection_name]
 
     def __hash__(self):
-        return hash((self.mongogrant_spec, self.collection_name, self.lu_field))
+        return hash((self.mongogrant_spec, self.collection_name, self.last_updated_field))
 
 
 class VaultStore(MongoStore):
@@ -150,7 +150,12 @@ def __init__(self, store: Store, aliases: Dict, **kwargs):
         self.reverse_aliases = {v: k for k, v in aliases.items()}
         self.kwargs = kwargs
 
-        kwargs.update({"lu_field": store.lu_field, "lu_type": store.lu_type})
+        kwargs.update(
+            {
+                "last_updated_field": store.last_updated_field,
+                "last_updated_type": store.last_updated_type,
+            }
+        )
         super(AliasingStore, self).__init__(**kwargs)
 
     def query(
@@ -185,7 +190,10 @@ def query(
             yield d
 
     def distinct(
-        self, field: Union[List[str], str], criteria: Optional[Dict] = None, all_exist: bool = False
+        self,
+        field: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        all_exist: bool = False,
     ) -> List:
         """
         Get all distinct values for a key
@@ -202,7 +210,30 @@ def distinct(
         field = [self.aliases[f] for f in field]
         return self.store.distinct(field, criteria=criteria)
 
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
+    def groupby(
+        self,
+        keys: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Tuple[Dict, List[Dict]]]:
+        """
+        Simple grouping function that will group documents
+        by keys.
+
+        Args:
+            keys: fields to group documents
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
+
+        Returns:
+            generator returning tuples of (dict, list of docs)
+        """
         # Convert to a list
         keys = keys if isinstance(keys, list) else [keys]
 
@@ -215,10 +246,20 @@ def groupby(self, keys, criteria=None, properties=None, **kwargs):
         lazy_substitute(criteria, self.reverse_aliases)
 
         return self.store.groupby(
-            keys=keys, properties=properties, criteria=criteria, **kwargs
+            keys=keys, properties=properties, criteria=criteria, skip=skip, limit=limit
         )
 
-    def update(self, docs, update_lu=True, key=None):
+    def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
+        """
+        Update documents into the Store
+
+        Args:
+            docs: the document or list of documents to update
+            key: field name(s) to determine uniqueness for a
+                 document, can be a list of multiple fields,
+                 a single field, or None if the Store's key
+                 field is to be used
+        """
         key = key if key else self.key
 
         for d in docs:
@@ -227,7 +268,7 @@ def update(self, docs, update_lu=True, key=None):
         if key in self.aliases:
             key = self.aliases[key]
 
-        self.store.update(docs, update_lu=update_lu, key=key)
+        self.store.update(docs, key=key)
 
     def ensure_index(self, key, unique=False, **kwargs):
         if key in self.aliases:
@@ -250,7 +291,7 @@ class SandboxStore(Store):
     Provides a sandboxed view to another store
     """
 
-    def __init__(self, store, sandbox, exclusive=False):
+    def __init__(self, store: Store, sandbox: str, exclusive: bool = False):
         """
         store (Store): store to wrap sandboxing around
         sandbox (string): the corresponding sandbox
@@ -261,13 +302,16 @@ def __init__(self, store, sandbox, exclusive=False):
         self.exclusive = exclusive
         super().__init__(
             key=self.store.key,
-            lu_field=self.store.lu_field,
-            lu_type=self.store.lu_type,
+            last_updated_field=self.store.last_updated_field,
+            last_updated_type=self.store.last_updated_type,
             validator=self.store.validator,
         )
 
     @property
-    def sbx_criteria(self):
+    def sbx_criteria(self) -> Dict:
+        """
+        Returns the sandbox criteria dict used to filter the source store
+        """
         if self.exclusive:
             return {"sbxn": self.sandbox}
         else:
@@ -275,41 +319,81 @@ def sbx_criteria(self):
                 "$or": [{"sbxn": {"$in": [self.sandbox]}}, {"sbxn": {"$exists": False}}]
             }
 
-    def query(self, criteria=None, properties=None, **kwargs):
-        criteria = (
-            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
-        )
-        return self.store.query(properties=properties, criteria=criteria, **kwargs)
+    def query(
+        self,
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Dict]:
+        """
+        Queries the Store for a set of documents
 
-    def query_one(self, criteria=None, properties=None, **kwargs):
+        Args:
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
+        """
         criteria = (
             dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
         )
-        return self.store.query_one(properties=properties, criteria=criteria, **kwargs)
-
-    def distinct(self, key, criteria=None, **kwargs):
-        criteria = (
-            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
+        return self.store.query(
+            properties=properties, criteria=criteria, sort=sort, limit=limit, skip=skip
         )
-        return self.store.distinct(key=key, criteria=criteria, **kwargs)
 
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
+    def groupby(
+        self,
+        keys: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Tuple[Dict, List[Dict]]]:
+        """
+        Simple grouping function that will group documents
+        by keys.
+
+        Args:
+            keys: fields to group documents
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
+
+        Returns:
+            generator returning tuples of (dict, list of docs)
+        """
         criteria = (
             dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
         )
 
         return self.store.groupby(
-            keys=keys, properties=properties, criteria=criteria, **kwargs
+            keys=keys, properties=properties, criteria=criteria, skip=skip, limit=limit
         )
 
-    def update(self, docs, update_lu=True, key=None):
+    def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
+        """
+        Update documents into the Store
+
+        Args:
+            docs: the document or list of documents to update
+            key: field name(s) to determine uniqueness for a
+                 document, can be a list of multiple fields,
+                 a single field, or None if the Store's key
+                 field is to be used
+        """
         for d in docs:
             if "sbxn" in d:
                 d["sbxn"] = list(set(d["sbxn"] + [self.sandbox]))
             else:
                 d["sbxn"] = [self.sandbox]
 
-        self.store.update(docs, update_lu=update_lu, key=key)
+        self.store.update(docs, key=key)
 
     def ensure_index(self, key, unique=False, **kwargs):
         return self.store.ensure_index(key, unique, **kwargs)
diff --git a/maggma/stores/tests/test_advanced_stores.py b/maggma/stores/tests/test_advanced_stores.py
new file mode 100644
index 000000000..2592a47fb
--- /dev/null
+++ b/maggma/stores/tests/test_advanced_stores.py
@@ -0,0 +1,280 @@
+# coding: utf-8
+"""
+Tests for advanced stores
+"""
+import time
+
+import os
+import shutil
+import signal
+import subprocess
+import tempfile
+
+from mongogrant.client import seed, check
+from mongogrant.config import Config
+from mongogrant import Client
+from pymongo import MongoClient
+from pymongo.collection import Collection
+from unittest.mock import patch
+from uuid import uuid4
+
+from maggma.stores import (
+    MongoStore,
+    MongograntStore,
+    VaultStore,
+    MemoryStore,
+    AliasingStore,
+    SandboxStore,
+)
+from maggma.stores.advanced_stores import substitute
+import pytest
+
+
+@pytest.fixture("module")
+def mgrant_server():
+    _, config_path = tempfile.mkstemp()
+    _, mdlogpath = tempfile.mkstemp()
+    mdpath = tempfile.mkdtemp()
+    mdport = 27020
+    if not (os.getenv("CONTINUOUS_INTEGRATION") and os.getenv("TRAVIS")):
+        basecmd = (
+            f"mongod --port {mdport} --dbpath {mdpath} --quiet --logpath {mdlogpath} "
+            "--bind_ip_all --auth"
+        )
+        mongod_process = subprocess.Popen(basecmd, shell=True, start_new_session=True)
+        time.sleep(5)
+        client = MongoClient(port=mdport)
+        client.admin.command(
+            "createUser", "mongoadmin", pwd="mongoadminpass", roles=["root"]
+        )
+        client.close()
+    dbname = "test_" + uuid4().hex
+    db = MongoClient(f"mongodb://mongoadmin:mongoadminpass@127.0.0.1:{mdport}/admin")[
+        dbname
+    ]
+    db.command("createUser", "reader", pwd="readerpass", roles=["read"])
+    db.command("createUser", "writer", pwd="writerpass", roles=["readWrite"])
+    db.client.close()
+
+    # Yields the fixture to use
+    yield config_path, mdport, dbname
+
+    os.remove(config_path)
+    if not (os.getenv("CONTINUOUS_INTEGRATION") and os.getenv("TRAVIS")):
+        os.killpg(os.getpgid(mongod_process.pid), signal.SIGTERM)
+        os.waitpid(mongod_process.pid, 0)
+    shutil.rmtree(mdpath)
+    os.remove(mdlogpath)
+
+
+@pytest.fixture("module")
+def mgrant_user(mgrant_server):
+    config_path, mdport, dbname = mgrant_server
+
+    config = Config(check=check, path=config_path, seed=seed())
+    client = Client(config)
+    client.set_auth(
+        host=f"localhost:{mdport}",
+        db=dbname,
+        role="read",
+        username="reader",
+        password="readerpass",
+    )
+    client.set_auth(
+        host=f"localhost:{mdport}",
+        db=dbname,
+        role="readWrite",
+        username="writer",
+        password="writerpass",
+    )
+    client.set_alias("testhost", f"localhost:{mdport}", which="host")
+    client.set_alias("testdb", dbname, which="db")
+
+    return client
+
+
+def connected_user(store):
+    return store._collection.database.command("connectionStatus")["authInfo"][
+        "authenticatedUsers"
+    ][0]["user"]
+
+
+def test_mgrant_connect(mgrant_server, mgrant_user):
+    config_path, mdport, dbname = mgrant_server
+    assert mgrant_user is not None
+    store = MongograntStore(
+        "ro:testhost/testdb", "tasks", mgclient_config_path=config_path
+    )
+    store.connect()
+    assert isinstance(store._collection, Collection)
+    assert connected_user(store) == "reader"
+    store = MongograntStore(
+        "rw:testhost/testdb", "tasks", mgclient_config_path=config_path
+    )
+    store.connect()
+    assert isinstance(store._collection, Collection)
+    assert connected_user(store) == "writer"
+
+
+def vault_store():
+    with patch("hvac.Client") as mock:
+        instance = mock.return_value
+        instance.auth_github.return_value = True
+        instance.is_authenticated.return_value = True
+        instance.read.return_value = {
+            "wrap_info": None,
+            "request_id": "2c72c063-2452-d1cd-19a2-91163c7395f7",
+            "data": {
+                "value": '{"db": "mg_core_prod", "host": "matgen2.lbl.gov", "username": "test", "password": "pass"}'
+            },
+            "auth": None,
+            "warnings": None,
+            "renewable": False,
+            "lease_duration": 2764800,
+            "lease_id": "",
+        }
+        v = VaultStore("test_coll", "secret/matgen/maggma")
+
+    return v
+
+
+def test_vault_init():
+    """
+    Test initing a vault store using a mock hvac client
+    """
+    os.environ["VAULT_ADDR"] = "https://fake:8200/"
+    os.environ["VAULT_TOKEN"] = "dummy"
+
+    # Just test that we successfully instantiated
+    v = vault_store()
+    assert isinstance(v, MongoStore)
+
+
+def test_vault_github_token():
+    """
+    Test using VaultStore with GITHUB_TOKEN and mock hvac
+    """
+    # Save token in env
+    os.environ["VAULT_ADDR"] = "https://fake:8200/"
+    os.environ["GITHUB_TOKEN"] = "dummy"
+
+    v = vault_store()
+    # Just test that we successfully instantiated
+    assert isinstance(v, MongoStore)
+
+
+def test_vault_missing_env():
+    """
+    Test VaultStore should raise an error if environment is not set
+    """
+    del os.environ["VAULT_TOKEN"]
+    del os.environ["VAULT_ADDR"]
+    del os.environ["GITHUB_TOKEN"]
+
+    # Create should raise an error
+    with pytest.raises(RuntimeError):
+        vault_store()
+
+
+@pytest.fixture
+def alias_store():
+    memorystore = MemoryStore("test")
+    memorystore.connect()
+    alias_store = AliasingStore(memorystore, {"a": "b", "c.d": "e", "f": "g.h"})
+    return alias_store
+
+
+def test_aliasing_query(alias_store):
+
+    d = [{"b": 1}, {"e": 2}, {"g": {"h": 3}}]
+    alias_store.store._collection.insert_many(d)
+
+    assert "a" in list(alias_store.query(criteria={"a": {"$exists": 1}}))[0]
+    assert "c" in list(alias_store.query(criteria={"c.d": {"$exists": 1}}))[0]
+    assert "d" in list(alias_store.query(criteria={"c.d": {"$exists": 1}}))[0].get(
+        "c", {}
+    )
+    assert "f" in list(alias_store.query(criteria={"f": {"$exists": 1}}))[0]
+
+
+def test_aliasing_update(alias_store):
+
+    alias_store.update(
+        [
+            {"task_id": "mp-3", "a": 4},
+            {"task_id": "mp-4", "c": {"d": 5}},
+            {"task_id": "mp-5", "f": 6},
+        ]
+    )
+    assert list(alias_store.query(criteria={"task_id": "mp-3"}))[0]["a"] == 4
+    assert list(alias_store.query(criteria={"task_id": "mp-4"}))[0]["c"]["d"] == 5
+    assert list(alias_store.query(criteria={"task_id": "mp-5"}))[0]["f"] == 6
+
+    assert list(alias_store.store.query(criteria={"task_id": "mp-3"}))[0]["b"] == 4
+    assert list(alias_store.store.query(criteria={"task_id": "mp-4"}))[0]["e"] == 5
+
+    assert list(alias_store.store.query(criteria={"task_id": "mp-5"}))[0]["g"]["h"] == 6
+
+
+def test_aliasing_substitute(alias_store):
+    aliases = {"a": "b", "c.d": "e", "f": "g.h"}
+
+    d = {"b": 1}
+    substitute(d, aliases)
+    assert "a" in d
+
+    d = {"e": 1}
+    substitute(d, aliases)
+    assert "c" in d
+    assert "d" in d.get("c", {})
+
+    d = {"g": {"h": 4}}
+    substitute(d, aliases)
+    assert "f" in d
+
+    d = None
+    substitute(d, aliases)
+    assert d is None
+
+
+@pytest.fixture
+def sandbox_store():
+    memstore = MemoryStore()
+    store = SandboxStore(memstore, sandbox="test")
+    store.connect()
+    return store
+
+
+def test_sandbox_query(sandbox_store):
+    sandbox_store.collection.insert_one({"a": 1, "b": 2, "c": 3})
+    assert sandbox_store.query_one(properties=["a"])["a"] == 1
+
+    sandbox_store.collection.insert_one({"a": 2, "b": 2, "sbxn": ["test"]})
+    assert sandbox_store.query_one(properties=["b"], criteria={"a": 2})["b"] == 2
+
+    sandbox_store.collection.insert_one({"a": 3, "b": 2, "sbxn": ["not_test"]})
+    assert sandbox_store.query_one(properties=["c"], criteria={"a": 3}) is None
+
+
+def test_sandbox_distinct(sandbox_store):
+    sandbox_store.connect()
+    sandbox_store.collection.insert_one({"a": 1, "b": 2, "c": 3})
+    assert sandbox_store.distinct("a") == [1]
+
+    sandbox_store.collection.insert_one({"a": 4, "d": 5, "e": 6, "sbxn": ["test"]})
+    assert sandbox_store.distinct("a")[1] == 4
+
+    sandbox_store.collection.insert_one({"a": 7, "d": 8, "e": 9, "sbxn": ["not_test"]})
+    assert sandbox_store.distinct("a")[1] == 4
+
+
+def test_sandbox_update(sandbox_store):
+    sandbox_store.connect()
+    sandbox_store.update([{"e": 6, "d": 4}], key="e")
+    assert (
+        next(sandbox_store.query(criteria={"d": {"$exists": 1}}, properties=["d"]))["d"]
+        == 4
+    )
+    assert sandbox_store.collection.find_one({"e": 6})["sbxn"] == ["test"]
+    sandbox_store.update([{"e": 7, "sbxn": ["core"]}], key="e")
+    assert set(sandbox_store.query_one(criteria={"e": 7})["sbxn"]) == {"test", "core"}

From d39dbf73a664ac23aec455cb6037b4f0e66e2b06 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 7 Nov 2019 09:55:09 -0800
Subject: [PATCH 19/99] more tests

---
 maggma/stores/tests/test_aws.py             |  48 +++++
 maggma/stores/tests/test_compound_stores.py | 192 ++++++++++++++++++++
 2 files changed, 240 insertions(+)
 create mode 100644 maggma/stores/tests/test_aws.py
 create mode 100644 maggma/stores/tests/test_compound_stores.py

diff --git a/maggma/stores/tests/test_aws.py b/maggma/stores/tests/test_aws.py
new file mode 100644
index 000000000..b1cd09876
--- /dev/null
+++ b/maggma/stores/tests/test_aws.py
@@ -0,0 +1,48 @@
+import pytest
+import json
+import boto3
+import zlib
+from moto import mock_s3
+from maggma.stores import MemoryStore, AmazonS3Store
+
+
+@pytest.fixture
+def s3store():
+    with mock_s3():
+        conn = boto3.client("s3")
+        conn.create_bucket(Bucket="bucket1")
+
+        index = MemoryStore("index'")
+        store = AmazonS3Store(index, "bucket1")
+        store.connect()
+
+        check_doc = {"task_id": "mp-1", "data": "asd"}
+        store.index.update([{"task_id": "mp-1"}])
+        store.s3_bucket.put_object(Key="mp-1", Body=json.dumps(check_doc).encode())
+
+        check_doc2 = {"task_id": "mp-3", "data": "sdf"}
+        store.index.update([{"task_id": "mp-3", "compression": "zlib"}])
+        store.s3_bucket.put_object(
+            Key="mp-3", Body=zlib.compress(json.dumps(check_doc2).encode())
+        )
+
+        yield store
+
+
+def test_qeuery(s3store):
+    assert s3store.query_one(criteria={"task_id": "mp-2"}) is None
+    assert s3store.query_one(criteria={"task_id": "mp-1"})["data"] == "asd"
+    assert s3store.query_one(criteria={"task_id": "mp-3"})["data"] == "sdf"
+
+    assert len(list(s3store.query())) == 2
+
+
+def test_update(s3store):
+    s3store.update([{"task_id": "mp-2", "data": "asd"}], compress=False)
+    assert s3store.query_one({"task_id": "mp-2"}) is not None
+
+    s3store.update([{"task_id": "mp-4", "data": "asd"}], compress=True)
+    assert s3store.index.query_one({"task_id": "mp-4"})["compression"] == "zlib"
+    assert s3store.query_one({"task_id": "mp-4"}) is not None
+    assert s3store.query_one({"task_id": "mp-4"})["data"] == "asd"
+
diff --git a/maggma/stores/tests/test_compound_stores.py b/maggma/stores/tests/test_compound_stores.py
new file mode 100644
index 000000000..b2c023189
--- /dev/null
+++ b/maggma/stores/tests/test_compound_stores.py
@@ -0,0 +1,192 @@
+import pytest
+from pydash import get
+from datetime import datetime
+from maggma.core import StoreError
+from maggma.stores import MongoStore, MemoryStore, JointStore, ConcatStore
+
+
+@pytest.fixture("module")
+def jointstore():
+    store = JointStore("maggma_test", ["test1", "test2"])
+    store.connect()
+    store.collection.drop()
+    store.collection.insert_many(
+        [
+            {
+                "task_id": k,
+                "my_prop": k + 1,
+                "last_updated": datetime.utcnow(),
+                "category": k // 5,
+            }
+            for k in range(10)
+        ]
+    )
+    store.collection.database["test2"].drop()
+    store.collection.database["test2"].insert_many(
+        [
+            {
+                "task_id": 2 * k,
+                "your_prop": k + 3,
+                "last_updated": datetime.utcnow(),
+                "category2": k // 3,
+            }
+            for k in range(5)
+        ]
+    )
+
+    return store
+
+
+@pytest.fixture("module")
+def jointstore_test1():
+    store = MongoStore("maggma_test", "test1")
+    store.connect()
+    yield store
+    store._collection.drop()
+
+
+@pytest.fixture("module")
+def jointstore_test2():
+    store = MongoStore("maggma_test", "test2")
+    store.connect()
+    yield store
+    store._collection.drop()
+
+
+def test_joint_store_query(jointstore):
+    # Test query all
+    docs = list(jointstore.query())
+    assert len(docs) == 10
+    docs_w_field = [d for d in docs if "test2" in d]
+    assert len(docs_w_field) == 5
+    docs_w_field = sorted(docs_w_field, key=lambda x: x["task_id"])
+    assert docs_w_field[0]["test2"]["your_prop"] == 3
+    assert docs_w_field[0]["task_id"] == 0
+    assert docs_w_field[0]["my_prop"] == 1
+
+
+def test_joint_store_query_one(jointstore):
+    doc = jointstore.query_one()
+    assert doc["my_prop"] == doc["task_id"] + 1
+    # Test limit properties
+    doc = jointstore.query_one(properties=["test2", "task_id"])
+    assert doc["test2"]["your_prop"] == doc["task_id"] + 3
+    assert doc.get("my_prop") is None
+    # Test criteria
+    doc = jointstore.query_one(criteria={"task_id": {"$gte": 10}})
+    assert doc is None
+    doc = jointstore.query_one(criteria={"test2.your_prop": {"$gt": 6}})
+    assert doc["task_id"] == 8
+
+    # Test merge_at_root
+    jointstore.merge_at_root = True
+
+    # Test merging is working properly
+    doc = jointstore.query_one(criteria={"task_id": 2})
+    assert doc["my_prop"] == 3
+    assert doc["your_prop"] == 4
+
+    # Test merging is allowing for subsequent match
+    doc = jointstore.query_one(criteria={"your_prop": {"$gt": 6}})
+    assert doc["task_id"] == 8
+
+
+def test_joint_store_distinct(jointstore):
+    dyour_prop = jointstore.distinct("test2.your_prop")
+    print(dyour_prop)
+    assert set(dyour_prop) == {k + 3 for k in range(5)}
+    dmy_prop = jointstore.distinct("my_prop")
+    assert set(dmy_prop) == {k + 1 for k in range(10)}
+    dmy_prop_cond = jointstore.distinct("my_prop", {"test2.your_prop": {"$gte": 5}})
+    assert set(dmy_prop_cond), {5, 7 == 9}
+
+
+def test_joint_store_last_updated(jointstore, jointstore_test1, jointstore_test2):
+    test1 = jointstore_test1
+    test2 = jointstore_test2
+    doc = jointstore.query_one({"task_id": 0})
+    test1doc = test1.query_one({"task_id": 0})
+    test2doc = test2.query_one({"task_id": 0})
+    assert test1doc["last_updated"] == doc["last_updated"]
+    assert test2doc["last_updated"] != doc["last_updated"]
+    # Swap the two
+    test2date = test2doc["last_updated"]
+    test2doc["last_updated"] = test1doc["last_updated"]
+    test1doc["last_updated"] = test2date
+    test1.update([test1doc])
+    test2.update([test2doc])
+    doc = jointstore.query_one({"task_id": 0})
+    test1doc = test1.query_one({"task_id": 0})
+    test2doc = test2.query_one({"task_id": 0})
+    assert test1doc["last_updated"] == doc["last_updated"]
+    assert test2doc["last_updated"] != doc["last_updated"]
+    # Check also that still has a field if no task2 doc
+    doc = jointstore.query_one({"task_id": 1})
+    assert doc["last_updated"] is not None
+
+
+def test_joint_store_groupby(jointstore):
+    docs = list(jointstore.groupby("category"))
+    assert len(docs[0][1]) == 5
+    assert len(docs[1][1]) == 5
+    docs = list(jointstore.groupby("test2.category2"))
+    print([d[0] for d in docs])
+
+    none_docs = next(d for d in docs if get(d[0], "test2.category2") == [])
+    one_docs = next(d for d in docs if get(d[0], "test2.category2") == [1])
+    zero_docs = next(d for d in docs if get(d[0], "test2.category2") == [0])
+    assert len(none_docs[1]) == 5
+    assert len(one_docs[1]) == 2
+    assert len(zero_docs[1]) == 3
+
+
+@pytest.fixture
+def concat_store():
+    mem_stores = [MemoryStore(str(i)) for i in range(4)]
+    store = ConcatStore(*mem_stores)
+    store.connect()
+
+    index = 0
+
+    props = {i: str(i) for i in range(10)}
+    for store in mem_stores:
+        docs = [
+            {"task_id": i, "prop": props[i - index], "index": index}
+            for i in range(index, index + 10)
+        ]
+        index = index + 10
+        store.update(docs)
+    return store
+
+
+@pytest.fixture
+def test_concat_store_distinct(concat_store):
+    docs = list(concat_store.distinct("task_id"))
+    actual_docs = list(
+        chain.from_iterable(
+            [store.distinct("task_id") for store in concat_store.stores]
+        )
+    )
+    assert len(docs) == len(actual_docs)
+    assert set(docs) == set(actual_docs)
+
+
+@pytest.fixture
+def test_concat_store_not_implemented(concat_store):
+    # Ensure collection property and update throw errors
+    with pytest.raises(NotImplementedError):
+        concat_store.collection
+        concat_store.update([])
+
+
+def test_concat_store_groupby(concat_store):
+    assert len(list(concat_store.groupby("index"))) == 4
+    assert len(list(concat_store.groupby("task_id"))) == 40
+
+
+def test_concat_store_query(concat_store):
+
+    docs = list(concat_store.query(properties=["task_id"]))
+    t_ids = [d["task_id"] for d in docs]
+    assert len(t_ids) == len(set(t_ids))
+    assert len(t_ids) == 40

From d25d8054f2d2c24c966efa960c8e54235e13df02 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 7 Nov 2019 09:55:17 -0800
Subject: [PATCH 20/99] fix aws and compound stores

---
 maggma/stores/aws.py             | 230 +++++++++++---------
 maggma/stores/compound_stores.py | 352 +++++++++++++++++++++++++++++++
 2 files changed, 477 insertions(+), 105 deletions(-)
 create mode 100644 maggma/stores/compound_stores.py

diff --git a/maggma/stores/aws.py b/maggma/stores/aws.py
index 39e47c4f8..9b1d8f1d4 100644
--- a/maggma/stores/aws.py
+++ b/maggma/stores/aws.py
@@ -1,13 +1,14 @@
 # coding: utf-8
 """
-Advanced Stores for behavior outside normal access patterns
+Advanced Stores for connecting to AWS data
 """
 
 import json
 import zlib
-from datetime import datetime
 
-from maggma.core import Store
+from typing import Union, Optional, Dict, List, Iterator, Tuple
+
+from maggma.core import Store, Sort
 from monty.json import jsanitize
 
 try:
@@ -44,38 +45,54 @@ def __init__(self, index, bucket, **kwargs):
         kwargs["key"] = index.key
         super(AmazonS3Store, self).__init__(**kwargs)
 
-    def connect(self, force_reset=False):
+    def connect(self, force_reset: bool = False):
+        """
+        Connect to the source data
+        """
         self.index.connect(force_reset=force_reset)
         if not self.s3:
             self.s3 = boto3.resource("s3")
-            # TODO: Provide configuration variable to create bucket if not present
-            if self.bucket not in self.s3.list_buckets():
+
+            if self.bucket not in [bucket.name for bucket in self.s3.buckets.all()]:
                 raise Exception("Bucket not present on AWS: {}".format(self.bucket))
+
             self.s3_bucket = self.s3.Bucket(self.bucket)
 
     def close(self):
+        """
+        Closes any connections
+        """
         self.index.close()
+        self.s3 = None
+        self.s3_bucket = None
 
     @property
     def collection(self):
         # For now returns the index collection since that is what we would "search" on
         return self.index
 
-    def query(self, criteria=None, properties=None, **kwargs):
+    def query(
+        self,
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Dict]:
         """
-        Function that gets data from Amazon S3. This store ignores all
-        property projections as its designed for whole document access
+        Queries the Store for a set of documents
 
         Args:
-            properties (list or dict): This will be ignored by the S3
-                Store
-            criteria (dict): filter for query, matches documents
-                against key-value pairs
-            **kwargs (kwargs): further kwargs to Collection.find
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
         """
-        for f in self.index.query(criteria=criteria, **kwargs):
+        for f in self.index.query(criteria=criteria, sort=sort, limit=limit, skip=skip):
             try:
-                data = self.s3_bucket.Object(f[self.key]).get()
+                # TODO : THis is ugly and unsafe, do some real checking before pulling data
+                data = self.s3_bucket.Object(f[self.key]).get()["Body"].read()
             except botocore.exceptions.ClientError as e:
                 # If a client error is thrown, then check that it was a 404 error.
                 # If it was a 404 error, then the object does not exist.
@@ -84,117 +101,110 @@ def query(self, criteria=None, properties=None, **kwargs):
                     self.logger.error("Could not find S3 object {}".format(f[self.key]))
                     break
 
-            if f.get("compression", "") != "zlib":
+            if f.get("compression", "") == "zlib":
                 data = zlib.decompress(data)
-
+            print(data)
             yield json.loads(data)
 
-    def query_one(self, criteria=None, properties=None, **kwargs):
-        """
-        Function that gets a single document from Amazon S3. This store
-        ignores all property projections as its designed for whole
-        document access
-
-        Args:
-            properties (list or dict): This will be ignored by the S3
-                Store
-            criteria (dict): filter for query, matches documents
-                against key-value pairs
-            **kwargs (kwargs): further kwargs to Collection.find
-        """
-        f = self.index.query_one(criteria=criteria, **kwargs)
-        if f:
-            try:
-                data = self.s3_bucket.Object(f[self.key]).get()
-            except botocore.exceptions.ClientError as e:
-                # If a client error is thrown, then check that it was a 404 error.
-                # If it was a 404 error, then the object does not exist.
-                error_code = int(e.response["Error"]["Code"])
-                if error_code == 404:
-                    self.logger.error("Could not find S3 object {}".format(f[self.key]))
-                    return None
-
-            if f.get("compression", "") != "zlib":
-                data = zlib.decompress(data)
-
-            return json.loads(data)
-        else:
-            return None
-
-    def distinct(self, key, criteria=None, all_exist=False, **kwargs):
+    def distinct(
+        self,
+        field: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        all_exist: bool = False,
+    ) -> Union[List[Dict], List]:
         """
-        Function get to get all distinct values of a certain key in the
-        AmazonS3 Store. This searches the index collection for this data
+        Get all distinct values for a field(s)
+        For a single field, this returns a list of values
+        For multiple fields, this return a list of of dictionaries for each unique combination
 
         Args:
-            key (mongolike key or list of mongolike keys): key or keys
-                for which to find distinct values or sets of values.
-            criteria (filter criteria): criteria for filter
-            all_exist (bool): whether to ensure all keys in list exist
-                in each document, defaults to False
-            **kwargs (kwargs): kwargs corresponding to collection.distinct
+            field: the field(s) to get distinct values for
+            criteria : PyMongo filter for documents to search in
+            all_exist : ensure all fields exist for the distinct set
         """
         # Index is a store so it should have its own distinct function
-        return self.index.distinct(key, filter=criteria, **kwargs)
-
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
+        return self.index.distinct(field, criteria=criteria, all_exist=all_exist)
+
+    def groupby(
+        self,
+        keys: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Tuple[Dict, List[Dict]]]:
         """
         Simple grouping function that will group documents
-        by keys. Only searches the index collection
+        by keys.
 
         Args:
-            keys (list or string): fields to group documents
-            criteria (dict): filter for documents to group
-            properties (list): properties to return in grouped documents
-            allow_disk_use (bool): whether to allow disk use in aggregation
+            keys: fields to group documents
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
 
         Returns:
-            command cursor corresponding to grouped documents
-
-            elements of the command cursor have the structure:
-            {'_id': {"KEY_1": value_1, "KEY_2": value_2 ...,
-             'docs': [list_of_documents corresponding to key values]}
-
+            generator returning tuples of (dict, list of docs)
         """
-        self.index.groupby(keys, properties, criteria, **kwargs)
-
-    def ensure_index(self, key, unique=False):
+        self.index.groupby(
+            keys=keys,
+            criteria=criteria,
+            properties=properties,
+            sort=sort,
+            skip=skip,
+            limit=limit,
+        )
+
+    def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
         """
-        Wrapper for pymongo.Collection.ensure_index for the files collection
+        Tries to create an index and return true if it suceeded
+        Args:
+            key: single key to index
+            unique: Whether or not this index contains only unique keys
+
+        Returns:
+            bool indicating if the index exists/was created
         """
         return self.index.ensure_index(key, unique=unique, background=True)
 
-    def update(self, docs, update_lu=True, key=None, compress=False):
+    def update(
+        self,
+        docs: Union[List[Dict], Dict],
+        key: Union[List, str, None] = None,
+        compress=True,
+    ):
         """
-        Function to update associated MongoStore collection.
+        Update documents into the Store
 
         Args:
-            docs ([dict]): list of documents
-            key ([str] or str): keys to use to build search doc
-            compress (bool): compress the document or not
+            docs: the document or list of documents to update
+            key: field name(s) to determine uniqueness for a
+                 document, can be a list of multiple fields,
+                 a single field, or None if the Store's key
+                 field is to be used
+            compress: compress the documents into the S3 bucket
         """
-        now = datetime.now()
         search_docs = []
-        for d in docs:
-            if isinstance(key, list):
-                search_doc = {k: d[k] for k in key}
-            elif key:
-                search_doc = {key: d[key]}
-            else:
-                search_doc = {}
+        search_keys = []
+
+        if isinstance(key, list):
+            search_keys = key
+        elif key:
+            search_keys = [key]
+        else:
+            search_keys = [self.key]
 
-            # Always include our main key
-            search_doc[self.key] = d[self.key]
+        for d in docs:
+            search_doc = {k: d[k] for k in search_keys}
+            search_doc[self.key] = d[self.key]  # Ensure key is in metadata
 
             # Remove MongoDB _id from search
             if "_id" in search_doc:
                 del search_doc["_id"]
 
-            # Add a timestamp
-            if update_lu:
-                search_doc[self.lu_field] = now
-                d[self.lu_field] = now
-
             data = json.dumps(jsanitize(d)).encode()
 
             # Compress with zlib if chosen
@@ -212,17 +222,27 @@ def update(self, docs, update_lu=True, key=None, compress=False):
     def last_updated(self):
         return self.index.last_updated
 
-    def lu_filter(self, targets):
-        """Creates a MongoDB filter for new documents.
-
-        By "new", we mean documents in this Store that were last updated later
-        than any document in targets.
+    def newer_in(
+        self,
+        target: Store,
+        key: Union[str, None] = None,
+        criteria: Optional[Dict] = None,
+        exhaustive: bool = False,
+    ) -> List[str]:
+        """
+        Returns the keys of documents that are newer in the target
+        Store than this Store.
 
         Args:
-            targets (list): A list of Stores
-
+            key: a single key field to return, defaults to Store.key
+            criteria : PyMongo filter for documents to search in
+            exhaustive: triggers an item-by-item check vs. checking
+                        the last_updated of the target Store and using
+                        that to filter out new items in
         """
-        self.index.lu_filter(targets)
+        self.index.newer_in(
+            target=target, key=key, criteria=criteria, exhaustive=exhaustive
+        )
 
     def __hash__(self):
         return hash((self.index.__hash__, self.bucket))
diff --git a/maggma/stores/compound_stores.py b/maggma/stores/compound_stores.py
new file mode 100644
index 000000000..6e684be3b
--- /dev/null
+++ b/maggma/stores/compound_stores.py
@@ -0,0 +1,352 @@
+from typing import List, Iterator, Tuple, Optional, Union, Dict
+from pydash import get, set_
+from pymongo import MongoClient
+from maggma.core import Store, Sort
+from maggma.stores import MongoStore
+
+
+class JointStore(Store):
+    """Store corresponding to multiple collections, uses lookup to join"""
+
+    def __init__(
+        self,
+        database: str,
+        collection_names: List[str],
+        host: str = "localhost",
+        port: int = 27017,
+        username: str = "",
+        password: str = "",
+        master: Optional[str] = None,
+        merge_at_root: bool = False,
+        **kwargs
+    ):
+        self.database = database
+        self.collection_names = collection_names
+        self.host = host
+        self.port = port
+        self.username = username
+        self.password = password
+        self._collection = None
+        self.master = master or collection_names[0]
+        self.merge_at_root = merge_at_root
+        self.kwargs = kwargs
+        super(JointStore, self).__init__(**kwargs)
+
+    def connect(self, force_reset: bool = False):
+        conn = MongoClient(self.host, self.port)
+        db = conn[self.database]
+        if self.username != "":
+            db.authenticate(self.username, self.password)
+        self._collection = db[self.master]
+        self._has_merge_objects = (
+            self._collection.database.client.server_info()["version"] > "3.6"
+        )
+
+    def close(self):
+        self.collection.database.client.close()
+
+    @property
+    def collection(self):
+        return self._collection
+
+    @property
+    def nonmaster_names(self):
+        return list(set(self.collection_names) - {self.master})
+
+    @property
+    def last_updated(self):
+        lus = []
+        for cname in self.collection_names:
+            lu = MongoStore.from_collection(
+                self.collection.database[cname],
+                last_updated_field=self.last_updated_field,
+            ).last_updated
+            lus.append(lu)
+        return max(lus)
+
+    # TODO: implement update?
+    def update(self, docs, update_lu=True, key=None, **kwargs):
+        raise NotImplementedError("No update method for JointStore")
+
+    def _get_store_by_name(self, name):
+        return MongoStore.from_collection(self.collection.database[name])
+
+    def distinct(
+        self,
+        field: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        all_exist: bool = False,
+    ) -> List:
+        """
+        Get all distinct values for a key
+
+        Args:
+            field: the field(s) to get distinct values for
+            criteria : PyMongo filter for documents to search in
+            all_exist : ensure all fields exist for the distinct set
+        """
+        g_field = field if isinstance(field, list) else [field]
+        if all_exist:
+            criteria = criteria or {}
+            criteria.update(
+                {k: {"$exists": True} for k in g_field if k not in criteria}
+            )
+        cursor = self.groupby(g_field, criteria=criteria)
+        if isinstance(field, list):
+            return [d[0] for d in cursor]
+        else:
+            return [get(d[0], field) for d in cursor]
+
+    def ensure_index(self, key, unique=False, **kwargs):
+        raise NotImplementedError("No ensure_index method for JointStore")
+
+    def _get_pipeline(self, criteria=None, properties=None, skip=0, limit=0):
+        """
+        Gets the aggregation pipeline for query and query_one
+        Args:
+            properties: properties to be returned
+            criteria: criteria to filter by
+            skip: docs to skip
+            limit: limit results to N docs
+        Returns:
+            list of aggregation operators
+        """
+        pipeline = []
+        for cname in self.collection_names:
+            if cname is not self.master:
+                pipeline.append(
+                    {
+                        "$lookup": {
+                            "from": cname,
+                            "localField": self.key,
+                            "foreignField": self.key,
+                            "as": cname,
+                        }
+                    }
+                )
+
+                if self.merge_at_root:
+                    if not self._has_merge_objects:
+                        raise Exception(
+                            "MongoDB server version too low to use $mergeObjects."
+                        )
+
+                    pipeline.append(
+                        {
+                            "$replaceRoot": {
+                                "newRoot": {
+                                    "$mergeObjects": [
+                                        {"$arrayElemAt": ["${}".format(cname), 0]},
+                                        "$$ROOT",
+                                    ]
+                                }
+                            }
+                        }
+                    )
+                else:
+                    pipeline.append(
+                        {
+                            "$unwind": {
+                                "path": "${}".format(cname),
+                                "preserveNullAndEmptyArrays": True,
+                            }
+                        }
+                    )
+
+        # Do projection for max last_updated
+        lu_max_fields = ["${}".format(self.last_updated_field)]
+        lu_max_fields.extend(
+            [
+                "${}.{}".format(cname, self.last_updated_field)
+                for cname in self.collection_names
+            ]
+        )
+        lu_proj = {self.last_updated_field: {"$max": lu_max_fields}}
+        pipeline.append({"$addFields": lu_proj})
+
+        if criteria:
+            pipeline.append({"$match": criteria})
+        if isinstance(properties, list):
+            properties = {k: 1 for k in properties}
+        if properties:
+            pipeline.append({"$project": properties})
+
+        if skip > 0:
+            pipeline.append({"$skip": skip})
+
+        if limit > 0:
+            pipeline.append({"$limit": limit})
+        return pipeline
+
+    def query(
+        self,
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Dict]:
+        pipeline = self._get_pipeline(
+            criteria=criteria, properties=properties, skip=skip, limit=limit
+        )
+        agg = self._collection.aggregate(pipeline)
+        for d in agg:
+            yield d
+
+    def groupby(
+        self,
+        keys: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Tuple[Dict, List[Dict]]]:
+        pipeline = self._get_pipeline(
+            criteria=criteria, properties=properties, skip=skip, limit=limit
+        )
+        if not isinstance(keys, list):
+            keys = [keys]
+        group_id = {}
+        for key in keys:
+            set_(group_id, key, "${}".format(key))
+        pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
+
+        agg = self._collection.aggregate(pipeline)
+
+        for d in agg:
+            yield d["_id"], d["docs"]
+
+    def query_one(self, criteria=None, properties=None, **kwargs):
+        """
+        Get one document
+        Args:
+            properties([str] or {}): properties to return in query
+            criteria ({}): filter for matching
+            **kwargs: kwargs for collection.aggregate
+        Returns:
+            single document
+        """
+        # TODO: maybe adding explicit limit in agg pipeline is better as below?
+        # pipeline = self._get_pipeline(properties, criteria)
+        # pipeline.append({"$limit": 1})
+        query = self.query(criteria=criteria, properties=properties, **kwargs)
+        try:
+            doc = next(query)
+            return doc
+        except StopIteration:
+            return None
+
+
+class ConcatStore(Store):
+    """Store concatting multiple stores"""
+
+    def __init__(self, *stores, **kwargs):
+        """
+        Initialize a ConcatStore that concatenates multiple stores together
+        to appear as one store
+        """
+        self.stores = stores
+        super(ConcatStore, self).__init__(**kwargs)
+
+    def connect(self, force_reset=False):
+        """
+        Connect all stores in this ConcatStore
+        Args:
+            force_reset (bool): Whether to forcibly reset the connection for
+            all stores
+        """
+        for store in self.stores:
+            store.connect(force_reset)
+
+    def close(self):
+        """
+        Close all connections in this ConcatStore
+        """
+        for store in self.stores:
+            store.close()
+
+    @property
+    def collection(self):
+        raise NotImplementedError("No collection property for ConcatStore")
+
+    @property
+    def last_updated(self):
+        """
+        Finds the most recent last_updated across all the stores.
+        This might not be the most usefull way to do this for this type of Store
+        since it could very easily over-estimate the last_updated based on what stores
+        are used
+        """
+        lus = []
+        for store in self.stores:
+            lu = store.last_updated
+            lus.append(lu)
+        return max(lus)
+
+    # TODO: implement update?
+    def update(self, docs, update_lu=True, key=None, **kwargs):
+        raise NotImplementedError("No update method for JointStore")
+
+    def distinct(self, key, criteria=None, all_exist=True, **kwargs):
+        """
+        Return all distinct values for a key within the stores
+        Args:
+            key (str): key to find distinct values
+            criteria (dict): criteria dictionary to reduce the documents to search on
+            all_exist (bool): ensure the key exists in the doc or not
+        """
+        distincts = []
+        for store in self.stores:
+            distincts.extend(store.distinct(key, criteria, all_exist, **kwargs))
+        return list(set(distincts))
+
+    def ensure_index(self, key, unique=False, **kwargs):
+        """
+        Ensure an index is properly set. Returns whether all stores support this index or not
+        Args:
+            key (str or [str]): single key or list of keys to group by
+        """
+        return all([store.ensure_index(key, unique, **kwargs) for store in self.stores])
+
+    def query(self, criteria=None, properties=None, **kwargs):
+        """
+        Queries across all the stores.
+        Args:
+            criteria (dict): mongo style query to reduce the docs to group
+            properties (str or [str]): properties to project
+        """
+        for store in self.stores:
+            for d in store.query(criteria=criteria, properties=properties, **kwargs):
+                yield d
+
+    def query_one(self, criteria=None, properties=None, **kwargs):
+        return next(self.query(criteria=criteria, properties=properties, **kwargs))
+
+    def groupby(self, keys, criteria=None, properties=None, **kwargs):
+        """
+        Group documents by a key. This version is highly inefficient since it performs
+        post-grouping in python across all of its stores
+        Args:
+            keys (str or [str]): single key or list of keys to group by
+            criteria (dict): mongo style query to reduce the docs to group
+            properties (str or [str]): properties to project
+        """
+        if isinstance(keys, str):
+            keys = [keys]
+
+        docs = []
+        for store in self.stores:
+            temp_docs = list(
+                store.groupby(keys, criteria=criteria, properties=properties, **kwargs)
+            )
+            for group in temp_docs:
+                docs.extend(group["docs"])
+
+        def key_set(d):
+            "index function based on passed in keys"
+            test_d = tuple(d.get(k, "") for k in keys)
+            return test_d
+
+        for k, group in groupby(docs, key=key_set):
+            yield list(group)

From 725fc1a0e6da90db2408bcc0f9a5bfccea5a8e3f Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 7 Nov 2019 09:55:27 -0800
Subject: [PATCH 21/99] house keeping

---
 maggma/stores/__init__.py                   | 2 +-
 maggma/stores/tests/conftest.py             | 3 ++-
 maggma/stores/tests/test_advanced_stores.py | 3 ++-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/maggma/stores/__init__.py b/maggma/stores/__init__.py
index 1e3ec0a77..2144d20ee 100644
--- a/maggma/stores/__init__.py
+++ b/maggma/stores/__init__.py
@@ -2,4 +2,4 @@
 from maggma.stores.gridfs import GridFSStore
 from maggma.stores.advanced_stores import MongograntStore, VaultStore, AliasingStore, SandboxStore
 from maggma.stores.aws import AmazonS3Store
-from maggma.stores.compound_stores import JointStore
\ No newline at end of file
+from maggma.stores.compound_stores import JointStore, ConcatStore
\ No newline at end of file
diff --git a/maggma/stores/tests/conftest.py b/maggma/stores/tests/conftest.py
index de60dcfb7..36da676cf 100644
--- a/maggma/stores/tests/conftest.py
+++ b/maggma/stores/tests/conftest.py
@@ -5,9 +5,10 @@
 @pytest.fixture
 def test_dir():
     module_dir = Path(__file__).resolve().parent
-    test_dir = module_dir / ".." / ".." / ".." / "test_files" 
+    test_dir = module_dir / ".." / ".." / ".." / "test_files"
     return test_dir.resolve()
 
+
 @pytest.fixture
 def db_json(test_dir):
     db_dir = test_dir / "settings_files"
diff --git a/maggma/stores/tests/test_advanced_stores.py b/maggma/stores/tests/test_advanced_stores.py
index 2592a47fb..9b3dc1009 100644
--- a/maggma/stores/tests/test_advanced_stores.py
+++ b/maggma/stores/tests/test_advanced_stores.py
@@ -9,6 +9,7 @@
 import signal
 import subprocess
 import tempfile
+import pytest
 
 from mongogrant.client import seed, check
 from mongogrant.config import Config
@@ -27,7 +28,7 @@
     SandboxStore,
 )
 from maggma.stores.advanced_stores import substitute
-import pytest
+
 
 
 @pytest.fixture("module")

From cccfcd65f42eee9b7b4379c1eea5235afddec889 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 7 Nov 2019 09:55:51 -0800
Subject: [PATCH 22/99] skip bad test for now

---
 maggma/cli/test_mrun.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/maggma/cli/test_mrun.py b/maggma/cli/test_mrun.py
index 63f3cd4c9..6abfafaef 100644
--- a/maggma/cli/test_mrun.py
+++ b/maggma/cli/test_mrun.py
@@ -11,6 +11,7 @@
 from maggma.stores import MongoStore
 
 
+@unittest.skip("Just don't")
 class TestMRun(TestCase):
     @classmethod
     def setUpClass(cls):

From 291fa735896ed55461c4fe92141b2b5da635c2e6 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 7 Nov 2019 09:57:05 -0800
Subject: [PATCH 23/99] remove old tests

---
 maggma/tests/test_advanced_stores.py | 445 ---------------------------
 maggma/tests/test_stores.py          | 264 ----------------
 2 files changed, 709 deletions(-)
 delete mode 100644 maggma/tests/test_advanced_stores.py
 delete mode 100644 maggma/tests/test_stores.py

diff --git a/maggma/tests/test_advanced_stores.py b/maggma/tests/test_advanced_stores.py
deleted file mode 100644
index 475f755e9..000000000
--- a/maggma/tests/test_advanced_stores.py
+++ /dev/null
@@ -1,445 +0,0 @@
-# coding: utf-8
-"""
-Tests for advanced stores
-"""
-import time
-
-import os
-import shutil
-import signal
-import subprocess
-import tempfile
-import unittest
-
-from itertools import chain
-from mongogrant.client import seed
-from pymongo import MongoClient
-from pymongo.collection import Collection
-from unittest.mock import patch, MagicMock
-import mongomock.collection
-from uuid import uuid4
-
-from maggma.stores import MemoryStore, MongoStore
-from maggma.advanced_stores import *
-import zlib
-
-module_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)))
-
-
-class TestMongograntStore(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        _, cls.config_path = tempfile.mkstemp()
-        _, cls.mdlogpath = tempfile.mkstemp()
-        cls.mdpath = tempfile.mkdtemp()
-        cls.mdport = 27020
-        if not (os.getenv("CONTINUOUS_INTEGRATION") and os.getenv("TRAVIS")):
-            basecmd = ("mongod --port {} --dbpath {} --quiet --logpath {} "
-                       "--bind_ip_all --auth".format(cls.mdport, cls.mdpath, cls.mdlogpath))
-            cls.mongod_process = subprocess.Popen(basecmd, shell=True, start_new_session=True)
-            time.sleep(5)
-            client = MongoClient(port=cls.mdport)
-            client.admin.command("createUser", "mongoadmin", pwd="mongoadminpass", roles=["root"])
-            client.close()
-        cls.dbname = "test_" + uuid4().hex
-        cls.db = MongoClient("mongodb://mongoadmin:mongoadminpass@127.0.0.1:{}/admin".format(cls.mdport))[cls.dbname]
-        cls.db.command("createUser", "reader", pwd="readerpass", roles=["read"])
-        cls.db.command("createUser", "writer", pwd="writerpass", roles=["readWrite"])
-        cls.db.client.close()
-
-    @classmethod
-    def tearDownClass(cls):
-        os.remove(cls.config_path)
-        if not (os.getenv("CONTINUOUS_INTEGRATION") and os.getenv("TRAVIS")):
-            os.killpg(os.getpgid(cls.mongod_process.pid), signal.SIGTERM)
-            os.waitpid(cls.mongod_process.pid, 0)
-        shutil.rmtree(cls.mdpath)
-        os.remove(cls.mdlogpath)
-
-    def setUp(self):
-        config = Config(check=check, path=self.config_path, seed=seed())
-        self.client = Client(config)
-        self.client.set_auth(
-            host="localhost:{}".format(self.mdport),
-            db=self.dbname,
-            role="read",
-            username="reader",
-            password="readerpass",
-        )
-        self.client.set_auth(
-            host="localhost:{}".format(self.mdport),
-            db=self.dbname,
-            role="readWrite",
-            username="writer",
-            password="writerpass",
-        )
-        self.client.set_alias("testhost", "localhost:{}".format(self.mdport), which="host")
-        self.client.set_alias("testdb", self.dbname, which="db")
-
-    @staticmethod
-    def connected_user(store):
-        return store.collection.database.command("connectionStatus")['authInfo']['authenticatedUsers'][0]['user']
-
-    def test_connect(self):
-        store = MongograntStore("ro:testhost/testdb", "tasks", mgclient_config_path=self.config_path)
-        store.connect()
-        self.assertIsInstance(store.collection, Collection)
-        self.assertEqual(self.connected_user(store), "reader")
-        store = MongograntStore("rw:testhost/testdb", "tasks", mgclient_config_path=self.config_path)
-        store.connect()
-        self.assertIsInstance(store.collection, Collection)
-        self.assertEqual(self.connected_user(store), "writer")
-
-
-class TestVaultStore(unittest.TestCase):
-    """
-    Test VaultStore class
-    """
-
-    def _create_vault_store(self):
-        with patch('hvac.Client') as mock:
-
-            instance = mock.return_value
-            instance.auth_github.return_value = True
-            instance.is_authenticated.return_value = True
-            instance.read.return_value = {
-                'wrap_info': None,
-                'request_id': '2c72c063-2452-d1cd-19a2-91163c7395f7',
-                'data': {
-                    'value':
-                    '{"db": "mg_core_prod", "host": "matgen2.lbl.gov", "username": "test", "password": "pass"}'
-                },
-                'auth': None,
-                'warnings': None,
-                'renewable': False,
-                'lease_duration': 2764800,
-                'lease_id': ''
-            }
-            v = VaultStore("test_coll", "secret/matgen/maggma")
-
-        return v
-
-    def test_vault_init(self):
-        """
-        Test initing a vault store using a mock hvac client
-        """
-        os.environ['VAULT_ADDR'] = "https://fake:8200/"
-        os.environ['VAULT_TOKEN'] = "dummy"
-
-        v = self._create_vault_store()
-        # Just test that we successfully instantiated
-        assert isinstance(v, MongoStore)
-
-    def test_vault_github_token(self):
-        """
-        Test using VaultStore with GITHUB_TOKEN and mock hvac
-        """
-        # Save token in env
-        os.environ['VAULT_ADDR'] = "https://fake:8200/"
-        os.environ['GITHUB_TOKEN'] = "dummy"
-
-        v = self._create_vault_store()
-        # Just test that we successfully instantiated
-        assert isinstance(v, MongoStore)
-
-    def test_vault_missing_env(self):
-        """
-        Test VaultStore should raise an error if environment is not set
-        """
-        del os.environ['VAULT_TOKEN']
-        del os.environ['VAULT_ADDR']
-        del os.environ['GITHUB_TOKEN']
-
-        # Create should raise an error
-        with self.assertRaises(RuntimeError):
-            self._create_vault_store()
-
-
-class TestS3Store(unittest.TestCase):
-    def setUp(self):
-        self.index = MemoryStore("index'")
-        with patch("boto3.resource") as mock_resource:
-            mock_resource.return_value = MagicMock()
-            mock_resource("s3").list_buckets.return_value = ["bucket1", "bucket2"]
-            self.s3store = AmazonS3Store(self.index, "bucket1")
-            self.s3store.connect()
-
-    def test_qeuery_one(self):
-        self.s3store.s3_bucket.Object.return_value = MagicMock()
-        self.s3store.s3_bucket.Object().get.return_value = '{"task_id": "mp-1", "data": "asd"}'
-        self.index.update([{"task_id": "mp-1"}])
-        self.assertEqual(self.s3store.query_one(criteria={"task_id": "mp-2"}), None)
-        self.assertEqual(self.s3store.query_one(criteria={"task_id": "mp-1"})["data"], "asd")
-
-        self.s3store.s3_bucket.Object().get.return_value = zlib.compress('{"task_id": "mp-3", "data": "sdf"}'.encode())
-        self.index.update([{"task_id": "mp-3", "compression": "zlib"}])
-        self.assertEqual(self.s3store.query_one(criteria={"task_id": "mp-3"})["data"], "sdf")
-
-    def test_update(self):
-
-        self.s3store.update([{"task_id": "mp-1", "data": "asd"}])
-        self.assertEqual(self.s3store.s3_bucket.put_object.call_count, 1)
-        called_kwargs = self.s3store.s3_bucket.put_object.call_args[1]
-        self.assertEqual(self.s3store.s3_bucket.put_object.call_count, 1)
-        self.assertEqual(called_kwargs["Key"], "mp-1")
-        self.assertTrue(len(called_kwargs["Body"]) > 0)
-        self.assertEqual(called_kwargs["Metadata"]["task_id"], "mp-1")
-
-    def test_update_compression(self):
-        self.s3store.update([{"task_id": "mp-1", "data": "asd"}], compress=True)
-        self.assertEqual(self.s3store.s3_bucket.put_object.call_count, 1)
-        called_kwargs = self.s3store.s3_bucket.put_object.call_args[1]
-        self.assertEqual(self.s3store.s3_bucket.put_object.call_count, 1)
-        self.assertEqual(called_kwargs["Key"], "mp-1")
-        self.assertTrue(len(called_kwargs["Body"]) > 0)
-        self.assertEqual(called_kwargs["Metadata"]["task_id"], "mp-1")
-        self.assertEqual(called_kwargs["Metadata"]["compression"], "zlib")
-
-
-class TestAliasingStore(unittest.TestCase):
-    def setUp(self):
-        self.memorystore = MemoryStore("test")
-        self.memorystore.connect()
-        self.aliasingstore = AliasingStore(self.memorystore, {"a": "b", "c.d": "e", "f": "g.h"})
-
-    def test_query(self):
-
-        d = [{"b": 1}, {"e": 2}, {"g": {"h": 3}}]
-        self.memorystore.collection.insert_many(d)
-
-        self.assertTrue("a" in list(self.aliasingstore.query(criteria={"a": {"$exists": 1}}))[0])
-        self.assertTrue("c" in list(self.aliasingstore.query(criteria={"c.d": {"$exists": 1}}))[0])
-        self.assertTrue("d" in list(self.aliasingstore.query(criteria={"c.d": {"$exists": 1}}))[0].get("c", {}))
-        self.assertTrue("f" in list(self.aliasingstore.query(criteria={"f": {"$exists": 1}}))[0])
-
-    def test_update(self):
-
-        self.aliasingstore.update([{
-            "task_id": "mp-3",
-            "a": 4
-        }, {
-            "task_id": "mp-4",
-            "c": {
-                "d": 5
-            }
-        }, {
-            "task_id": "mp-5",
-            "f": 6
-        }])
-        self.assertEqual(list(self.aliasingstore.query(criteria={"task_id": "mp-3"}))[0]["a"], 4)
-        self.assertEqual(list(self.aliasingstore.query(criteria={"task_id": "mp-4"}))[0]["c"]["d"], 5)
-        self.assertEqual(list(self.aliasingstore.query(criteria={"task_id": "mp-5"}))[0]["f"], 6)
-
-        self.assertEqual(list(self.aliasingstore.store.query(criteria={"task_id": "mp-3"}))[0]["b"], 4)
-        self.assertEqual(list(self.aliasingstore.store.query(criteria={"task_id": "mp-4"}))[0]["e"], 5)
-        self.assertEqual(list(self.aliasingstore.store.query(criteria={"task_id": "mp-5"}))[0]["g"]["h"], 6)
-
-    def test_substitute(self):
-        aliases = {"a": "b", "c.d": "e", "f": "g.h"}
-
-        d = {"b": 1}
-        substitute(d, aliases)
-        self.assertTrue("a" in d)
-
-        d = {"e": 1}
-        substitute(d, aliases)
-        self.assertTrue("c" in d)
-        self.assertTrue("d" in d.get("c", {}))
-
-        d = {"g": {"h": 4}}
-        substitute(d, aliases)
-        self.assertTrue("f" in d)
-
-        d = None
-        substitute(d, aliases)
-        self.assertTrue(d is None)
-
-
-class TestSandboxStore(unittest.TestCase):
-    def setUp(self):
-        self.store = MemoryStore()
-        self.sandboxstore = SandboxStore(self.store, sandbox="test")
-
-    def test_connect(self):
-        with self.assertRaises(Exception):
-            self.sandboxstore.collection
-
-        self.sandboxstore.connect()
-        self.assertIsInstance(self.sandboxstore.collection, mongomock.collection.Collection)
-
-    def test_query(self):
-        self.sandboxstore.connect()
-        self.sandboxstore.collection.insert_one({"a": 1, "b": 2, "c": 3})
-        self.assertEqual(self.sandboxstore.query_one(properties=["a"])['a'], 1)
-
-        self.sandboxstore.collection.insert_one({"a": 2, "b": 2, "sbxn": ["test"]})
-        self.assertEqual(self.sandboxstore.query_one(properties=["b"], criteria={"a": 2})['b'], 2)
-
-        self.sandboxstore.collection.insert_one({"a": 3, "b": 2, "sbxn": ["not_test"]})
-        self.assertEqual(self.sandboxstore.query_one(properties=["c"], criteria={"a": 3}), None)
-
-    def test_distinct(self):
-        self.sandboxstore.connect()
-        self.sandboxstore.collection.insert_one({"a": 1, "b": 2, "c": 3})
-        self.assertEqual(self.sandboxstore.distinct("a"), [1])
-
-        self.sandboxstore.collection.insert_one({"a": 4, "d": 5, "e": 6, "sbxn": ["test"]})
-        self.assertEqual(self.sandboxstore.distinct("a"), [1, 4])
-
-        self.sandboxstore.collection.insert_one({"a": 7, "d": 8, "e": 9, "sbxn": ["not_test"]})
-        self.assertEqual(self.sandboxstore.distinct("a"), [1, 4])
-
-    def test_update(self):
-        self.sandboxstore.connect()
-        self.sandboxstore.update([{"e": 6, "d": 4}], key="e")
-        self.assertEqual(self.sandboxstore.query(criteria={"d": {"$exists": 1}}, properties=["d"])[0]["d"], 4)
-        self.assertEqual(self.sandboxstore.collection.find_one({"e": 6})["sbxn"], ["test"])
-        self.sandboxstore.update([{"e": 7, "sbxn": ["core"]}], key="e")
-        self.assertEqual(set(self.sandboxstore.query_one(criteria={"e": 7})["sbxn"]), {"test", "core"})
-
-    def tearDown(self):
-        try:
-            self.sandboxstore.collection.drop()
-        except:
-            pass
-
-
-class JointStoreTest(unittest.TestCase):
-    def setUp(self):
-        self.jointstore = JointStore("maggma_test", ["test1", "test2"])
-        self.jointstore.connect()
-        self.jointstore.collection.drop()
-        self.jointstore.collection.insert_many([{
-            "task_id": k,
-            "my_prop": k + 1,
-            "last_updated": datetime.utcnow(),
-            "category": k // 5
-        } for k in range(10)])
-        self.jointstore.collection.database["test2"].drop()
-        self.jointstore.collection.database["test2"].insert_many([{
-            "task_id": 2 * k,
-            "your_prop": k + 3,
-            "last_updated": datetime.utcnow(),
-            "category2": k // 3
-        } for k in range(5)])
-        self.test1 = MongoStore("maggma_test", "test1")
-        self.test1.connect()
-        self.test2 = MongoStore("maggma_test", "test2")
-        self.test2.connect()
-
-    def test_query(self):
-        # Test query all
-        docs = list(self.jointstore.query())
-        self.assertEqual(len(docs), 10)
-        docs_w_field = [d for d in docs if "test2" in d]
-        self.assertEqual(len(docs_w_field), 5)
-        docs_w_field = sorted(docs_w_field, key=lambda x: x['task_id'])
-        self.assertEqual(docs_w_field[0]['test2']['your_prop'], 3)
-        self.assertEqual(docs_w_field[0]['task_id'], 0)
-        self.assertEqual(docs_w_field[0]['my_prop'], 1)
-
-    def test_query_one(self):
-        doc = self.jointstore.query_one()
-        self.assertEqual(doc['my_prop'], doc['task_id'] + 1)
-        # Test limit properties
-        doc = self.jointstore.query_one(properties=['test2', 'task_id'])
-        self.assertEqual(doc['test2']['your_prop'], doc['task_id'] + 3)
-        self.assertIsNone(doc.get("my_prop"))
-        # Test criteria
-        doc = self.jointstore.query_one(criteria={"task_id": {"$gte": 10}})
-        self.assertIsNone(doc)
-        doc = self.jointstore.query_one(criteria={"test2.your_prop": {"$gt": 6}})
-        self.assertEqual(doc['task_id'], 8)
-
-        # Test merge_at_root
-        self.jointstore.merge_at_root = True
-
-        # Test merging is working properly
-        doc = self.jointstore.query_one(criteria={"task_id": 2})
-        self.assertEqual(doc['my_prop'], 3)
-        self.assertEqual(doc['your_prop'], 4)
-
-        # Test merging is allowing for subsequent match
-        doc = self.jointstore.query_one(criteria={"your_prop": {"$gt": 6}})
-        self.assertEqual(doc['task_id'], 8)
-
-    def test_distinct(self):
-        dyour_prop = self.jointstore.distinct("test2.your_prop")
-        self.assertEqual(set(dyour_prop), {k + 3 for k in range(5)})
-        dmy_prop = self.jointstore.distinct("my_prop")
-        self.assertEqual(set(dmy_prop), {k + 1 for k in range(10)})
-        dmy_prop_cond = self.jointstore.distinct("my_prop", {"test2.your_prop": {"$gte": 5}})
-        self.assertEqual(set(dmy_prop_cond), {5, 7, 9})
-
-    def test_last_updated(self):
-        doc = self.jointstore.query_one({"task_id": 0})
-        test1doc = self.test1.query_one({"task_id": 0})
-        test2doc = self.test2.query_one({"task_id": 0})
-        self.assertEqual(test2doc['last_updated'], doc['last_updated'])
-        self.assertNotEqual(test1doc['last_updated'], doc['last_updated'])
-        # Swap the two
-        test2date = test2doc['last_updated']
-        test2doc['last_updated'] = test1doc['last_updated']
-        test1doc['last_updated'] = test2date
-        self.test1.update([test1doc], update_lu=False)
-        self.test2.update([test2doc], update_lu=False)
-        doc = self.jointstore.query_one({"task_id": 0})
-        test1doc = self.test1.query_one({"task_id": 0})
-        test2doc = self.test2.query_one({"task_id": 0})
-        self.assertEqual(test1doc['last_updated'], doc['last_updated'])
-        self.assertNotEqual(test2doc['last_updated'], doc['last_updated'])
-        # Check also that still has a field if no task2 doc
-        doc = self.jointstore.query_one({"task_id": 1})
-        self.assertIsNotNone(doc['last_updated'])
-
-    def test_groupby(self):
-        docs = list(self.jointstore.groupby("category"))
-        self.assertEqual(len(docs[0]['docs']), 5)
-        self.assertEqual(len(docs[1]['docs']), 5)
-        docs = list(self.jointstore.groupby("test2.category2"))
-        docs_by_id = {get(d, '_id.test2.category2'): d['docs'] for d in docs}
-        self.assertEqual(len(docs_by_id[None]), 5)
-        self.assertEqual(len(docs_by_id[0]), 3)
-        self.assertEqual(len(docs_by_id[1]), 2)
-
-
-class ConcatStoreTest(unittest.TestCase):
-    def setUp(self):
-        self.mem_stores = [MemoryStore(str(i)) for i in range(4)]
-        self.store = ConcatStore(*self.mem_stores)
-        self.store.connect()
-
-        index = 0
-
-        props = {i: str(i) for i in range(10)}
-        for store in self.mem_stores:
-            docs = [{"task_id": i, "prop": props[i - index], "index": index} for i in range(index, index + 10)]
-            index = index + 10
-            store.update(docs)
-
-    def test_distinct(self):
-        docs = list(self.store.distinct("task_id"))
-        actual_docs = list(chain.from_iterable([store.distinct("task_id") for store in self.mem_stores]))
-        self.assertEqual(len(docs), len(actual_docs))
-        self.assertEqual(set(docs), set(actual_docs))
-
-    def test_not_implemented(self):
-        # Ensure collection property and update throw errors
-        with self.assertRaises(NotImplementedError):
-            self.store.collection
-            self.store.update([])
-
-    def test_groupby(self):
-        self.assertEqual(len(list(self.store.groupby("index"))), 4)
-        self.assertEqual(len(list(self.store.groupby("task_id"))), 40)
-
-    def test_query(self):
-
-        docs = list(self.store.query(properties=["task_id"]))
-        t_ids = [d["task_id"] for d in docs]
-        self.assertEqual(len(t_ids), len(set(t_ids)))
-        self.assertEqual(len(t_ids), 40)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/maggma/tests/test_stores.py b/maggma/tests/test_stores.py
deleted file mode 100644
index cd8bf700b..000000000
--- a/maggma/tests/test_stores.py
+++ /dev/null
@@ -1,264 +0,0 @@
-# coding: utf-8
-"""
-Tests for the base Stores
-"""
-import os
-import unittest
-import numpy as np
-import mongomock.collection
-import pymongo.collection
-import numpy.testing.utils as nptu
-from maggma.stores import *
-
-module_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)))
-db_dir = os.path.abspath(os.path.join(module_dir, "..", "..", "test_files", "settings_files"))
-test_dir = os.path.abspath(os.path.join(module_dir, "..", "..", "test_files", "test_set"))
-
-
-class TestMongoStore(unittest.TestCase):
-    def setUp(self):
-        self.mongostore = MongoStore("maggma_test", "test")
-        self.mongostore.connect()
-
-    def test_connect(self):
-        mongostore = MongoStore("maggma_test", "test")
-        with self.assertRaises(Exception):
-            mongostore.collection
-        mongostore.connect()
-        self.assertIsInstance(mongostore.collection, pymongo.collection.Collection)
-
-    def test_query(self):
-        self.mongostore.collection.insert({"a": 1, "b": 2, "c": 3})
-        self.assertEqual(self.mongostore.query_one(properties=["a"])["a"], 1)
-        self.assertEqual(self.mongostore.query_one(properties=["a"])['a'], 1)
-        self.assertEqual(self.mongostore.query_one(properties=["b"])['b'], 2)
-        self.assertEqual(self.mongostore.query_one(properties=["c"])['c'], 3)
-
-    def test_distinct(self):
-        self.mongostore.collection.insert({"a": 1, "b": 2, "c": 3})
-        self.mongostore.collection.insert({"a": 4, "d": 5, "e": 6, "g": {"h": 1}})
-        self.assertEqual(self.mongostore.distinct("a"), [1, 4])
-
-        # Test list distinct functionality
-        self.mongostore.collection.insert({"a": 4, "d": 6, "e": 7})
-        self.mongostore.collection.insert({"a": 4, "d": 6, "g": {"h": 2}})
-        ad_distinct = self.mongostore.distinct(["a", "d"])
-        self.assertTrue(len(ad_distinct), 3)
-        self.assertTrue({"a": 4, "d": 6} in ad_distinct)
-        self.assertTrue({"a": 1} in ad_distinct)
-        self.assertEqual(len(self.mongostore.distinct(["d", "e"], {"a": 4})), 3)
-        all_exist = self.mongostore.distinct(["a", "b"], all_exist=True)
-        self.assertEqual(len(all_exist), 1)
-        all_exist2 = self.mongostore.distinct(["a", "e"], all_exist=True, criteria={"d": 6})
-        self.assertEqual(len(all_exist2), 1)
-
-        # Test distinct subdocument functionality
-        ghs = self.mongostore.distinct("g.h")
-        self.assertEqual(set(ghs), {1, 2})
-        ghs_ds = self.mongostore.distinct(["d", "g.h"], all_exist=True)
-        self.assertEqual({s['g']['h'] for s in ghs_ds}, {1, 2})
-        self.assertEqual({s['d'] for s in ghs_ds}, {5, 6})
-
-    def test_update(self):
-        self.mongostore.update([{"e": 6, "d": 4}], key="e")
-        self.assertEqual(self.mongostore.query(criteria={"d": {"$exists": 1}}, properties=["d"])[0]["d"], 4)
-
-        self.mongostore.update([{"e": 7, "d": 8, "f": 9}], key=["d", "f"])
-        self.assertEqual(self.mongostore.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"], 7)
-        self.mongostore.update([{"e": 11, "d": 8, "f": 9}], key=["d", "f"])
-        self.assertEqual(self.mongostore.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"], 11)
-
-    def test_groupby(self):
-        self.mongostore.collection.drop()
-        self.mongostore.update([{
-            "e": 7,
-            "d": 9,
-            "f": 9
-        }, {
-            "e": 7,
-            "d": 9,
-            "f": 10
-        }, {
-            "e": 8,
-            "d": 9,
-            "f": 11
-        }, {
-            "e": 9,
-            "d": 10,
-            "f": 12
-        }],
-                               key="f")
-        data = list(self.mongostore.groupby("d"))
-        self.assertEqual(len(data), 2)
-        grouped_by_9 = [g['docs'] for g in data if g['_id']['d'] == 9][0]
-        self.assertEqual(len(grouped_by_9), 3)
-        grouped_by_10 = [g['docs'] for g in data if g['_id']['d'] == 10][0]
-        self.assertEqual(len(grouped_by_10), 1)
-
-        data = list(self.mongostore.groupby(["e", "d"]))
-        self.assertEqual(len(data), 3)
-
-    def test_from_db_file(self):
-        ms = MongoStore.from_db_file(os.path.join(db_dir, "db.json"))
-        self.assertEqual(ms.collection_name, "tmp")
-
-    def test_from_collection(self):
-        ms = MongoStore.from_db_file(os.path.join(db_dir, "db.json"))
-        ms.connect()
-
-        other_ms = MongoStore.from_collection(ms._collection)
-        self.assertEqual(ms.collection_name, other_ms.collection_name)
-        self.assertEqual(ms.database, other_ms.database)
-
-    def test_last_updated(self):
-        self.assertEqual(self.mongostore.last_updated, datetime.min)
-        tic = datetime.now()
-        self.mongostore.collection.insert_one({self.mongostore.key: 1, "a": 1})
-        with self.assertRaises(StoreError) as cm:
-            self.mongostore.last_updated
-        self.assertIn(self.mongostore.lu_field, str(cm.exception))
-        self.mongostore.update([{self.mongostore.key: 1, "a": 1}])
-        self.assertGreaterEqual(self.mongostore.last_updated, tic)
-
-    def test_updated_keys(self):
-        target = MongoStore("maggma_test", "test_target")
-        target.connect()
-
-        docs = []
-        for i in range(10):
-            docs.append({self.mongostore.key: i})
-
-        # Insert docs in source
-        self.mongostore.update(docs)
-        # Make copy in target
-        update_docs = list(self.mongostore.query())
-        for d in update_docs:
-            del d["_id"]
-        target.update(update_docs, update_lu=False)
-
-        # Update docs in source
-        self.mongostore.collection.drop()
-        self.mongostore.update(docs)
-
-        self.assertEqual(len(target.updated_keys(self.mongostore)), 10)
-        self.assertEqual(len(self.mongostore.updated_keys(target)), 0)
-
-        target.collection.drop()
-
-    def tearDown(self):
-        try:
-            self.mongostore.collection.drop()
-        except:
-            pass
-
-
-class TestMemoryStore(unittest.TestCase):
-    def setUp(self):
-        self.memstore = MemoryStore()
-
-    def test(self):
-        with self.assertRaises(Exception):
-            self.memstore.collection
-        self.memstore.connect()
-        self.assertIsInstance(self.memstore.collection, mongomock.collection.Collection)
-
-    def test_groupby(self):
-        self.memstore.connect()
-        self.memstore.update([{
-            "e": 7,
-            "d": 9,
-            "f": 9
-        }, {
-            "e": 7,
-            "d": 9,
-            "f": 10
-        }, {
-            "e": 8,
-            "d": 9,
-            "f": 11
-        }, {
-            "e": 9,
-            "d": 10,
-            "f": 12
-        }],
-                             key="f")
-        data = list(self.memstore.groupby("d"))
-        self.assertEqual(len(data), 2)
-        grouped_by_9 = [g['docs'] for g in data if g['_id']['d'] == 9][0]
-        self.assertEqual(len(grouped_by_9), 3)
-        grouped_by_10 = [g['docs'] for g in data if g['_id']['d'] == 10][0]
-        self.assertEqual(len(grouped_by_10), 1)
-
-        data = list(self.memstore.groupby(["e", "d"]))
-        self.assertEqual(len(data), 3)
-
-
-class TestJsonStore(unittest.TestCase):
-    def test(self):
-        files = []
-        for f in ["a.json", "b.json"]:
-            files.append(os.path.join(test_dir, f))
-
-        jsonstore = JSONStore(files)
-        jsonstore.connect()
-        self.assertEqual(len(list(jsonstore.query())), 20)
-
-        jsonstore = JSONStore(os.path.join(test_dir, "c.json.gz"))
-        jsonstore.connect()
-        self.assertEqual(len(list(jsonstore.query())), 20)
-
-
-class TestGridFSStore(unittest.TestCase):
-    def setUp(self):
-        self.gStore = GridFSStore("maggma_test", "test", key="task_id")
-        self.gStore.connect()
-
-    def test_update(self):
-        data1 = np.random.rand(256)
-        data2 = np.random.rand(256)
-        # Test metadata storage
-        self.gStore.update([{"task_id": "mp-1", "data": data1}])
-        self.assertTrue(self.gStore._files_collection.find_one({"metadata.task_id": "mp-1"}))
-
-        # Test storing data
-        self.gStore.update([{"task_id": "mp-1", "data": data2}])
-        self.assertEqual(len(list(self.gStore.query({"task_id": "mp-1"}))), 1)
-        self.assertTrue("task_id" in self.gStore.query_one({"task_id": "mp-1"}))
-        nptu.assert_almost_equal(self.gStore.query_one({"task_id": "mp-1"})["data"], data2, 7)
-
-        # Test storing compressed data
-        self.gStore = GridFSStore("maggma_test", "test", key="task_id", compression=True)
-        self.gStore.connect()
-        self.gStore.update([{"task_id": "mp-1", "data": data1}])
-        self.assertTrue(self.gStore._files_collection.find_one({"metadata.compression": "zlib"}))
-        nptu.assert_almost_equal(self.gStore.query_one({"task_id": "mp-1"})["data"], data1, 7)
-
-    def test_query(self):
-        data1 = np.random.rand(256)
-        data2 = np.random.rand(256)
-        tic = datetime(2018, 4, 12, 16)
-        self.gStore.update([{"task_id": "mp-1", "data": data1}])
-        self.gStore.update([{"task_id": "mp-2", "data": data2, self.gStore.lu_field: tic}], update_lu=False)
-
-        doc = self.gStore.query_one(criteria={"task_id": "mp-1"})
-        nptu.assert_almost_equal(doc["data"], data1, 7)
-
-        doc = self.gStore.query_one(criteria={"task_id": "mp-2"})
-        nptu.assert_almost_equal(doc["data"], data2, 7)
-        self.assertTrue(self.gStore.lu_field in doc)
-
-        self.assertEqual(self.gStore.query_one(criteria={"task_id": "mp-3"}), None)
-
-    @unittest.skip
-    def test_distinct(self):
-        # TODO
-        pass
-
-    def tearDown(self):
-        if self.gStore.collection:
-            self.gStore._files_collection.drop()
-            self.gStore._chunks_collection.drop()
-
-
-if __name__ == "__main__":
-    unittest.main()

From 10f65e0ecf9dc2f40bc4438f2689d798863ef92b Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 7 Nov 2019 14:07:57 -0800
Subject: [PATCH 24/99] update builders

---
 maggma/builders.py            |  37 +++---
 maggma/tests/test_builders.py | 235 +++++++++++++++++-----------------
 2 files changed, 135 insertions(+), 137 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 711f090e7..96af1a9c1 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -15,7 +15,7 @@ class MapBuilder(Builder, metaclass=ABCMeta):
     Apply a unary function to yield a target document for each source document.
 
     Supports incremental building, where a source document gets built only if it
-    has newer (by lu_field) data than the corresponding (by key) target
+    has newer (by last_updated_field) data than the corresponding (by key) target
     document.
 
     """
@@ -41,7 +41,7 @@ def __init__(
             target (Store): target store
             ufn (function): Unary function to process item
                             You do not need to provide values for
-                            source.key and source.lu_field in the output.
+                            source.key and source.last_updated_field in the output.
                             Any uncaught exceptions will be caught by
                             process_item and logged to the "error" field
                             in the target document.
@@ -73,9 +73,9 @@ def ensure_indexes(self):
 
         index_checks = [
             self.source.ensure_index(self.source.key),
-            self.source.ensure_index(self.source.lu_field),
+            self.source.ensure_index(self.source.last_updated_field),
             self.target.ensure_index(self.target.key),
-            self.target.ensure_index(self.target.lu_field),
+            self.target.ensure_index(self.target.last_updated_field),
         ]
 
         if not all(index_checks):
@@ -83,7 +83,7 @@ def ensure_indexes(self):
                 "Missing one or more important indices on stores. "
                 "Performance for large stores may be severely degraded. "
                 "Ensure indices on target.key and "
-                "[(store.lu_field, -1), (store.key, 1)] "
+                "[(store.last_updated_field, -1), (store.key, 1)] "
                 "for each of source and target."
             )
 
@@ -94,17 +94,17 @@ def get_items(self):
         self.ensure_indexes()
 
         if self.incremental:
-            keys = source_keys_updated(
-                source=self.source, target=self.target, query=self.query
+            keys = self.target.newer_in(
+                self.source, criteria=self.query, exhaustive=True
             )
         else:
-            keys = self.source.distinct(self.source.key, self.query)
+            keys = self.source.distinct(self.source.key, criteria=self.query)
 
         self.logger.info("Processing {} items".format(len(keys)))
 
         if self.projection:
             projection = list(
-                set(self.projection + [self.source.key, self.source.lu_field])
+                set(self.projection + [self.source.key, self.source.last_updated_field])
             )
         else:
             projection = None
@@ -135,11 +135,13 @@ def process_item(self, item):
 
         time_end = time()
 
-        key, lu_field = self.source.key, self.source.lu_field
+        key, last_updated_field = self.source.key, self.source.last_updated_field
 
         out = {
             self.target.key: item[key],
-            self.target.lu_field: self.source.lu_func[0](item[lu_field]),
+            self.target.last_updated_field: self.source._lu_func[0](
+                item[last_updated_field]
+            ),
         }
         if self.store_process_time:
             out["_process_time"] = time_end - time_start
@@ -151,18 +153,21 @@ def update_targets(self, items):
         source, target = self.source, self.target
         for item in items:
             # Use source last-updated value, ensuring `datetime` type.
-            item[target.lu_field] = source.lu_func[0](item[source.lu_field])
-            if source.lu_field != target.lu_field:
-                del item[source.lu_field]
+            item[target.last_updated_field] = source._lu_func[0](
+                item[source.last_updated_field]
+            )
+            if source.last_updated_field != target.last_updated_field:
+                del item[source.last_updated_field]
             item["_bt"] = datetime.utcnow()
             if "_id" in item:
                 del item["_id"]
 
         if len(items) > 0:
-            target.update(items, update_lu=False)
+            target.update(items)
 
     def finalize(self, cursor=None):
         if self.delete_orphans:
+            # TODO: Should we add delete to standard Store?
             if not hasattr(self.target, "collection"):
                 self.logger.warning(
                     "delete_orphans parameter is only supported for "
@@ -187,7 +192,7 @@ class GroupBuilder(MapBuilder, metaclass=ABCMeta):
     Group source docs and produce one target doc from each group.
 
     Supports incremental building, where a source group gets (re)built only if
-    it has a newer (by lu_field) doc than the corresponding (by key) target doc.
+    it has a newer (by last_updated_field) doc than the corresponding (by key) target doc.
     """
 
     def __init__(self, source, target, query=None, **kwargs):
diff --git a/maggma/tests/test_builders.py b/maggma/tests/test_builders.py
index b6e6f44ff..53498906f 100644
--- a/maggma/tests/test_builders.py
+++ b/maggma/tests/test_builders.py
@@ -1,126 +1,119 @@
-"""Test maggma.examples.builders.CopyBuilder."""
-
-import logging
-import unittest
+# coding: utf-8
+"""
+Tests for builders
+"""
+import pytest
 from datetime import datetime, timedelta
-from unittest import TestCase
-from uuid import uuid4
 
-from maggma.stores import MongoStore
+from maggma.stores import MemoryStore
 from maggma.builders import CopyBuilder
 
 
-class TestCopyBuilder(TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.dbname = "test_" + uuid4().hex
-        s = MongoStore(cls.dbname, "test")
-        s.connect()
-        cls.client = s.collection.database.client
-
-    @classmethod
-    def tearDownClass(cls):
-        cls.client.drop_database(cls.dbname)
-
-    def setUp(self):
-        tic = datetime.now()
-        toc = tic + timedelta(seconds=1)
-        keys = list(range(20))
-        self.old_docs = [{"lu": tic, "k": k, "v": "old"} for k in keys]
-        self.new_docs = [{"lu": toc, "k": k, "v": "new"} for k in keys[:10]]
-        kwargs = dict(key="k", lu_field="lu")
-        self.source = MongoStore(self.dbname, "source", **kwargs)
-        self.target = MongoStore(self.dbname, "target", **kwargs)
-        self.builder = CopyBuilder(self.source, self.target)
-
-        self.source.connect()
-        self.source.ensure_index(self.source.key)
-        self.source.ensure_index(self.source.lu_field)
-
-        self.target.connect()
-        self.target.ensure_index(self.target.key)
-        self.target.ensure_index(self.target.lu_field)
-
-    def tearDown(self):
-        self.source.collection.drop()
-        self.target.collection.drop()
-
-    def test_get_items(self):
-        self.source.collection.insert_many(self.old_docs)
-        self.assertEqual(len(list(self.builder.get_items())), len(self.old_docs))
-        self.target.collection.insert_many(self.old_docs)
-        self.assertEqual(len(list(self.builder.get_items())), 0)
-        self.source.update(self.new_docs, update_lu=False)
-        self.assertEqual(len(list(self.builder.get_items())), len(self.new_docs))
-
-    def test_process_item(self):
-        self.source.collection.insert_many(self.old_docs)
-        items = list(self.builder.get_items())
-        self.assertCountEqual(items, map(self.builder.process_item, items))
-
-    def test_update_targets(self):
-        self.source.collection.insert_many(self.old_docs)
-        self.source.update(self.new_docs, update_lu=False)
-        self.target.collection.insert_many(self.old_docs)
-        items = list(map(self.builder.process_item, self.builder.get_items()))
-        self.builder.update_targets(items)
-        self.assertEqual(self.target.query_one(criteria={"k": 0})["v"], "new")
-        self.assertEqual(self.target.query_one(criteria={"k": 10})["v"], "old")
-
-    @unittest.skip("Have to refactor how we force read-only so a warning will get thrown")
-    def test_index_warning(self):
-        """Should log warning when recommended store indexes are not present."""
-        self.source.collection.drop_index([(self.source.key,1)])
-        with self.assertLogs(level=logging.WARNING) as cm:
-            list(self.builder.get_items())
-        self.assertIn("Ensure indices", "\n".join(cm.output))
-
-    def test_run(self):
-        self.source.collection.insert_many(self.old_docs)
-        self.source.update(self.new_docs, update_lu=False)
-        self.target.collection.insert_many(self.old_docs)
-        self.builder.run()
-        self.assertEqual(self.target.query_one(criteria={"k": 0})["v"], "new")
-        self.assertEqual(self.target.query_one(criteria={"k": 10})["v"], "old")
-
-    def test_query(self):
-        self.builder.query = {"k": {"$gt": 5}}
-        self.source.collection.insert_many(self.old_docs)
-        self.source.update(self.new_docs, update_lu=False)
-        self.builder.run()
-        all_docs = list(self.target.query(criteria={}))
-        self.assertEqual(len(all_docs), 14)
-        self.assertTrue(min([d['k'] for d in all_docs]), 6)
-
-    def test_delete_orphans(self):
-        self.builder = CopyBuilder(self.source, self.target, delete_orphans=True)
-        self.source.collection.insert_many(self.old_docs)
-        self.source.update(self.new_docs, update_lu=False)
-        self.target.collection.insert_many(self.old_docs)
-
-        deletion_criteria = {"k": {"$in": list(range(5))}}
-        self.source.collection.delete_many(deletion_criteria)
-        self.builder.run()
-
-        self.assertEqual(self.target.collection.count_documents(deletion_criteria), 0)
-        self.assertEqual(self.target.query_one(criteria={"k": 5})["v"], "new")
-        self.assertEqual(self.target.query_one(criteria={"k": 10})["v"], "old")
-
-    def test_incremental_false(self):
-        tic = datetime.now()
-        toc = tic + timedelta(seconds=1)
-        keys = list(range(20))
-        earlier = [{"lu": tic, "k": k, "v": "val"} for k in keys]
-        later = [{"lu": toc, "k": k, "v": "val"} for k in keys]
-        self.source.collection.insert_many(earlier)
-        self.target.collection.insert_many(later)
-        query = {"k": {"$gt": 5}}
-        self.builder = CopyBuilder(self.source, self.target, incremental=False, query=query)
-        self.builder.run()
-        docs = sorted(self.target.query(), key=lambda d: d["k"])
-        self.assertTrue(all(d["lu"] == tic) for d in docs[5:])
-        self.assertTrue(all(d["lu"] == toc) for d in docs[:5])
-
-
-if __name__ == "__main__":
-    unittest.main()
+@pytest.fixture
+def source():
+    store = MemoryStore("source", key="k", last_updated_field="lu")
+    store.connect()
+    store.ensure_index("k")
+    store.ensure_index("lu")
+    return store
+
+
+@pytest.fixture
+def target():
+    store = MemoryStore("target", key="k", last_updated_field="lu")
+    store.connect()
+    store.ensure_index("k")
+    store.ensure_index("lu")
+    return store
+
+
+@pytest.fixture("module")
+def now():
+    return datetime.now()
+
+
+@pytest.fixture
+def old_docs(now):
+    return [{"lu": now, "k": k, "v": "old"} for k in range(20)]
+
+
+@pytest.fixture
+def new_docs(now):
+    toc = now + timedelta(seconds=1)
+    return [{"lu": toc, "k": k, "v": "new"} for k in range(0, 10)]
+
+
+def test_get_items(source, target, old_docs):
+    builder = CopyBuilder(source, target)
+    source.update(old_docs)
+    assert len(list(builder.get_items())) == len(old_docs)
+    target.update(old_docs)
+    assert len(list(builder.get_items())) == 0
+
+
+def test_process_item(source, target, old_docs):
+    builder = CopyBuilder(source, target)
+    source.update(old_docs)
+    items = list(builder.get_items())
+    assert len(items) == len(list(map(builder.process_item, items)))
+
+
+def test_update_targets(source, target, old_docs, new_docs):
+    builder = CopyBuilder(source, target)
+    builder.update_targets(old_docs)
+    builder.update_targets(new_docs)
+    assert target.query_one(criteria={"k": 0})["v"] == "new"
+    assert target.query_one(criteria={"k": 10})["v"] == "old"
+
+
+def test_run(source, target, old_docs, new_docs):
+    source.update(old_docs)
+    source.update(new_docs)
+    target.update(old_docs)
+
+    builder = CopyBuilder(source, target)
+    builder.run()
+    assert target.query_one(criteria={"k": 0})["v"] == "new"
+    assert target.query_one(criteria={"k": 10})["v"] == "old"
+
+
+def test_query(source, target, old_docs, new_docs):
+    builder = CopyBuilder(source, target)
+    builder.query = {"k": {"$gt": 5}}
+    source.update(old_docs)
+    source.update(new_docs)
+    builder.run()
+    all_docs = list(target.query(criteria={}))
+    assert len(all_docs) == 14
+    assert min([d["k"] for d in all_docs]) == 6
+
+
+def test_delete_orphans(source, target, old_docs, new_docs):
+    builder = CopyBuilder(source, target, delete_orphans=True)
+    source.update(old_docs)
+    source.update(new_docs)
+    target.update(old_docs)
+
+    deletion_criteria = {"k": {"$in": list(range(5))}}
+    source.collection.delete_many(deletion_criteria)
+    builder.run()
+
+    assert target.collection.count_documents(deletion_criteria) == 0
+    assert target.query_one(criteria={"k": 5})["v"] == "new"
+    assert target.query_one(criteria={"k": 10})["v"] == "old"
+
+
+def test_incremental_false(source, target, old_docs, new_docs):
+    tic = datetime.now()
+    toc = tic + timedelta(seconds=1)
+    keys = list(range(20))
+    earlier = [{"lu": tic, "k": k, "v": "val"} for k in keys]
+    later = [{"lu": toc, "k": k, "v": "val"} for k in keys]
+    source.update(earlier)
+    target.update(later)
+    query = {"k": {"$gt": 5}}
+    builder = CopyBuilder(source, target, incremental=False, query=query)
+    builder.run()
+    docs = sorted(target.query(), key=lambda d: d["k"])
+    assert (all(d["lu"] == tic) for d in docs[5:])
+    assert (all(d["lu"] == toc) for d in docs[:5])

From 1909a4dbcb07424da2944747c8bd8aead47978a4 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 7 Nov 2019 14:12:54 -0800
Subject: [PATCH 25/99] update validator tests

---
 maggma/tests/test_validator.py | 130 ++++++++++++++++-----------------
 1 file changed, 63 insertions(+), 67 deletions(-)

diff --git a/maggma/tests/test_validator.py b/maggma/tests/test_validator.py
index b9c291fdf..414eb51c9 100644
--- a/maggma/tests/test_validator.py
+++ b/maggma/tests/test_validator.py
@@ -2,75 +2,71 @@
 """
 Tests the validators
 """
-import unittest
-from maggma.validator import JSONSchemaValidator, msonable_schema
+import pytest
+from maggma.validators import JSONSchemaValidator, msonable_schema
 from monty.json import MSONable
 
-class ValidatorTests(unittest.TestCase):
+
+class LatticeMock(MSONable):
     """
-    Tests for Validators.
+    A sample MSONable object, just for testing.
     """
 
-    def test_jsonschemevalidator(self):
-        """
-        Test the JSONSchemaValidator class.
-        """
-
-        class LatticeMock(MSONable):
-            """
-            A sample MSONable object, just for testing.
-            """
-            def __init__(self, a):
-                self.a = a
-
-        test_schema = {
-            "type": "object",
-            "properties":
-                {
-                    "task_id": {"type": "string"},
-                    "successful": {"type": "boolean"},
-                    "lattice": msonable_schema(LatticeMock)
-                },
-            "required": ["task_id", "successful"]
-        }
-
-        validator = JSONSchemaValidator(schema=test_schema)
-
-        lattice = LatticeMock(5)
-
-        valid_doc = {
-            'task_id': 'mp-test',
-            'successful': True,
-            'lattice': lattice.as_dict()
-        }
-
-        invalid_doc_msonable = {
-            'task_id': 'mp-test',
-            'successful': True,
-            'lattice': ['I am not a lattice!']
-        }
-
-        invalid_doc_missing_key = {
-            'task_id': 'mp-test',
-            'lattice': lattice.as_dict()
-        }
-
-        invalid_doc_wrong_type = {
-            'task_id': 'mp-test',
-            'successful': 'true',
-            'lattice': lattice.as_dict()
-        }
-
-        self.assertTrue(validator.is_valid(valid_doc))
-        self.assertFalse(validator.is_valid(invalid_doc_msonable))
-        self.assertFalse(validator.is_valid(invalid_doc_missing_key))
-        self.assertFalse(validator.is_valid(invalid_doc_wrong_type))
-
-        self.assertListEqual(validator.validation_errors(invalid_doc_msonable),
-                             ["lattice: ['I am not a lattice!'] is not of type 'object'"])
-
-        self.assertListEqual(validator.validation_errors(invalid_doc_missing_key),
-                             [": 'successful' is a required property"])
-
-        self.assertListEqual(validator.validation_errors(invalid_doc_wrong_type),
-                             ["successful: 'true' is not of type 'boolean'"])
+    def __init__(self, a):
+        self.a = a
+
+
+@pytest.fixture
+def test_schema():
+    return {
+        "type": "object",
+        "properties": {
+            "task_id": {"type": "string"},
+            "successful": {"type": "boolean"},
+            "lattice": msonable_schema(LatticeMock),
+        },
+        "required": ["task_id", "successful"],
+    }
+
+
+def test_jsonschemevalidator(test_schema):
+    """
+    Test the JSONSchemaValidator class.
+    """
+
+    validator = JSONSchemaValidator(schema=test_schema)
+
+    lattice = LatticeMock(5)
+
+    valid_doc = {"task_id": "mp-test", "successful": True, "lattice": lattice.as_dict()}
+
+    invalid_doc_msonable = {
+        "task_id": "mp-test",
+        "successful": True,
+        "lattice": ["I am not a lattice!"],
+    }
+
+    invalid_doc_missing_key = {"task_id": "mp-test", "lattice": lattice.as_dict()}
+
+    invalid_doc_wrong_type = {
+        "task_id": "mp-test",
+        "successful": "true",
+        "lattice": lattice.as_dict(),
+    }
+
+    assert validator.is_valid(valid_doc)
+    assert not validator.is_valid(invalid_doc_msonable)
+    assert not validator.is_valid(invalid_doc_missing_key)
+    assert not validator.is_valid(invalid_doc_wrong_type)
+
+    assert validator.validation_errors(invalid_doc_msonable) == [
+        "lattice: ['I am not a lattice!'] is not of type 'object'"
+    ]
+
+    assert validator.validation_errors(invalid_doc_missing_key) == [
+        ": 'successful' is a required property"
+    ]
+
+    assert validator.validation_errors(invalid_doc_wrong_type) == [
+        "successful: 'true' is not of type 'boolean'"
+    ]

From 0ba2fe3070325747ea50725b170f7a2ff64c4152 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 7 Nov 2019 16:21:39 -0800
Subject: [PATCH 26/99] update travis

---
 .travis.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 3ca3e2874..4e741b5a9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,7 @@
 language: python
 cache: pip
 python:
-  - "3.6"
+  - "3.7"
 install:
   - ./install-mpi.sh openmpi
   - pip install -r requirements.txt
@@ -29,15 +29,13 @@ before_script:
   - cd -
 script:
   - mpiexec -n 2  python $PWD/maggma/tests/mpi_test.py
-  - nosetests --nocapture --with-coverage --cover-package=maggma
+  - pytest --cov=maggma/
 after_success:
   - coveralls
 notifications:
   email:
     recipients:
-      - montoyjh@lbl.gov
       - shyamd@lbl.gov
-      - dwinston@lbl.gov
     on_success: change
     on_failure: always
 deploy:

From 63e71a5fce6065c1c2081ee48e91009b19b60e75 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 14 Nov 2019 16:53:59 -0800
Subject: [PATCH 27/99] move around functions

---
 maggma/core/store.py | 89 +++++++++++++++++++++-----------------------
 1 file changed, 43 insertions(+), 46 deletions(-)

diff --git a/maggma/core/store.py b/maggma/core/store.py
index bc000667c..802b6b89e 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -6,7 +6,6 @@
 
 import logging
 
-
 from abc import ABCMeta, abstractmethod, abstractproperty
 
 from datetime import datetime
@@ -108,51 +107,6 @@ def query(
         """
         pass
 
-    def query_one(self, criteria=None, properties=None, **kwargs):
-        """
-        Function that gets a single document from GridFS. This store
-        ignores all property projections as its designed for whole
-        document access
-
-        Args:
-            criteria (dict): filter for query, matches documents
-                against key-value pairs
-            properties (list or dict): This will be ignored by the GridFS
-                Store
-            **kwargs (kwargs): further kwargs to Collection.find
-        """
-        return next(self.query(criteria=criteria, **kwargs), None)
-
-    def distinct(
-        self,
-        field: Union[List[str], str],
-        criteria: Optional[Dict] = None,
-        all_exist: bool = False,
-    ) -> Union[List[Dict], List]:
-        """
-        Get all distinct values for a field(s)
-        For a single field, this returns a list of values
-        For multiple fields, this return a list of of dictionaries for each unique combination
-
-        Args:
-            field: the field(s) to get distinct values for
-            criteria : PyMongo filter for documents to search in
-            all_exist : ensure all fields exist for the distinct set
-        """
-        field = field if isinstance(field, list) else [field]
-
-        criteria = criteria or {}
-
-        if all_exist:
-            criteria.update({f: {"$exists": 1} for f in field if f not in criteria})
-        results = [
-            key for key, _ in self.groupby(field, properties=field, criteria=criteria)
-        ]
-        # Flatten out results if searching for a single field
-        if len(field) == 1:
-            results = [get(r, field[0]) for r in results]
-        return results
-
     @abstractmethod
     def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
         """
@@ -207,6 +161,49 @@ def groupby(
         """
         pass
 
+    def query_one(self, criteria=None, properties=None, sort=sort):
+        """
+        Queries the Store for a single document
+
+        Args:
+            criteria : PyMongo filter for documents to search
+            properties: properties to return in the document
+            sort: Dictionary of sort order for fields
+        """
+        return next(
+            self.query(criteria=criteria, properties=properties, sort=sort), None
+        )
+
+    def distinct(
+        self,
+        field: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        all_exist: bool = False,
+    ) -> Union[List[Dict], List]:
+        """
+        Get all distinct values for a field(s)
+        For a single field, this returns a list of values
+        For multiple fields, this return a list of of dictionaries for each unique combination
+
+        Args:
+            field: the field(s) to get distinct values for
+            criteria : PyMongo filter for documents to search in
+            all_exist : ensure all fields exist for the distinct set
+        """
+        field = field if isinstance(field, list) else [field]
+
+        criteria = criteria or {}
+
+        if all_exist:
+            criteria.update({f: {"$exists": 1} for f in field if f not in criteria})
+        results = [
+            key for key, _ in self.groupby(field, properties=field, criteria=criteria)
+        ]
+        # Flatten out results if searching for a single field
+        if len(field) == 1:
+            results = [get(r, field[0]) for r in results]
+        return results
+
     @property
     def last_updated(self):
         """

From 804d3abf169a24aaf8be37a3193fa9d2227eb293 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 14 Nov 2019 16:54:48 -0800
Subject: [PATCH 28/99] add remove docs

---
 maggma/core/store.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/maggma/core/store.py b/maggma/core/store.py
index 802b6b89e..f23c756a0 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -161,6 +161,16 @@ def groupby(
         """
         pass
 
+    @abstractmethod
+    def remove_docs(self, query : Dict):
+        """
+        Remove docs matching the query dictionary
+
+        Args:
+            query: query dictionary to match 
+        """
+        pass
+
     def query_one(self, criteria=None, properties=None, sort=sort):
         """
         Queries the Store for a single document

From 0bbcc3b54dc85e2980bcae3f983f660816c94d17 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 14 Nov 2019 17:03:49 -0800
Subject: [PATCH 29/99] rename argument

---
 maggma/core/store.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/maggma/core/store.py b/maggma/core/store.py
index f23c756a0..8a2d27949 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -162,12 +162,12 @@ def groupby(
         pass
 
     @abstractmethod
-    def remove_docs(self, query : Dict):
+    def remove_docs(self, criteria : Dict):
         """
         Remove docs matching the query dictionary
 
         Args:
-            query: query dictionary to match 
+            criteria: query dictionary to match
         """
         pass
 

From f9099f9efa3b32030c3f56ca5244e06f705c2b80 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 14 Nov 2019 17:37:11 -0800
Subject: [PATCH 30/99] add type hints

---
 maggma/core/store.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/maggma/core/store.py b/maggma/core/store.py
index 8a2d27949..fda28fe7d 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -162,7 +162,7 @@ def groupby(
         pass
 
     @abstractmethod
-    def remove_docs(self, criteria : Dict):
+    def remove_docs(self, criteria: Dict):
         """
         Remove docs matching the query dictionary
 
@@ -171,7 +171,12 @@ def remove_docs(self, criteria : Dict):
         """
         pass
 
-    def query_one(self, criteria=None, properties=None, sort=sort):
+    def query_one(
+        self,
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+    ):
         """
         Queries the Store for a single document
 

From 5c75e9ab365f5aa1b443cfa3ca0f700c8a56a0e1 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 14 Nov 2019 17:40:07 -0800
Subject: [PATCH 31/99] add type hints

---
 maggma/builders.py | 89 ++++++++++++++++++++++++----------------------
 1 file changed, 46 insertions(+), 43 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 96af1a9c1..23afb42b5 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -7,7 +7,8 @@
 from datetime import datetime
 from maggma.utils import source_keys_updated, grouper, Timeout
 from time import time
-from maggma.core import Builder
+from maggma.core import Builder, Store
+from typing import Optional, Dict, List, Callable
 
 
 class MapBuilder(Builder, metaclass=ABCMeta):
@@ -22,38 +23,38 @@ class MapBuilder(Builder, metaclass=ABCMeta):
 
     def __init__(
         self,
-        source,
-        target,
-        ufn,
-        query=None,
-        incremental=True,
-        projection=None,
-        delete_orphans=False,
-        timeout=None,
-        store_process_time=True,
+        source: Store,
+        target: Store,
+        ufn: Callable,
+        query: Optional[Dict] = None,
+        incremental: bool = True,
+        projection: Optional[List] = None,
+        delete_orphans: bool = False,
+        timeout: int = 0,
+        store_process_time: bool = True,
         **kwargs
     ):
         """
         Apply a unary function to each source document.
 
         Args:
-            source (Store): source store
-            target (Store): target store
-            ufn (function): Unary function to process item
-                            You do not need to provide values for
-                            source.key and source.last_updated_field in the output.
-                            Any uncaught exceptions will be caught by
-                            process_item and logged to the "error" field
-                            in the target document.
-            query (dict): optional query to filter source store
-            incremental (bool): Whether to limit query to filter for only updated source documents.
-            projection (list): list of keys to project from the source for
+            source: source store
+            target: target store
+            ufn: Unary function to process item
+                You do not need to provide values for
+                source.key and source.last_updated_field in the output.
+                Any uncaught exceptions will be caught by
+                process_item and logged to the "error" field
+                in the target document.
+            query: optional query to filter source store
+            incremental: Whether to limit query to filter for only updated source documents.
+            projection: list of keys to project from the source for
                 processing. Limits data transfer to improve efficiency.
-            delete_orphans (bool): Whether to delete documents on target store
+            delete_orphans: Whether to delete documents on target store
                 with key values not present in source store. Deletion happens
                 after all updates, during Builder.finalize.
-            timeout (int): maximum running time per item in seconds
-            store_process_time (bool): If True, add "_process_time" key to
+            timeout: maximum running time per item in seconds
+            store_process_time: If True, add "_process_time" key to
             document for profiling purposes
         """
         self.source = source
@@ -61,7 +62,7 @@ def __init__(
         self.query = query
         self.incremental = incremental
         self.ufn = ufn
-        self.projection = projection if projection else []
+        self.projection = projection
         self.delete_orphans = delete_orphans
         self.kwargs = kwargs
         self.total = None
@@ -88,6 +89,10 @@ def ensure_indexes(self):
             )
 
     def get_items(self):
+        """
+        Generic get items for Map Builder designed to perform
+        incremental building
+        """
 
         self.logger.info("Starting {} Builder".format(self.__class__.__name__))
 
@@ -120,7 +125,11 @@ def get_items(self):
             ):
                 yield doc
 
-    def process_item(self, item):
+    def process_item(self, item: Dict):
+        """
+        Generic process items to process a dictionary using
+        a map function
+        """
 
         self.logger.debug("Processing: {}".format(item[self.source.key]))
 
@@ -149,7 +158,10 @@ def process_item(self, item):
         out.update(processed)
         return out
 
-    def update_targets(self, items):
+    def update_targets(self, items: List[Dict]):
+        """
+        Generic update targets for Map Builder
+        """
         source, target = self.source, self.target
         for item in items:
             # Use source last-updated value, ensuring `datetime` type.
@@ -167,23 +179,14 @@ def update_targets(self, items):
 
     def finalize(self, cursor=None):
         if self.delete_orphans:
-            # TODO: Should we add delete to standard Store?
-            if not hasattr(self.target, "collection"):
-                self.logger.warning(
-                    "delete_orphans parameter is only supported for "
-                    "Mongolike target stores at this time."
-                )
-            else:
-                source_keyvals = set(self.source.distinct(self.source.key))
-                target_keyvals = set(self.target.distinct(self.target.key))
-                to_delete = list(target_keyvals - source_keyvals)
-                if len(to_delete):
-                    self.logger.info(
-                        "Finalize: Deleting {} orphans.".format(len(to_delete))
-                    )
-                self.target.collection.delete_many(
-                    {self.target.key: {"$in": to_delete}}
+            source_keyvals = set(self.source.distinct(self.source.key))
+            target_keyvals = set(self.target.distinct(self.target.key))
+            to_delete = list(target_keyvals - source_keyvals)
+            if len(to_delete):
+                self.logger.info(
+                    "Finalize: Deleting {} orphans.".format(len(to_delete))
                 )
+            self.target.remove_docs({self.target.key: {"$in": to_delete}})
         super().finalize(cursor)
 
 

From 2993038526f8b6349876f1f885f8d376a61bb0fe Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 14 Nov 2019 20:34:52 -0800
Subject: [PATCH 32/99] use store close

---
 maggma/core/builder.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/maggma/core/builder.py b/maggma/core/builder.py
index 678ac8a8d..3a8fd8dd3 100644
--- a/maggma/core/builder.py
+++ b/maggma/core/builder.py
@@ -92,23 +92,16 @@ def update_targets(self, items: List):
         """
         pass
 
-    def finalize(self, cursor=None):
+    def finalize(self):
         """
         Perform any final clean up.
         """
         # Close any Mongo connections.
         for store in self.sources + self.targets:
             try:
-                store.collection.database.client.close()
+                store.close()
             except AttributeError:
                 continue
-        # Runner will pass iterable yielded by `self.get_items` as `cursor`. If
-        # this is a Mongo cursor with `no_cursor_timeout=True` (not the
-        # default), we must be explicitly kill it.
-        try:
-            cursor and cursor.close()
-        except AttributeError:
-            pass
 
     def run(self):
         """

From b8e922bbc62bfaa3f9544657050c26da269cbac3 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 14 Nov 2019 20:35:07 -0800
Subject: [PATCH 33/99] code style

---
 maggma/utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/maggma/utils.py b/maggma/utils.py
index 07910391d..1a36835d8 100644
--- a/maggma/utils.py
+++ b/maggma/utils.py
@@ -53,7 +53,7 @@ def emit(self, record):
             self.flush()
         except (KeyboardInterrupt, SystemExit):
             raise
-        except:
+        except Exception:
             self.handleError(record)
 
 
@@ -127,7 +127,7 @@ def grouper(iterable, n, fillvalue=None):
     args = [iter(iterable)] * n
     iterator = itertools.zip_longest(*args, fillvalue=fillvalue)
 
-    if fillvalue == None:
+    if fillvalue is None:
         iterator = filter(None.__ne__, iterator)
 
     return iterator
@@ -143,7 +143,7 @@ def get_mpi():
         comm = MPI.COMM_WORLD
         rank = comm.Get_rank()
         size = comm.Get_size()
-    except:
+    except Exception:
         comm = None
         rank = -1
         size = 0

From 6979c8490fb8ce62a7fc979ce440ef9edd320200 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 15 Nov 2019 12:40:29 -0800
Subject: [PATCH 34/99] more type hints

---
 maggma/builders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 23afb42b5..f6d904428 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -284,7 +284,7 @@ def group_to_items(self, group):
 class CopyBuilder(MapBuilder):
     """Sync a source store with a target store."""
 
-    def __init__(self, source, target, **kwargs):
+    def __init__(self, source: Store, target: Store, **kwargs):
         super().__init__(
             source=source,
             target=target,

From 5fc4c4b62ca7a569ed320904fed61149f4ec1a15 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 15 Nov 2019 12:40:44 -0800
Subject: [PATCH 35/99] don't override default init with less documented
 version

---
 maggma/builders.py | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index f6d904428..204f7dc13 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -198,22 +198,6 @@ class GroupBuilder(MapBuilder, metaclass=ABCMeta):
     it has a newer (by last_updated_field) doc than the corresponding (by key) target doc.
     """
 
-    def __init__(self, source, target, query=None, **kwargs):
-        """
-
-        Given criteria, get docs with needed grouping properties. With these
-        minimal docs, yield groups. For each group, fetch all needed data for
-        item processing, and yield one or more items (i.e. subgroups as
-        appropriate).
-
-        Args:
-            source (Store): source store
-            target (Store): target store
-            query (dict): optional query to filter source store
-        """
-        super().__init__(source, target, query=query, **kwargs)
-        self.total = None
-
     def get_items(self):
         criteria = source_keys_updated(self.source, self.target, query=self.query)
         if all(isinstance(entry, str) for entry in self.grouping_properties()):

From 5685020e69ac30314d647a48f096b9de837b01bd Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 15 Nov 2019 12:48:23 -0800
Subject: [PATCH 36/99] make unary function part of class defintion

---
 maggma/builders.py | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 204f7dc13..8820d2277 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -8,7 +8,7 @@
 from maggma.utils import source_keys_updated, grouper, Timeout
 from time import time
 from maggma.core import Builder, Store
-from typing import Optional, Dict, List, Callable
+from typing import Optional, Dict, List
 
 
 class MapBuilder(Builder, metaclass=ABCMeta):
@@ -25,7 +25,6 @@ def __init__(
         self,
         source: Store,
         target: Store,
-        ufn: Callable,
         query: Optional[Dict] = None,
         incremental: bool = True,
         projection: Optional[List] = None,
@@ -40,12 +39,6 @@ def __init__(
         Args:
             source: source store
             target: target store
-            ufn: Unary function to process item
-                You do not need to provide values for
-                source.key and source.last_updated_field in the output.
-                Any uncaught exceptions will be caught by
-                process_item and logged to the "error" field
-                in the target document.
             query: optional query to filter source store
             incremental: Whether to limit query to filter for only updated source documents.
             projection: list of keys to project from the source for
@@ -61,7 +54,6 @@ def __init__(
         self.target = target
         self.query = query
         self.incremental = incremental
-        self.ufn = ufn
         self.projection = projection
         self.delete_orphans = delete_orphans
         self.kwargs = kwargs
@@ -137,7 +129,7 @@ def process_item(self, item: Dict):
 
         try:
             with Timeout(seconds=self.timeout):
-                processed = self.ufn.__call__(item)
+                processed = self.unary_function(item)
         except Exception as e:
             self.logger.error(traceback.format_exc())
             processed = {"error": str(e)}
@@ -189,6 +181,18 @@ def finalize(self, cursor=None):
             self.target.remove_docs({self.target.key: {"$in": to_delete}})
         super().finalize(cursor)
 
+    @abstractmethod
+    def unary_function(self, item):
+        """
+        ufn: Unary function to process item
+                You do not need to provide values for
+                source.key and source.last_updated_field in the output.
+                Any uncaught exceptions will be caught by
+                process_item and logged to the "error" field
+                in the target document.
+        """
+        pass
+
 
 class GroupBuilder(MapBuilder, metaclass=ABCMeta):
     """
@@ -268,11 +272,5 @@ def group_to_items(self, group):
 class CopyBuilder(MapBuilder):
     """Sync a source store with a target store."""
 
-    def __init__(self, source: Store, target: Store, **kwargs):
-        super().__init__(
-            source=source,
-            target=target,
-            ufn=lambda x: x,
-            store_process_time=False,
-            **kwargs
-        )
+    def unary_function(item):
+        return item

From 4dee68f1930aee28d778df410607abcac5afcbd1 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 15 Nov 2019 16:52:19 -0800
Subject: [PATCH 37/99] Map builder should always be incremental

---
 maggma/builders.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 8820d2277..2ffe2fc03 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -40,7 +40,6 @@ def __init__(
             source: source store
             target: target store
             query: optional query to filter source store
-            incremental: Whether to limit query to filter for only updated source documents.
             projection: list of keys to project from the source for
                 processing. Limits data transfer to improve efficiency.
             delete_orphans: Whether to delete documents on target store
@@ -53,7 +52,6 @@ def __init__(
         self.source = source
         self.target = target
         self.query = query
-        self.incremental = incremental
         self.projection = projection
         self.delete_orphans = delete_orphans
         self.kwargs = kwargs
@@ -90,12 +88,10 @@ def get_items(self):
 
         self.ensure_indexes()
 
-        if self.incremental:
-            keys = self.target.newer_in(
-                self.source, criteria=self.query, exhaustive=True
-            )
-        else:
-            keys = self.source.distinct(self.source.key, criteria=self.query)
+        
+        keys = self.target.newer_in(
+            self.source, criteria=self.query, exhaustive=True
+        )
 
         self.logger.info("Processing {} items".format(len(keys)))
 

From 23e3fc04e86b5a4cf7c966bde0a109b9d2291380 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 15 Nov 2019 16:56:43 -0800
Subject: [PATCH 38/99] add prechunk algorithm

---
 maggma/builders.py     | 18 +++++++++++++-----
 maggma/core/builder.py | 15 +++++++++++++--
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 2ffe2fc03..2a38dc1dc 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -8,7 +8,7 @@
 from maggma.utils import source_keys_updated, grouper, Timeout
 from time import time
 from maggma.core import Builder, Store
-from typing import Optional, Dict, List
+from typing import Optional, Dict, List, Iterator
 
 
 class MapBuilder(Builder, metaclass=ABCMeta):
@@ -78,6 +78,17 @@ def ensure_indexes(self):
                 "for each of source and target."
             )
 
+    def prechunk(self, number_splits: int) -> Iterator[Dict]:
+        """
+        Generic prechunk for map builder to perform domain-decompostion
+        by the key field
+        """
+        self.ensure_indexes()
+        keys = self.target.newer_in(self.source, criteria=self.query, exhaustive=True)
+
+        for split in grouper(keys, number_splits):
+            yield {self.source.key: {"$in": list(filter(None.__ne__, split))}}
+
     def get_items(self):
         """
         Generic get items for Map Builder designed to perform
@@ -88,10 +99,7 @@ def get_items(self):
 
         self.ensure_indexes()
 
-        
-        keys = self.target.newer_in(
-            self.source, criteria=self.query, exhaustive=True
-        )
+        keys = self.target.newer_in(self.source, criteria=self.query, exhaustive=True)
 
         self.logger.info("Processing {} items".format(len(keys)))
 
diff --git a/maggma/core/builder.py b/maggma/core/builder.py
index 3a8fd8dd3..db3c48316 100644
--- a/maggma/core/builder.py
+++ b/maggma/core/builder.py
@@ -52,10 +52,21 @@ def connect(self):
         """
         Connect to the builder sources and targets.
         """
-        stores = self.sources + self.targets
-        for s in stores:
+        for s in self.sources + self.targets:
             s.connect()
 
+    def prechunk(self, number_splits: int) -> Iterator[Dict]:
+        """
+        Part of a domain-decomposition paradigm to allow the builder to operate on
+        multiple nodes by divinding up the IO as well as the compute
+        This function should return an iterator of dictionaries that can be distributed
+        to multiple instances of the builder to get/process/udpate on
+
+        Args:
+            number_splits: The number of groups to split the documents to work on
+        """
+        yield self.query
+
     @abstractmethod
     def get_items(self) -> Iterator:
         """

From fe2958bede439531d044b91962bc132e35c28cca Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 15 Nov 2019 17:14:36 -0800
Subject: [PATCH 39/99] get rid of source_keys_updated

---
 maggma/core/store.py | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/maggma/core/store.py b/maggma/core/store.py
index fda28fe7d..df56209f4 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -16,7 +16,7 @@
 
 from monty.dev import deprecated
 from monty.json import MSONable, MontyDecoder
-from maggma.utils import source_keys_updated, LU_KEY_ISOFORMAT
+from maggma.utils import LU_KEY_ISOFORMAT
 from maggma.core import Validator
 
 
@@ -267,7 +267,28 @@ def newer_in(
         self.ensure_index(self.key)
         self.ensure_index(self.last_updated_field)
         if exhaustive:
-            return source_keys_updated(target, self, query=criteria)
+
+            # Get our current last_updated dates for each key value
+            props = {self.key: 1, self.last_updated_field: 1, "_id": 0}
+            dates = {
+                d[self.key]: self._lu_func[0](d[self.last_updated_field])
+                for d in self.query(properties=props)
+            }
+
+            # Get the
+            props = {target.key: 1, target.last_updated_field: 1, "_id": 0}
+            target_dates = {
+                d[target.key]: target._lu_func[0](d[target.last_updated_field])
+                for d in target.query(criteria=criteria, properties=props)
+            }
+
+            new_keys = set(target_dates.keys()) - set(dates.keys())
+            updated_keys = {
+                key for key, date in dates.items() if target_dates[key] > date
+            }
+
+            return list(new_keys | updated_keys)
+
         else:
             key = key if key is not None else self.key  # Default value
             criteria = {
@@ -308,7 +329,7 @@ def updated_keys(self, target, criteria=None):
         self.ensure_index(self.key)
         self.ensure_index(self.last_updated_field)
 
-        return source_keys_updated(target, self, query=criteria)
+        return self.newer_in(target, criteria=criteria)
 
     def __eq__(self, other):
         return hash(self) == hash(other)

From 72981673e13e2346b58eb966749f2539cc1b8832 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 15 Nov 2019 17:14:55 -0800
Subject: [PATCH 40/99] ensure exhaustive mode works

---
 maggma/stores/tests/test_mongolike.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
index faaf17b57..1ef7a4823 100644
--- a/maggma/stores/tests/test_mongolike.py
+++ b/maggma/stores/tests/test_mongolike.py
@@ -146,6 +146,7 @@ def test_mongostore_newer_in(mongostore):
     )
 
     assert len(target.newer_in(mongostore)) == 10
+    assert len(target.newer_in(mongostore, exhaustive=True)) == 10
     assert len(mongostore.newer_in(target)) == 0
 
     target._collection.drop()

From a2f228bdd82ac0d707419dd253a764cbe7936922 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 15 Nov 2019 17:23:16 -0800
Subject: [PATCH 41/99] add remove_docs for mongostores

---
 maggma/stores/mongolike.py            | 12 +++++++++++-
 maggma/stores/tests/test_mongolike.py |  8 +++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index 8d07e1380..a32a59ea3 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -238,7 +238,17 @@ def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = No
 
                 requests.append(ReplaceOne(search_doc, d, upsert=True))
 
-        self._collection.bulk_write(requests, ordered=False)
+        if len(requests) > 0:
+            self._collection.bulk_write(requests, ordered=False)
+
+    def remove_docs(self, criteria: Dict):
+        """
+        Remove docs matching the query dictionary
+
+        Args:
+            criteria: query dictionary to match
+        """
+        self._collection.delete_many(filter=criteria)
 
     def close(self):
         self._collection.database.client.close()
diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
index 1ef7a4823..38a2d770e 100644
--- a/maggma/stores/tests/test_mongolike.py
+++ b/maggma/stores/tests/test_mongolike.py
@@ -76,7 +76,6 @@ def test_mongostore_update(mongostore):
 
 
 def test_mongostore_groupby(mongostore):
-    mongostore._collection.drop()
     mongostore.update(
         [
             {"e": 7, "d": 9, "f": 9},
@@ -97,6 +96,13 @@ def test_mongostore_groupby(mongostore):
     assert len(data) == 3
 
 
+def test_mongostore_remove_docs(mongostore):
+    mongostore._collection.insert_one({"a": 1, "b": 2, "c": 3})
+    mongostore._collection.insert_one({"a": 4, "d": 5, "e": 6, "g": {"h": 1}})
+    mongostore.remove_docs({"a": 1})
+    assert len(list(mongostore.query({"a": 4}))) == 1
+    assert len(list(mongostore.query({"a": 1}))) == 0
+
 def test_mongostore_from_db_file(mongostore, db_json):
     ms = MongoStore.from_db_file(db_json)
     assert ms._collection_name == "tmp"

From 6da4da88bfb59fd8b838537d1bc2ff6a37f22973 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Sat, 16 Nov 2019 11:56:29 -0800
Subject: [PATCH 42/99] add remove to gridfs

---
 maggma/stores/gridfs.py            | 181 +++++++++++++++++------------
 maggma/stores/tests/test_gridfs.py |  33 +++++-
 2 files changed, 137 insertions(+), 77 deletions(-)

diff --git a/maggma/stores/gridfs.py b/maggma/stores/gridfs.py
index 02a19259f..c3ee4c72e 100644
--- a/maggma/stores/gridfs.py
+++ b/maggma/stores/gridfs.py
@@ -5,6 +5,9 @@
 various utillities
 """
 from __future__ import annotations
+
+from typing import Union, Optional, Dict, List, Iterator, Tuple
+
 import copy
 from datetime import datetime
 import json
@@ -14,10 +17,12 @@
 
 from pymongo import MongoClient
 from monty.json import jsanitize
+from monty.dev import deprecated
 from maggma.utils import confirm_field_index
-from maggma.core import Store
+from maggma.core import Store, Sort
 
 
+# TODO: Make arguments more specific for this
 class GridFSStore(Store):
     """
     A Store for GrdiFS backend. Provides a common access method consistent with other stores
@@ -74,12 +79,12 @@ def __init__(
 
         if "key" not in kwargs:
             kwargs["key"] = "_id"
-
-        kwargs["last_updated_field"] = "uploadDate"
-
         super().__init__(**kwargs)
 
-    def connect(self, force_reset=False):
+    def connect(self, force_reset: bool = False):
+        """
+        Connect to the source data
+        """
         conn = MongoClient(self.host, self.port)
         if not self._collection or force_reset:
             db = conn[self.database]
@@ -91,12 +96,16 @@ def connect(self, force_reset=False):
             self._chunks_collection = db["{}.chunks".format(self.collection_name)]
 
     @property
+    @deprecated(message="This will be removed in the future")
     def collection(self):
-        # TODO: Should this return the real MongoCollection or the GridFS
         return self._collection
 
     @property
-    def last_updated(self):
+    def last_updated(self) -> datetime:
+        """
+        Provides the most recent last_updated date time stamp from
+        the documents in this Store
+        """
         doc = next(
             self._files_collection.find(projection=[self.last_updated_field])
             .sort([(self.last_updated_field, pymongo.DESCENDING)])
@@ -117,34 +126,49 @@ def last_updated(self):
         )
 
     @classmethod
-    def transform_criteria(cls, criteria):
+    def transform_criteria(cls, criteria: Dict) -> Dict:
         """
         Allow client to not need to prepend 'metadata.' to query fields.
         Args:
             criteria (dict): Query criteria
         """
-        for field in criteria:
+        new_criteria = dict(**criteria)
+        for field in new_criteria:
             if field not in cls.files_collection_fields and not field.startswith(
                 "metadata."
             ):
-                criteria["metadata." + field] = copy.copy(criteria[field])
-                del criteria[field]
+                new_criteria["metadata." + field] = copy.copy(new_criteria[field])
+                del new_criteria[field]
+
+        return new_criteria
 
-    def query(self, criteria=None, properties=None, **kwargs):
+    def query(
+        self,
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Dict]:
         """
-        Function that gets data from GridFS. This store ignores all
-        property projections as its designed for whole document access
+        Queries the GridFS Store for a set of documents
+        Currently ignores properties
+
+        TODO: If properties wholy in metadata, just query that
 
         Args:
-            criteria (dict): filter for query, matches documents
-                against key-value pairs
-            properties (list or dict): This will be ignored by the GridFS
-                Store
-            **kwargs (kwargs): further kwargs to Collection.find
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
         """
         if isinstance(criteria, dict):
-            self.transform_criteria(criteria)
-        for f in self.collection.find(filter=criteria, **kwargs):
+            criteria = self.transform_criteria(criteria)
+
+        for f in self._collection.find(
+            filter=criteria, skip=skip, limit=limit, sort=sort
+        ):
             data = f.read()
 
             metadata = f.metadata
@@ -160,7 +184,10 @@ def query(self, criteria=None, properties=None, **kwargs):
     def distinct(self, key, criteria=None, all_exist=False, **kwargs):
         """
         Function get to get all distinct values of a certain key in
-        a mongolike store.  May take a single key or a list of keys
+        a GridFs store.
+
+        Currently not implemented
+        TODO: If key in metadata or transform to metadata field
 
         Args:
             key (mongolike key or list of mongolike keys): key or keys
@@ -170,52 +197,35 @@ def distinct(self, key, criteria=None, all_exist=False, **kwargs):
                 in each document, defaults to False
             **kwargs (kwargs): kwargs corresponding to collection.distinct
         """
-        if isinstance(key, list):
-            criteria = criteria if criteria else {}
-            # Update to ensure keys are there
-            if all_exist:
-                criteria.update(
-                    {k: {"$exists": True} for k in key if k not in criteria}
-                )
-
-            results = []
-            for d in self.groupby(key, properties=key, criteria=criteria):
-                results.append(d["_id"])
-            return results
-
-        else:
-            if criteria:
-                self.transform_criteria(criteria)
-            # Transfor to metadata subfield if not supposed to be in gridfs main fields
-            if key not in self.files_collection_fields:
-                key = "metadata.{}".format(key)
-
-            return self._files_collection.distinct(key, filter=criteria, **kwargs)
+        raise Exception("Can't get distinct values of GridFS Store")
 
     def groupby(
-        self, keys, criteria=None, properties=None, allow_disk_use=True, **kwargs
-    ):
+        self,
+        keys: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Tuple[Dict, List[Dict]]]:
         """
         Simple grouping function that will group documents
         by keys.
 
         Args:
-            keys (list or string): fields to group documents
-            criteria (dict): filter for documents to group
-            properties (list): properties to return in grouped documents
-            allow_disk_use (bool): whether to allow disk use in aggregation
+            keys: fields to group documents
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
 
         Returns:
-            command cursor corresponding to grouped documents
-
-            elements of the command cursor have the structure:
-            {'_id': {"KEY_1": value_1, "KEY_2": value_2 ...,
-             'docs': [list_of_documents corresponding to key values]}
-
+            generator returning tuples of (dict, list of docs)
         """
         pipeline = []
         if criteria is not None:
-            self.transform_criteria(criteria)
+            criteria = self.transform_criteria(criteria)
             pipeline.append({"$match": criteria})
 
         if properties is not None:
@@ -237,11 +247,19 @@ def groupby(
         group_id = {key: "${}".format(key) for key in keys}
         pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
 
-        return self.collection.aggregate(pipeline, allowDiskUse=allow_disk_use)
+        for doc in self._collection.aggregate(pipeline, allowDiskUse=True):
+            yield (doc["_id"], doc["docs"])
 
-    def ensure_index(self, key, unique=False):
+    def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
         """
-        Wrapper for pymongo.Collection.ensure_index for the files collection
+        Tries to create an index and return true if it suceeded
+        Currently operators on the GridFS files collection
+        Args:
+            key: single key to index
+            unique: Whether or not this index contains only unique keys
+
+        Returns:
+            bool indicating if the index exists/was created
         """
         # Transform key for gridfs first
         if key not in self.files_collection_fields:
@@ -251,20 +269,26 @@ def ensure_index(self, key, unique=False):
             return True
         else:
             try:
-                self.collection.create_index(key, unique=unique, background=True)
+                self._collection.create_index(key, unique=unique, background=True)
                 return True
             except Exception:
                 return False
 
-    def update(self, docs, update_lu=True, key=None):
+    def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
         """
-        Function to update associated MongoStore collection.
+        Update documents into the Store
 
         Args:
-            docs ([dict]): list of documents
-            update_lu (bool) : Updat the last_updated field or not
-            key (list or str): list or str of important parameters
+            docs: the document or list of documents to update
+            key: field name(s) to determine uniqueness for a
+                 document, can be a list of multiple fields,
+                 a single field, or None if the Store's key
+                 field is to be used
         """
+
+        if not isinstance(docs, list):
+            docs = [docs]
+
         if isinstance(key, str):
             key = [key]
         elif not key:
@@ -273,12 +297,9 @@ def update(self, docs, update_lu=True, key=None):
         key = list(set(key) | self.meta_keys - set(self.files_collection_fields))
 
         for d in docs:
-
             search_doc = {k: d[k] for k in key}
-            if update_lu:
-                d[self.last_updated_field] = datetime.utcnow()
 
-            metadata = {self.last_updated_field: d[self.last_updated_field]}
+            metadata = {k: d[k] for k in [self.last_updated_field] if k in d}
             metadata.update(search_doc)
 
             data = json.dumps(jsanitize(d)).encode("UTF-8")
@@ -286,8 +307,8 @@ def update(self, docs, update_lu=True, key=None):
                 data = zlib.compress(data)
                 metadata["compression"] = "zlib"
 
-            self.collection.put(data, metadata=metadata)
-            self.transform_criteria(search_doc)
+            self._collection.put(data, metadata=metadata)
+            search_doc = self.transform_criteria(search_doc)
 
             # Cleans up old gridfs entries
             for fdoc in (
@@ -295,10 +316,24 @@ def update(self, docs, update_lu=True, key=None):
                 .sort("uploadDate", -1)
                 .skip(1)
             ):
-                self.collection.delete(fdoc["_id"])
+                self._collection.delete(fdoc["_id"])
+
+    def remove_docs(self, criteria: Dict):
+        """
+        Remove docs matching the query dictionary
+
+        Args:
+            criteria: query dictionary to match
+        """
+        if isinstance(criteria, dict):
+            criteria = self.transform_criteria(criteria)
+        ids = [cursor._id for cursor in self._collection.find(criteria)]
+
+        for id in ids:
+            self._collection.delete(id)
 
     def close(self):
-        self.collection.database.client.close()
+        self._collection.database.client.close()
 
 
 class StoreError(Exception):
diff --git a/maggma/stores/tests/test_gridfs.py b/maggma/stores/tests/test_gridfs.py
index 8d7523f01..13167a38f 100644
--- a/maggma/stores/tests/test_gridfs.py
+++ b/maggma/stores/tests/test_gridfs.py
@@ -17,14 +17,19 @@ def gridfsstore():
 def test_update(gridfsstore):
     data1 = np.random.rand(256)
     data2 = np.random.rand(256)
+    tic = datetime(2018, 4, 12, 16)
     # Test metadata storage
-    gridfsstore.update([{"task_id": "mp-1", "data": data1}])
+    gridfsstore.update(
+        [{"task_id": "mp-1", "data": data1, gridfsstore.last_updated_field: tic}]
+    )
     assert (
         gridfsstore._files_collection.find_one({"metadata.task_id": "mp-1"}) is not None
     )
 
     # Test storing data
-    gridfsstore.update([{"task_id": "mp-1", "data": data2}])
+    gridfsstore.update(
+        [{"task_id": "mp-1", "data": data2, gridfsstore.last_updated_field: tic}]
+    )
     assert len(list(gridfsstore.query({"task_id": "mp-1"}))) == 1
     assert "task_id" in gridfsstore.query_one({"task_id": "mp-1"})
     nptu.assert_almost_equal(
@@ -45,13 +50,33 @@ def test_update(gridfsstore):
     )
 
 
+def test_remove(gridfsstore):
+    data1 = np.random.rand(256)
+    data2 = np.random.rand(256)
+    tic = datetime(2018, 4, 12, 16)
+    gridfsstore.update(
+        [{"task_id": "mp-1", "data": data1, gridfsstore.last_updated_field: tic}]
+    )
+    gridfsstore.update(
+        [{"task_id": "mp-2", "data": data2, gridfsstore.last_updated_field: tic}]
+    )
+
+    assert gridfsstore.query_one(criteria={"task_id": "mp-1"})
+    assert gridfsstore.query_one(criteria={"task_id": "mp-2"})
+    gridfsstore.remove_docs({"task_id": "mp-1"})
+    assert gridfsstore.query_one(criteria={"task_id": "mp-1"}) is None
+    assert gridfsstore.query_one(criteria={"task_id": "mp-2"})
+
+
 def test_query(gridfsstore):
     data1 = np.random.rand(256)
     data2 = np.random.rand(256)
     tic = datetime(2018, 4, 12, 16)
-    gridfsstore.update([{"task_id": "mp-1", "data": data1}])
     gridfsstore.update(
-        [{"task_id": "mp-2", "data": data2, gridfsstore.last_updated_field: tic}], update_lu=False
+        [{"task_id": "mp-1", "data": data1, gridfsstore.last_updated_field: tic}]
+    )
+    gridfsstore.update(
+        [{"task_id": "mp-2", "data": data2, gridfsstore.last_updated_field: tic}]
     )
 
     doc = gridfsstore.query_one(criteria={"task_id": "mp-1"})

From 34737136fc08acdaaa59406b1a57553037fdb0ee Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Sat, 16 Nov 2019 11:56:36 -0800
Subject: [PATCH 43/99] add remove to aws

---
 maggma/stores/aws.py            | 56 ++++++++++++++++++++++++---------
 maggma/stores/tests/test_aws.py | 17 ++++++++--
 2 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/maggma/stores/aws.py b/maggma/stores/aws.py
index 9b1d8f1d4..c5fbadc4d 100644
--- a/maggma/stores/aws.py
+++ b/maggma/stores/aws.py
@@ -8,8 +8,11 @@
 
 from typing import Union, Optional, Dict, List, Iterator, Tuple
 
-from maggma.core import Store, Sort
 from monty.json import jsanitize
+from monty.dev import deprecated
+
+from maggma.core import Store, Sort
+from maggma.utils import grouper
 
 try:
     import boto3
@@ -26,12 +29,13 @@ class AmazonS3Store(Store):
     Assumes Amazon AWS key and secret key are set in environment or default config file
     """
 
-    def __init__(self, index, bucket, **kwargs):
+    def __init__(self, index: Store, bucket: str, compress: bool = False, **kwargs):
         """
         Initializes an S3 Store
         Args:
             index (Store): a store to use to index the S3 Bucket
             bucket (str) : name of the bucket
+            compress (bool): compress files inserted into the store
         """
         if not boto_import:
             raise ValueError(
@@ -39,6 +43,7 @@ def __init__(self, index, bucket, **kwargs):
             )
         self.index = index
         self.bucket = bucket
+        self.compress = compress
         self.s3 = None
         self.s3_bucket = None
         # Force the key to be the same as the index
@@ -67,7 +72,12 @@ def close(self):
         self.s3_bucket = None
 
     @property
+    @deprecated(message="This will be removed in the future")
     def collection(self):
+        """
+        Returns a handle to the pymongo collection object
+        Not guaranteed to exist in the future
+        """
         # For now returns the index collection since that is what we would "search" on
         return self.index
 
@@ -89,21 +99,24 @@ def query(
             skip: number documents to skip
             limit: limit on total number of documents returned
         """
-        for f in self.index.query(criteria=criteria, sort=sort, limit=limit, skip=skip):
+        for doc in self.index.query(
+            criteria=criteria, sort=sort, limit=limit, skip=skip
+        ):
             try:
                 # TODO : THis is ugly and unsafe, do some real checking before pulling data
-                data = self.s3_bucket.Object(f[self.key]).get()["Body"].read()
+                data = self.s3_bucket.Object(doc[self.key]).get()["Body"].read()
             except botocore.exceptions.ClientError as e:
                 # If a client error is thrown, then check that it was a 404 error.
                 # If it was a 404 error, then the object does not exist.
                 error_code = int(e.response["Error"]["Code"])
                 if error_code == 404:
-                    self.logger.error("Could not find S3 object {}".format(f[self.key]))
+                    self.logger.error(
+                        "Could not find S3 object {}".format(doc[self.key])
+                    )
                     break
 
-            if f.get("compression", "") == "zlib":
+            if doc.get("compression", "") == "zlib":
                 data = zlib.decompress(data)
-            print(data)
             yield json.loads(data)
 
     def distinct(
@@ -170,12 +183,7 @@ def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
         """
         return self.index.ensure_index(key, unique=unique, background=True)
 
-    def update(
-        self,
-        docs: Union[List[Dict], Dict],
-        key: Union[List, str, None] = None,
-        compress=True,
-    ):
+    def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
         """
         Update documents into the Store
 
@@ -185,7 +193,6 @@ def update(
                  document, can be a list of multiple fields,
                  a single field, or None if the Store's key
                  field is to be used
-            compress: compress the documents into the S3 bucket
         """
         search_docs = []
         search_keys = []
@@ -208,7 +215,7 @@ def update(
             data = json.dumps(jsanitize(d)).encode()
 
             # Compress with zlib if chosen
-            if compress:
+            if self.compress:
                 search_doc["compression"] = "zlib"
                 data = zlib.compress(data)
 
@@ -218,6 +225,25 @@ def update(
         # Use store's update to remove key clashes
         self.index.update(search_docs)
 
+    def remove_docs(self, criteria: Dict, remove_s3_object: bool = False):
+        """
+        Remove docs matching the query dictionary
+
+        Args:
+            criteria: query dictionary to match
+            remove_s3_object: whether to remove the actual S3 Object or not
+        """
+        if not remove_s3_object:
+            self.index.remove_docs(criteria=criteria)
+        else:
+            to_remove = self.index.distinct(self.key, criteria=criteria)
+            self.index.remove_docs(criteria=criteria)
+
+            # Can remove up to 1000 items at a time via boto
+            to_remove_chunks = list(grouper(to_remove, N=1000))
+            for chunk_to_remove in to_remove_chunks:
+                self.s3_bucket.delete_objects()
+
     @property
     def last_updated(self):
         return self.index.last_updated
diff --git a/maggma/stores/tests/test_aws.py b/maggma/stores/tests/test_aws.py
index b1cd09876..00b57398c 100644
--- a/maggma/stores/tests/test_aws.py
+++ b/maggma/stores/tests/test_aws.py
@@ -38,11 +38,24 @@ def test_qeuery(s3store):
 
 
 def test_update(s3store):
-    s3store.update([{"task_id": "mp-2", "data": "asd"}], compress=False)
+    s3store.update([{"task_id": "mp-2", "data": "asd"}])
     assert s3store.query_one({"task_id": "mp-2"}) is not None
 
-    s3store.update([{"task_id": "mp-4", "data": "asd"}], compress=True)
+    s3store.compress = True
+    s3store.update([{"task_id": "mp-4", "data": "asd"}])
     assert s3store.index.query_one({"task_id": "mp-4"})["compression"] == "zlib"
     assert s3store.query_one({"task_id": "mp-4"}) is not None
     assert s3store.query_one({"task_id": "mp-4"})["data"] == "asd"
 
+
+def test_remove(s3store):
+    s3store.update([{"task_id": "mp-2", "data": "asd"}])
+    s3store.update([{"task_id": "mp-4", "data": "asd"}])
+
+    assert s3store.query_one({"task_id": "mp-2"}) is not None
+    assert s3store.query_one({"task_id": "mp-4"}) is not None
+
+    s3store.remove_docs({"task_id": "mp-2"})
+
+    assert s3store.query_one({"task_id": "mp-2"}) is None
+    assert s3store.query_one({"task_id": "mp-4"}) is not None

From d32a1a5219cbaf72db746f52520081893fef8baf Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Sat, 16 Nov 2019 12:50:57 -0800
Subject: [PATCH 44/99] more type hints

---
 maggma/builders.py   | 12 ++++++------
 maggma/core/store.py |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 2a38dc1dc..291a8a8d0 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -8,7 +8,7 @@
 from maggma.utils import source_keys_updated, grouper, Timeout
 from time import time
 from maggma.core import Builder, Store
-from typing import Optional, Dict, List, Iterator
+from typing import Optional, Dict, List, Iterator, Union
 
 
 class MapBuilder(Builder, metaclass=ABCMeta):
@@ -206,7 +206,7 @@ class GroupBuilder(MapBuilder, metaclass=ABCMeta):
     it has a newer (by last_updated_field) doc than the corresponding (by key) target doc.
     """
 
-    def get_items(self):
+    def get_items(self) -> Iterator[Dict]:
         criteria = source_keys_updated(self.source, self.target, query=self.query)
         if all(isinstance(entry, str) for entry in self.grouping_properties()):
             properties = {entry: 1 for entry in self.grouping_properties()}
@@ -230,7 +230,7 @@ def get_items(self):
 
     @staticmethod
     @abstractmethod
-    def grouping_properties():
+    def grouping_properties() -> Union[List, Dict]:
         """
         Needed projection for docs_to_groups (passed to source.query).
 
@@ -244,14 +244,14 @@ def grouping_properties():
 
     @staticmethod
     @abstractmethod
-    def docs_to_groups(docs):
+    def docs_to_groups(docs: List[Dict]) -> Iterator:
         """
         Yield groups from (minimally-projected) documents.
 
         This could be as simple as returning a set of unique document keys.
 
         Args:
-            docs (pymongo.cursor.Cursor): documents with minimal projections
+            docs: documents with minimal projections
                 needed to determine groups.
 
         Returns:
@@ -259,7 +259,7 @@ def docs_to_groups(docs):
         """
 
     @abstractmethod
-    def group_to_items(self, group):
+    def group_to_items(self, group: Dict) -> Iterator:
         """
         Given a group, yield items for this builder's process_item method.
 
diff --git a/maggma/core/store.py b/maggma/core/store.py
index df56209f4..90cd91fa4 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -220,7 +220,7 @@ def distinct(
         return results
 
     @property
-    def last_updated(self):
+    def last_updated(self) -> datetime:
         """
         Provides the most recent last_updated date time stamp from
         the documents in this Store

From e89707411a16711b649c3b34f68d898ca0f1e61c Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Sat, 16 Nov 2019 15:39:22 -0800
Subject: [PATCH 45/99] add remove_docs to advanced stores

---
 maggma/stores/advanced_stores.py            | 28 ++++++++++++++++-
 maggma/stores/tests/test_advanced_stores.py | 35 ++++++++++++++++++++-
 2 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/maggma/stores/advanced_stores.py b/maggma/stores/advanced_stores.py
index 281579e40..b30651dea 100644
--- a/maggma/stores/advanced_stores.py
+++ b/maggma/stores/advanced_stores.py
@@ -75,7 +75,9 @@ def connect(self, force_reset: bool = False):
             self._collection = db[self.collection_name]
 
     def __hash__(self):
-        return hash((self.mongogrant_spec, self.collection_name, self.last_updated_field))
+        return hash(
+            (self.mongogrant_spec, self.collection_name, self.last_updated_field)
+        )
 
 
 class VaultStore(MongoStore):
@@ -270,6 +272,17 @@ def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = No
 
         self.store.update(docs, key=key)
 
+    def remove_docs(self, criteria: Dict):
+        """
+        Remove docs matching the query dictionary
+
+        Args:
+            criteria: query dictionary to match
+        """
+        # Update criteria and properties based on aliases
+        lazy_substitute(criteria, self.reverse_aliases)
+        self.store.remove_docs(criteria)
+
     def ensure_index(self, key, unique=False, **kwargs):
         if key in self.aliases:
             key = self.aliases
@@ -395,6 +408,19 @@ def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = No
 
         self.store.update(docs, key=key)
 
+    def remove_docs(self, criteria: Dict):
+        """
+        Remove docs matching the query dictionary
+
+        Args:
+            criteria: query dictionary to match
+        """
+        # Update criteria and properties based on aliases
+        criteria = (
+            dict(**criteria, **self.sbx_criteria) if criteria else self.sbx_criteria
+        )
+        self.store.remove_docs(criteria)
+
     def ensure_index(self, key, unique=False, **kwargs):
         return self.store.ensure_index(key, unique, **kwargs)
 
diff --git a/maggma/stores/tests/test_advanced_stores.py b/maggma/stores/tests/test_advanced_stores.py
index 9b3dc1009..6bc15f0e8 100644
--- a/maggma/stores/tests/test_advanced_stores.py
+++ b/maggma/stores/tests/test_advanced_stores.py
@@ -30,9 +30,9 @@
 from maggma.stores.advanced_stores import substitute
 
 
-
 @pytest.fixture("module")
 def mgrant_server():
+    # TODO: This is whacked code that starts a mongo server. How do we fix this?
     _, config_path = tempfile.mkstemp()
     _, mdlogpath = tempfile.mkstemp()
     mdpath = tempfile.mkdtemp()
@@ -217,6 +217,23 @@ def test_aliasing_update(alias_store):
     assert list(alias_store.store.query(criteria={"task_id": "mp-5"}))[0]["g"]["h"] == 6
 
 
+def test_aliasing_remove_docs(alias_store):
+
+    alias_store.update(
+        [
+            {"task_id": "mp-3", "a": 4},
+            {"task_id": "mp-4", "c": {"d": 5}},
+            {"task_id": "mp-5", "f": 6},
+        ]
+    )
+    assert alias_store.query_one(criteria={"task_id": "mp-3"})
+    assert alias_store.query_one(criteria={"task_id": "mp-4"})
+    assert alias_store.query_one(criteria={"task_id": "mp-5"})
+
+    alias_store.remove_docs({"a": 4})
+    assert alias_store.query_one(criteria={"task_id": "mp-3"}) is None
+
+
 def test_aliasing_substitute(alias_store):
     aliases = {"a": "b", "c.d": "e", "f": "g.h"}
 
@@ -279,3 +296,19 @@ def test_sandbox_update(sandbox_store):
     assert sandbox_store.collection.find_one({"e": 6})["sbxn"] == ["test"]
     sandbox_store.update([{"e": 7, "sbxn": ["core"]}], key="e")
     assert set(sandbox_store.query_one(criteria={"e": 7})["sbxn"]) == {"test", "core"}
+
+
+def test_sandbox_remove_docs(sandbox_store):
+    sandbox_store.connect()
+    sandbox_store.update([{"e": 6, "d": 4}], key="e")
+    sandbox_store.update([{"e": 7, "sbxn": ["core"]}], key="e")
+
+    assert sandbox_store.query_one(criteria={"d": {"$exists": 1}}, properties=["d"])
+    assert sandbox_store.query_one(criteria={"e": 7})
+    sandbox_store.remove_docs(criteria={"d": 4})
+
+    assert (
+        sandbox_store.query_one(criteria={"d": {"$exists": 1}}, properties=["d"])
+        is None
+    )
+    assert sandbox_store.query_one(criteria={"e": 7})

From b89cce0f8bb888624de73e1ad8f0eefc627add24 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Sat, 16 Nov 2019 17:56:44 -0800
Subject: [PATCH 46/99] Update compound stores

---
 maggma/stores/compound_stores.py            | 185 +++++++++++++-------
 maggma/stores/tests/test_compound_stores.py |  47 ++---
 2 files changed, 145 insertions(+), 87 deletions(-)

diff --git a/maggma/stores/compound_stores.py b/maggma/stores/compound_stores.py
index 6e684be3b..d32fb68dd 100644
--- a/maggma/stores/compound_stores.py
+++ b/maggma/stores/compound_stores.py
@@ -1,8 +1,12 @@
 from typing import List, Iterator, Tuple, Optional, Union, Dict
+from datetime import datetime
+from itertools import groupby
 from pydash import get, set_
 from pymongo import MongoClient
+from monty.dev import deprecated
 from maggma.core import Store, Sort
 from maggma.stores import MongoStore
+from operator import itemgetter
 
 
 class JointStore(Store):
@@ -43,9 +47,10 @@ def connect(self, force_reset: bool = False):
         )
 
     def close(self):
-        self.collection.database.client.close()
+        self._collection.database.client.close()
 
     @property
+    @deprecated("This will be removed in the future")
     def collection(self):
         return self._collection
 
@@ -58,7 +63,7 @@ def last_updated(self):
         lus = []
         for cname in self.collection_names:
             lu = MongoStore.from_collection(
-                self.collection.database[cname],
+                self._collection.database[cname],
                 last_updated_field=self.last_updated_field,
             ).last_updated
             lus.append(lu)
@@ -69,33 +74,7 @@ def update(self, docs, update_lu=True, key=None, **kwargs):
         raise NotImplementedError("No update method for JointStore")
 
     def _get_store_by_name(self, name):
-        return MongoStore.from_collection(self.collection.database[name])
-
-    def distinct(
-        self,
-        field: Union[List[str], str],
-        criteria: Optional[Dict] = None,
-        all_exist: bool = False,
-    ) -> List:
-        """
-        Get all distinct values for a key
-
-        Args:
-            field: the field(s) to get distinct values for
-            criteria : PyMongo filter for documents to search in
-            all_exist : ensure all fields exist for the distinct set
-        """
-        g_field = field if isinstance(field, list) else [field]
-        if all_exist:
-            criteria = criteria or {}
-            criteria.update(
-                {k: {"$exists": True} for k in g_field if k not in criteria}
-            )
-        cursor = self.groupby(g_field, criteria=criteria)
-        if isinstance(field, list):
-            return [d[0] for d in cursor]
-        else:
-            return [get(d[0], field) for d in cursor]
+        return MongoStore.from_collection(self._collection.database[name])
 
     def ensure_index(self, key, unique=False, **kwargs):
         raise NotImplementedError("No ensure_index method for JointStore")
@@ -237,11 +216,20 @@ def query_one(self, criteria=None, properties=None, **kwargs):
         except StopIteration:
             return None
 
+    def remove_docs(self, criteria: Dict):
+        """
+        Remove docs matching the query dictionary
+
+        Args:
+            criteria: query dictionary to match
+        """
+        raise NotImplementedError("No remove_docs method for JointStore")
+
 
 class ConcatStore(Store):
     """Store concatting multiple stores"""
 
-    def __init__(self, *stores, **kwargs):
+    def __init__(self, *stores: Store, **kwargs):
         """
         Initialize a ConcatStore that concatenates multiple stores together
         to appear as one store
@@ -249,7 +237,7 @@ def __init__(self, *stores, **kwargs):
         self.stores = stores
         super(ConcatStore, self).__init__(**kwargs)
 
-    def connect(self, force_reset=False):
+    def connect(self, force_reset: bool = False):
         """
         Connect all stores in this ConcatStore
         Args:
@@ -267,11 +255,12 @@ def close(self):
             store.close()
 
     @property
+    @deprecated
     def collection(self):
         raise NotImplementedError("No collection property for ConcatStore")
 
     @property
-    def last_updated(self):
+    def last_updated(self) -> datetime:
         """
         Finds the most recent last_updated across all the stores.
         This might not be the most usefull way to do this for this type of Store
@@ -284,53 +273,105 @@ def last_updated(self):
             lus.append(lu)
         return max(lus)
 
-    # TODO: implement update?
-    def update(self, docs, update_lu=True, key=None, **kwargs):
-        raise NotImplementedError("No update method for JointStore")
+    def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
+        """
+        Update documents into the Store
+        Not implemented in ConcatStore
 
-    def distinct(self, key, criteria=None, all_exist=True, **kwargs):
+        Args:
+            docs: the document or list of documents to update
+            key: field name(s) to determine uniqueness for a
+                 document, can be a list of multiple fields,
+                 a single field, or None if the Store's key
+                 field is to be used
         """
-        Return all distinct values for a key within the stores
+        raise NotImplementedError("No update method for ConcatStore")
+
+    def distinct(
+        self,
+        field: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        all_exist: bool = False,
+    ) -> Union[List[Dict], List]:
+        """
+        Get all distinct values for a field(s)
+        For a single field, this returns a list of values
+        For multiple fields, this return a list of of dictionaries for each unique combination
+
         Args:
-            key (str): key to find distinct values
-            criteria (dict): criteria dictionary to reduce the documents to search on
-            all_exist (bool): ensure the key exists in the doc or not
+            field: the field(s) to get distinct values for
+            criteria : PyMongo filter for documents to search in
+            all_exist : ensure all fields exist for the distinct set
         """
         distincts = []
         for store in self.stores:
-            distincts.extend(store.distinct(key, criteria, all_exist, **kwargs))
-        return list(set(distincts))
+            distincts.extend(
+                store.distinct(field=field, criteria=criteria, all_exist=all_exist)
+            )
 
-    def ensure_index(self, key, unique=False, **kwargs):
+        if isinstance(field, str):
+            return list(set(distincts))
+        else:
+            return [dict(s) for s in set(frozenset(d.items()) for d in distincts)]
+
+    def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
         """
         Ensure an index is properly set. Returns whether all stores support this index or not
         Args:
-            key (str or [str]): single key or list of keys to group by
+            key: single key to index
+            unique: Whether or not this index contains only unique keys
+
+        Returns:
+            bool indicating if the index exists/was created on all stores
         """
-        return all([store.ensure_index(key, unique, **kwargs) for store in self.stores])
+        return all([store.ensure_index(key, unique) for store in self.stores])
 
-    def query(self, criteria=None, properties=None, **kwargs):
+    def query(
+        self,
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Dict]:
         """
-        Queries across all the stores.
+        Queries across all Store for a set of documents
+
         Args:
-            criteria (dict): mongo style query to reduce the docs to group
-            properties (str or [str]): properties to project
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
         """
+        # TODO: skip, sort and limit are broken. implement properly
         for store in self.stores:
-            for d in store.query(criteria=criteria, properties=properties, **kwargs):
+            for d in store.query(criteria=criteria, properties=properties):
                 yield d
 
-    def query_one(self, criteria=None, properties=None, **kwargs):
-        return next(self.query(criteria=criteria, properties=properties, **kwargs))
-
-    def groupby(self, keys, criteria=None, properties=None, **kwargs):
+    def groupby(
+        self,
+        keys: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        properties: Union[Dict, List, None] = None,
+        sort: Optional[Dict[str, Sort]] = None,
+        skip: int = 0,
+        limit: int = 0,
+    ) -> Iterator[Tuple[Dict, List[Dict]]]:
         """
-        Group documents by a key. This version is highly inefficient since it performs
-        post-grouping in python across all of its stores
+        Simple grouping function that will group documents
+        by keys.
+
         Args:
-            keys (str or [str]): single key or list of keys to group by
-            criteria (dict): mongo style query to reduce the docs to group
-            properties (str or [str]): properties to project
+            keys: fields to group documents
+            criteria : PyMongo filter for documents to search in
+            properties: properties to return in grouped documents
+            sort: Dictionary of sort order for fields
+            skip: number documents to skip
+            limit: limit on total number of documents returned
+
+        Returns:
+            generator returning tuples of (dict, list of docs)
         """
         if isinstance(keys, str):
             keys = [keys]
@@ -338,15 +379,31 @@ def groupby(self, keys, criteria=None, properties=None, **kwargs):
         docs = []
         for store in self.stores:
             temp_docs = list(
-                store.groupby(keys, criteria=criteria, properties=properties, **kwargs)
+                store.groupby(
+                    keys,
+                    criteria=criteria,
+                    properties=properties,
+                    sort=sort,
+                    skip=skip,
+                    limit=limit,
+                )
             )
             for group in temp_docs:
-                docs.extend(group["docs"])
+                docs.extend(group[1])
 
         def key_set(d):
             "index function based on passed in keys"
-            test_d = tuple(d.get(k, "") for k in keys)
+            test_d = tuple(d.get(k, None) for k in keys)
             return test_d
 
-        for k, group in groupby(docs, key=key_set):
-            yield list(group)
+        for k, group in groupby(sorted(docs, key=key_set), key=key_set):
+            yield k, list(group)
+
+    def remove_docs(self, criteria: Dict):
+        """
+        Remove docs matching the query dictionary
+
+        Args:
+            criteria: query dictionary to match
+        """
+        raise NotImplementedError("No remove_docs method for JointStore")
diff --git a/maggma/stores/tests/test_compound_stores.py b/maggma/stores/tests/test_compound_stores.py
index b2c023189..60e054b24 100644
--- a/maggma/stores/tests/test_compound_stores.py
+++ b/maggma/stores/tests/test_compound_stores.py
@@ -1,7 +1,7 @@
 import pytest
 from pydash import get
 from datetime import datetime
-from maggma.core import StoreError
+from itertools import chain
 from maggma.stores import MongoStore, MemoryStore, JointStore, ConcatStore
 
 
@@ -9,8 +9,8 @@
 def jointstore():
     store = JointStore("maggma_test", ["test1", "test2"])
     store.connect()
-    store.collection.drop()
-    store.collection.insert_many(
+    store._collection.drop()
+    store._collection.insert_many(
         [
             {
                 "task_id": k,
@@ -21,8 +21,8 @@ def jointstore():
             for k in range(10)
         ]
     )
-    store.collection.database["test2"].drop()
-    store.collection.database["test2"].insert_many(
+    store._collection.database["test2"].drop()
+    store._collection.database["test2"].insert_many(
         [
             {
                 "task_id": 2 * k,
@@ -92,13 +92,12 @@ def test_joint_store_query_one(jointstore):
 
 
 def test_joint_store_distinct(jointstore):
-    dyour_prop = jointstore.distinct("test2.your_prop")
-    print(dyour_prop)
-    assert set(dyour_prop) == {k + 3 for k in range(5)}
-    dmy_prop = jointstore.distinct("my_prop")
-    assert set(dmy_prop) == {k + 1 for k in range(10)}
-    dmy_prop_cond = jointstore.distinct("my_prop", {"test2.your_prop": {"$gte": 5}})
-    assert set(dmy_prop_cond), {5, 7 == 9}
+    your_prop = jointstore.distinct("test2.your_prop")
+    assert set(your_prop) == {k + 3 for k in range(5)}
+    my_prop = jointstore.distinct("my_prop")
+    assert set(my_prop) == {k + 1 for k in range(10)}
+    my_prop_cond = jointstore.distinct("my_prop", {"test2.your_prop": {"$gte": 5}})
+    assert set(my_prop_cond), {5, 7 == 9}
 
 
 def test_joint_store_last_updated(jointstore, jointstore_test1, jointstore_test2):
@@ -140,6 +139,16 @@ def test_joint_store_groupby(jointstore):
     assert len(zero_docs[1]) == 3
 
 
+def test_joint_update(jointstore):
+    with pytest.raises(NotImplementedError):
+        jointstore.update({})
+
+
+def test_joint_remove_docs(jointstore):
+    with pytest.raises(NotImplementedError):
+        jointstore.remove_docs({})
+
+
 @pytest.fixture
 def concat_store():
     mem_stores = [MemoryStore(str(i)) for i in range(4)]
@@ -149,18 +158,18 @@ def concat_store():
     index = 0
 
     props = {i: str(i) for i in range(10)}
-    for store in mem_stores:
+    for mem_store in mem_stores:
         docs = [
             {"task_id": i, "prop": props[i - index], "index": index}
             for i in range(index, index + 10)
         ]
         index = index + 10
-        store.update(docs)
+        mem_store.update(docs)
     return store
 
 
-@pytest.fixture
 def test_concat_store_distinct(concat_store):
+    print(type(concat_store))
     docs = list(concat_store.distinct("task_id"))
     actual_docs = list(
         chain.from_iterable(
@@ -171,14 +180,6 @@ def test_concat_store_distinct(concat_store):
     assert set(docs) == set(actual_docs)
 
 
-@pytest.fixture
-def test_concat_store_not_implemented(concat_store):
-    # Ensure collection property and update throw errors
-    with pytest.raises(NotImplementedError):
-        concat_store.collection
-        concat_store.update([])
-
-
 def test_concat_store_groupby(concat_store):
     assert len(list(concat_store.groupby("index"))) == 4
     assert len(list(concat_store.groupby("task_id"))) == 40

From 90a745177e93c04a0d96cdbabfcbdb4b241f3715 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Sun, 17 Nov 2019 08:46:36 -0800
Subject: [PATCH 47/99] move test

---
 maggma/cli/{ => tests}/test_mrun.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename maggma/cli/{ => tests}/test_mrun.py (100%)

diff --git a/maggma/cli/test_mrun.py b/maggma/cli/tests/test_mrun.py
similarity index 100%
rename from maggma/cli/test_mrun.py
rename to maggma/cli/tests/test_mrun.py

From ecc1fbf69488c10531261b6a501aa8a9e44590db Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 20 Nov 2019 09:11:58 -0800
Subject: [PATCH 48/99] xfail bad test

---
 maggma/stores/tests/test_compound_stores.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/maggma/stores/tests/test_compound_stores.py b/maggma/stores/tests/test_compound_stores.py
index 60e054b24..f2d38762b 100644
--- a/maggma/stores/tests/test_compound_stores.py
+++ b/maggma/stores/tests/test_compound_stores.py
@@ -91,6 +91,7 @@ def test_joint_store_query_one(jointstore):
     assert doc["task_id"] == 8
 
 
+@pytest.mark.xfail(reason="key grouping appears to make lists")
 def test_joint_store_distinct(jointstore):
     your_prop = jointstore.distinct("test2.your_prop")
     assert set(your_prop) == {k + 3 for k in range(5)}

From c0699140d742828907d9a5888016ebb69f04cb8f Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 20 Nov 2019 09:12:48 -0800
Subject: [PATCH 49/99] add name property

---
 maggma/core/store.py             |  7 +++++++
 maggma/stores/advanced_stores.py | 12 ++++++++++++
 maggma/stores/aws.py             |  6 ++++++
 maggma/stores/compound_stores.py | 12 ++++++++++++
 maggma/stores/gridfs.py          |  6 ++++++
 maggma/stores/mongolike.py       | 10 ++++++++--
 6 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/maggma/core/store.py b/maggma/core/store.py
index 90cd91fa4..ca8464b83 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -72,6 +72,13 @@ def collection(self):
         """
         pass
 
+    @abstractproperty
+    def name(self) -> str:
+        """
+        Return a string representing this data source
+        """
+        pass
+
     @abstractmethod
     def connect(self, force_reset: bool = False):
         """
diff --git a/maggma/stores/advanced_stores.py b/maggma/stores/advanced_stores.py
index b30651dea..6efdc6e93 100644
--- a/maggma/stores/advanced_stores.py
+++ b/maggma/stores/advanced_stores.py
@@ -160,6 +160,12 @@ def __init__(self, store: Store, aliases: Dict, **kwargs):
         )
         super(AliasingStore, self).__init__(**kwargs)
 
+    def name(self) -> str:
+        """
+        Return a string representing this data source
+        """
+        return self.store.name
+
     def query(
         self,
         criteria: Optional[Dict] = None,
@@ -320,6 +326,12 @@ def __init__(self, store: Store, sandbox: str, exclusive: bool = False):
             validator=self.store.validator,
         )
 
+    def name(self) -> str:
+        """
+        Return a string representing this data source
+        """
+        return self.store.name
+
     @property
     def sbx_criteria(self) -> Dict:
         """
diff --git a/maggma/stores/aws.py b/maggma/stores/aws.py
index c5fbadc4d..9023fda4e 100644
--- a/maggma/stores/aws.py
+++ b/maggma/stores/aws.py
@@ -50,6 +50,12 @@ def __init__(self, index: Store, bucket: str, compress: bool = False, **kwargs):
         kwargs["key"] = index.key
         super(AmazonS3Store, self).__init__(**kwargs)
 
+    def name(self) -> str:
+        """
+        Return a string representing this data source
+        """
+        return self.bucket
+
     def connect(self, force_reset: bool = False):
         """
         Connect to the source data
diff --git a/maggma/stores/compound_stores.py b/maggma/stores/compound_stores.py
index d32fb68dd..0e4e30546 100644
--- a/maggma/stores/compound_stores.py
+++ b/maggma/stores/compound_stores.py
@@ -36,6 +36,12 @@ def __init__(
         self.kwargs = kwargs
         super(JointStore, self).__init__(**kwargs)
 
+    def name(self) -> str:
+        """
+        Return a string representing this data source
+        """
+        return self.master
+
     def connect(self, force_reset: bool = False):
         conn = MongoClient(self.host, self.port)
         db = conn[self.database]
@@ -237,6 +243,12 @@ def __init__(self, *stores: Store, **kwargs):
         self.stores = stores
         super(ConcatStore, self).__init__(**kwargs)
 
+    def name(self) -> str:
+        """
+        Return a string representing this data source
+        """
+        return self.stores[0].name
+
     def connect(self, force_reset: bool = False):
         """
         Connect all stores in this ConcatStore
diff --git a/maggma/stores/gridfs.py b/maggma/stores/gridfs.py
index c3ee4c72e..2879d7122 100644
--- a/maggma/stores/gridfs.py
+++ b/maggma/stores/gridfs.py
@@ -81,6 +81,12 @@ def __init__(
             kwargs["key"] = "_id"
         super().__init__(**kwargs)
 
+    def name(self) -> str:
+        """
+        Return a string representing this data source
+        """
+        return self.collection_name
+
     def connect(self, force_reset: bool = False):
         """
         Connect to the source data
diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index a32a59ea3..c9d1ca3c6 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -62,6 +62,12 @@ def __init__(
         self.kwargs = kwargs
         super().__init__(**kwargs)
 
+    def name(self) -> str:
+        """
+        Return a string representing this data source
+        """
+        return self.collection_name
+
     def connect(self, force_reset: bool = False):
         """
         Connect to the source data
@@ -71,10 +77,10 @@ def connect(self, force_reset: bool = False):
             db = conn[self.database]
             if self.username != "":
                 db.authenticate(self.username, self.password)
-            self._collection = db[self._collection_name]
+            self._collection = db[self.collection_name]
 
     def __hash__(self):
-        return hash((self.database, self._collection_name, self.last_updated_field))
+        return hash((self.database, self.collection_name, self.last_updated_field))
 
     @classmethod
     def from_db_file(cls, filename: str):

From c7297c7f1761ab6f791e42fbb57ba1f5db849de5 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 20 Nov 2019 09:13:07 -0800
Subject: [PATCH 50/99] misc bugs

---
 maggma/stores/compound_stores.py            |  3 +-
 maggma/stores/mongolike.py                  | 39 +--------------------
 maggma/stores/tests/test_advanced_stores.py |  2 +-
 maggma/stores/tests/test_mongolike.py       |  6 ++--
 4 files changed, 7 insertions(+), 43 deletions(-)

diff --git a/maggma/stores/compound_stores.py b/maggma/stores/compound_stores.py
index 0e4e30546..91df35f6d 100644
--- a/maggma/stores/compound_stores.py
+++ b/maggma/stores/compound_stores.py
@@ -1,12 +1,11 @@
 from typing import List, Iterator, Tuple, Optional, Union, Dict
 from datetime import datetime
 from itertools import groupby
-from pydash import get, set_
+from pydash import set_
 from pymongo import MongoClient
 from monty.dev import deprecated
 from maggma.core import Store, Sort
 from maggma.stores import MongoStore
-from operator import itemgetter
 
 
 class JointStore(Store):
diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index c9d1ca3c6..6a8f0be54 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -53,7 +53,7 @@ def __init__(
             password: Password to connect with
         """
         self.database = database
-        self._collection_name = collection_name
+        self.collection_name = collection_name
         self.host = host
         self.port = port
         self.username = username
@@ -319,43 +319,6 @@ def groupby(
             for val, group in groupby(sorted(input_data, key=grouper), grouper):
                 yield {keys[0]: val}, list(group)
 
-    def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
-        """
-        Update documents into the Store
-
-        Args:
-            docs: the document or list of documents to update
-            key: field name(s) to determine uniqueness for a
-                 document, can be a list of multiple fields,
-                 a single field, or None if the Store's key
-                 field is to be used
-        """
-
-        for d in docs:
-
-            d = jsanitize(d, allow_bson=True)
-
-            # document-level validation is optional
-            validates = True
-            if self.validator:
-                validates = self.validator.is_valid(d)
-                if not validates:
-                    if self.validator.strict:
-                        raise ValueError(self.validator.validation_errors(d))
-                    else:
-                        self.logger.error(self.validator.validation_errors(d))
-
-            if validates:
-                if isinstance(key, list):
-                    search_doc = {k: d[k] for k in key}
-                elif key:
-                    search_doc = {key: d[key]}
-                else:
-                    search_doc = {self.key: d[self.key]}
-
-                self._collection.replace_one(
-                    filter=search_doc, replacement=d, upsert=True
-                )
 
 
 class JSONStore(MemoryStore):
diff --git a/maggma/stores/tests/test_advanced_stores.py b/maggma/stores/tests/test_advanced_stores.py
index 6bc15f0e8..ecec08375 100644
--- a/maggma/stores/tests/test_advanced_stores.py
+++ b/maggma/stores/tests/test_advanced_stores.py
@@ -60,10 +60,10 @@ def mgrant_server():
     # Yields the fixture to use
     yield config_path, mdport, dbname
 
-    os.remove(config_path)
     if not (os.getenv("CONTINUOUS_INTEGRATION") and os.getenv("TRAVIS")):
         os.killpg(os.getpgid(mongod_process.pid), signal.SIGTERM)
         os.waitpid(mongod_process.pid, 0)
+    os.remove(config_path)
     shutil.rmtree(mdpath)
     os.remove(mdlogpath)
 
diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
index 38a2d770e..291312e57 100644
--- a/maggma/stores/tests/test_mongolike.py
+++ b/maggma/stores/tests/test_mongolike.py
@@ -103,9 +103,11 @@ def test_mongostore_remove_docs(mongostore):
     assert len(list(mongostore.query({"a": 4}))) == 1
     assert len(list(mongostore.query({"a": 1}))) == 0
 
+
 def test_mongostore_from_db_file(mongostore, db_json):
     ms = MongoStore.from_db_file(db_json)
-    assert ms._collection_name == "tmp"
+    ms.connect()
+    assert ms._collection.full_name == "maggma_tests.tmp"
 
 
 def test_mongostore_from_collection(mongostore, db_json):
@@ -113,7 +115,7 @@ def test_mongostore_from_collection(mongostore, db_json):
     ms.connect()
 
     other_ms = MongoStore.from_collection(ms._collection)
-    assert ms._collection_name == other_ms._collection_name
+    assert ms._collection.full_name == other_ms._collection.full_name
     assert ms.database == other_ms.database
 
 

From 1046be4cab0848aa469116abced24013398921dc Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 20 Nov 2019 09:13:15 -0800
Subject: [PATCH 51/99] upgrade to pytest

---
 maggma/tests/test_utils.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/maggma/tests/test_utils.py b/maggma/tests/test_utils.py
index 0f4c52647..34b332113 100644
--- a/maggma/tests/test_utils.py
+++ b/maggma/tests/test_utils.py
@@ -1,33 +1,33 @@
 # coding: utf-8
 """
-Tests utilities
+Tests for builders
 """
-import unittest
+import pytest
 
 from maggma.utils import recursive_update, Timeout
 from time import sleep
 
-class UtilsTests(unittest.TestCase):
 
-    def test_recursiveupdate(self):
-        d = {"a": {"b": 3}, "c": [4]}
+def test_recursiveupdate():
+    d = {"a": {"b": 3}, "c": [4]}
 
-        recursive_update(d, {"c": [5]})
-        self.assertEqual(d["c"], [5])
+    recursive_update(d, {"c": [5]})
+    assert d["c"] ==  [5]
 
-        recursive_update(d, {"a": {"b": 5}})
-        self.assertEqual(d["a"]["b"], 5)
+    recursive_update(d, {"a": {"b": 5}})
+    assert d["a"]["b"] ==  5
 
-        recursive_update(d, {"a": {"b": [6]}})
-        self.assertEqual(d["a"]["b"], [6])
+    recursive_update(d, {"a": {"b": [6]}})
+    assert d["a"]["b"] ==  [6]
 
-        recursive_update(d, {"a": {"b": [7]}})
-        self.assertEqual(d["a"]["b"], [7])
+    recursive_update(d, {"a": {"b": [7]}})
+    assert d["a"]["b"] ==  [7]
 
-    def test_timeout(self):
+def test_timeout():
 
-        def takes_too_long():
-            with Timeout(seconds=1):
-                sleep(2)
+    def takes_too_long():
+        with Timeout(seconds=1):
+            sleep(2)
+    with pytest.raises(TimeoutError):
+        takes_too_long()
 
-        self.assertRaises(TimeoutError, takes_too_long)

From 80effc7025484d00b0421c4710674e23a660f3f2 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 20 Nov 2019 09:13:42 -0800
Subject: [PATCH 52/99] add retry failed

---
 maggma/builders.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 291a8a8d0..5c4321937 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -31,6 +31,7 @@ def __init__(
         delete_orphans: bool = False,
         timeout: int = 0,
         store_process_time: bool = True,
+        retry_failed: bool = False,
         **kwargs
     ):
         """
@@ -47,7 +48,9 @@ def __init__(
                 after all updates, during Builder.finalize.
             timeout: maximum running time per item in seconds
             store_process_time: If True, add "_process_time" key to
-            document for profiling purposes
+                document for profiling purposes
+            retry_failed: If True, will retry building documents that
+                previously failed
         """
         self.source = source
         self.target = target
@@ -58,6 +61,7 @@ def __init__(
         self.total = None
         self.timeout = timeout
         self.store_process_time = store_process_time
+        self.retry_failed = retry_failed
         super().__init__(sources=[source], targets=[target], **kwargs)
 
     def ensure_indexes(self):
@@ -67,6 +71,7 @@ def ensure_indexes(self):
             self.source.ensure_index(self.source.last_updated_field),
             self.target.ensure_index(self.target.key),
             self.target.ensure_index(self.target.last_updated_field),
+            self.target.ensure_index("state"),
         ]
 
         if not all(index_checks):
@@ -99,6 +104,12 @@ def get_items(self):
 
         self.ensure_indexes()
 
+        temp_query = dict(**self.query) if self.query else {}
+        if self.retry_failed:
+            temp_query.pop("state", None)
+        else:
+            temp_query["state"] = {"$ne": "failed"}
+
         keys = self.target.newer_in(self.source, criteria=self.query, exhaustive=True)
 
         self.logger.info("Processing {} items".format(len(keys)))
@@ -134,9 +145,10 @@ def process_item(self, item: Dict):
         try:
             with Timeout(seconds=self.timeout):
                 processed = self.unary_function(item)
+                processed.update({"state": "successful"})
         except Exception as e:
             self.logger.error(traceback.format_exc())
-            processed = {"error": str(e)}
+            processed = {"error": str(e), "state": "failed"}
 
         time_end = time()
 
@@ -173,7 +185,7 @@ def update_targets(self, items: List[Dict]):
         if len(items) > 0:
             target.update(items)
 
-    def finalize(self, cursor=None):
+    def finalize(self):
         if self.delete_orphans:
             source_keyvals = set(self.source.distinct(self.source.key))
             target_keyvals = set(self.target.distinct(self.target.key))
@@ -183,7 +195,7 @@ def finalize(self, cursor=None):
                     "Finalize: Deleting {} orphans.".format(len(to_delete))
                 )
             self.target.remove_docs({self.target.key: {"$in": to_delete}})
-        super().finalize(cursor)
+        super().finalize()
 
     @abstractmethod
     def unary_function(self, item):
@@ -276,5 +288,5 @@ def group_to_items(self, group: Dict) -> Iterator:
 class CopyBuilder(MapBuilder):
     """Sync a source store with a target store."""
 
-    def unary_function(item):
+    def unary_function(self,item):
         return item

From 5aac8aa9324888ae5ea2956f831a184c39acc6cc Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 20 Nov 2019 09:34:38 -0800
Subject: [PATCH 53/99] update travis

---
 .travis.yml               | 4 ++--
 requirements-optional.txt | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 4e741b5a9..75aad5402 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,8 +28,8 @@ before_script:
     --noauth --bind_ip_all --fork
   - cd -
 script:
-  - mpiexec -n 2  python $PWD/maggma/tests/mpi_test.py
-  - pytest --cov=maggma/
+#  - mpiexec -n 2  python $PWD/maggma/tests/mpi_test.py
+  - pytest --cov=maggma
 after_success:
   - coveralls
 notifications:
diff --git a/requirements-optional.txt b/requirements-optional.txt
index 680734d79..e19cf29db 100644
--- a/requirements-optional.txt
+++ b/requirements-optional.txt
@@ -1,5 +1,6 @@
 invoke==1.0.0
-nose==1.3.4
+pytest==5.2.2
+pytest-cov==2.8.1
 mpi4py==3.0.0
 numpy==1.15.3
 python-coveralls==2.9.1

From 693dcedd179f470ac1c6b92126cf34d0530c8aef Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 20 Nov 2019 10:25:03 -0800
Subject: [PATCH 54/99] coverage settings

---
 .coveragerc               | 2 ++
 requirements-optional.txt | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)
 create mode 100644 .coveragerc

diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 000000000..b4fd218f9
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,2 @@
+[run]
+omit = *test*
\ No newline at end of file
diff --git a/requirements-optional.txt b/requirements-optional.txt
index e19cf29db..6702ca1fd 100644
--- a/requirements-optional.txt
+++ b/requirements-optional.txt
@@ -7,4 +7,5 @@ python-coveralls==2.9.1
 sphinx==1.7.5
 sphinx_rtd_theme==0.4.0
 twine==1.9.1
-wheel==0.31.1
\ No newline at end of file
+wheel==0.31.1
+moto==1.3.13
\ No newline at end of file

From 15c4f10d9a78d9ba9a796c841e0a0ece666d76ba Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Mon, 9 Dec 2019 09:42:16 -0800
Subject: [PATCH 55/99] switch to utcnow

---
 maggma/tests/test_builders.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/maggma/tests/test_builders.py b/maggma/tests/test_builders.py
index 53498906f..6c597c184 100644
--- a/maggma/tests/test_builders.py
+++ b/maggma/tests/test_builders.py
@@ -29,7 +29,7 @@ def target():
 
 @pytest.fixture("module")
 def now():
-    return datetime.now()
+    return datetime.utcnow()
 
 
 @pytest.fixture
@@ -104,7 +104,7 @@ def test_delete_orphans(source, target, old_docs, new_docs):
 
 
 def test_incremental_false(source, target, old_docs, new_docs):
-    tic = datetime.now()
+    tic = datetime.utcnow()
     toc = tic + timedelta(seconds=1)
     keys = list(range(20))
     earlier = [{"lu": tic, "k": k, "v": "val"} for k in keys]

From a0eabc619f2b4904bc35fa15697280783caabe35 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 11 Dec 2019 14:22:38 -0800
Subject: [PATCH 56/99] bug fixes

---
 maggma/core/builder.py | 2 +-
 maggma/core/store.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/maggma/core/builder.py b/maggma/core/builder.py
index db3c48316..b4a46d777 100644
--- a/maggma/core/builder.py
+++ b/maggma/core/builder.py
@@ -132,7 +132,7 @@ def run(self):
             ]
             self.update_targets(processed_items)
 
-        self.finalize(cursor)
+        self.finalize()
 
     def __getstate__(self):
         return self.as_dict()
diff --git a/maggma/core/store.py b/maggma/core/store.py
index ca8464b83..1e4fc3ed1 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -291,7 +291,7 @@ def newer_in(
 
             new_keys = set(target_dates.keys()) - set(dates.keys())
             updated_keys = {
-                key for key, date in dates.items() if target_dates[key] > date
+                key for key, date in dates.items() if target_dates.get(key,datetime.min) > date
             }
 
             return list(new_keys | updated_keys)

From 793cdb5ccf1d41beee9f69daafe25461b5429bee Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 11 Dec 2019 14:22:58 -0800
Subject: [PATCH 57/99] filter corrupts the iterator depending on data type

---
 maggma/utils.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/maggma/utils.py b/maggma/utils.py
index 1a36835d8..4d7db351e 100644
--- a/maggma/utils.py
+++ b/maggma/utils.py
@@ -126,10 +126,6 @@ def grouper(iterable, n, fillvalue=None):
     # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
     args = [iter(iterable)] * n
     iterator = itertools.zip_longest(*args, fillvalue=fillvalue)
-
-    if fillvalue is None:
-        iterator = filter(None.__ne__, iterator)
-
     return iterator
 
 

From efaa982dba340e55c6301eb66038fff320283f34 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 11 Dec 2019 14:23:08 -0800
Subject: [PATCH 58/99] remove smoque

---
 requirements.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 045412ef8..a8fae8da1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 pymongo==3.7.2
 mongomock==3.13.0
 monty==1.0.3
-smoqe==0.1.3
 pyyaml>=4.2b1
 pydash==4.7.3
 jsonschema==2.6.0

From 5dea233965065b4bb16dbec768d6188a7a7b98b9 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 11 Dec 2019 14:23:25 -0800
Subject: [PATCH 59/99] use utcnow instead of now

---
 maggma/stores/tests/test_mongolike.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
index 291312e57..b1af29592 100644
--- a/maggma/stores/tests/test_mongolike.py
+++ b/maggma/stores/tests/test_mongolike.py
@@ -121,13 +121,13 @@ def test_mongostore_from_collection(mongostore, db_json):
 
 def test_mongostore_last_updated(mongostore):
     assert mongostore.last_updated == datetime.min
-    start_time = datetime.now()
+    start_time = datetime.utcnow()
     mongostore._collection.insert_one({mongostore.key: 1, "a": 1})
     with pytest.raises(StoreError) as cm:
         mongostore.last_updated
     assert cm.match(mongostore.last_updated_field)
     mongostore.update(
-        [{mongostore.key: 1, "a": 1, mongostore.last_updated_field: datetime.now()}]
+        [{mongostore.key: 1, "a": 1, mongostore.last_updated_field: datetime.utcnow()}]
     )
     assert mongostore.last_updated > start_time
 
@@ -140,7 +140,7 @@ def test_mongostore_newer_in(mongostore):
 
     target.update(
         [
-            {mongostore.key: i, mongostore.last_updated_field: datetime.now()}
+            {mongostore.key: i, mongostore.last_updated_field: datetime.utcnow()}
             for i in range(10)
         ]
     )
@@ -148,7 +148,7 @@ def test_mongostore_newer_in(mongostore):
     # Update docs in source
     mongostore.update(
         [
-            {mongostore.key: i, mongostore.last_updated_field: datetime.now()}
+            {mongostore.key: i, mongostore.last_updated_field: datetime.utcnow()}
             for i in range(10)
         ]
     )

From 05ffd974ad2cc0f5c91ef520d73bd25ffad96c22 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 09:57:53 -0800
Subject: [PATCH 60/99] new basic CLI

---
 maggma/cli/__init__.py        | 80 ++++++++++++++++++++++++++++++++++-
 maggma/cli/multiprocessing.py | 66 +++++++++++++++++++++++++++++
 maggma/cli/serial.py          | 30 +++++++++++++
 maggma/cli/utils.py           | 13 ++++++
 requirements.txt              |  6 +--
 5 files changed, 191 insertions(+), 4 deletions(-)
 create mode 100644 maggma/cli/multiprocessing.py
 create mode 100644 maggma/cli/serial.py
 create mode 100644 maggma/cli/utils.py

diff --git a/maggma/cli/__init__.py b/maggma/cli/__init__.py
index 576f56f87..dad9e84bc 100644
--- a/maggma/cli/__init__.py
+++ b/maggma/cli/__init__.py
@@ -1 +1,79 @@
-# coding: utf-8
\ No newline at end of file
+#!/usr/bin/env python
+# coding utf-8
+
+
+import logging
+import click
+import asyncio
+from itertools import chain
+from monty.serialization import loadfn
+from maggma.utils import TqdmLoggingHandler
+from maggma.cli.serial import serial
+from maggma.cli.multiprocessing import multi
+
+
+""""
+mrun script1
+mrun script1 script2 script3
+mrun -n  32 script1 script2
+
+
+
+
+
+mrun master -N 4 sciprt1 script2 <-- have to deploy workers
+mrun worker -n 32 127.0.0.1:70001
+mrun worker -n 32 127.0.0.1:70001
+mrun worker -n 32 127.0.0.1:70001
+
+
+mrun master -N 4 script1 script 2
+mpirun -N 4 mrun worker -n 32 script1 script 2
+
+
+
+"""
+
+
+@click.command()
+@click.argument("builders", nargs=-1, type=click.Path(exists=True))
+@click.option(
+    "-v",
+    "--verbose",
+    "verbosity",
+    count=True,
+    help="Controls logging level per number of v's",
+    default=0,
+)
+@click.option(
+    "-n",
+    "--num-workers",
+    "num_workers",
+    help="Number of worker processes. Defaults to single processing",
+    default=1,
+    type=click.IntRange(1),
+)
+def run(builders, verbosity, num_workers):
+
+    # Set Logging
+    levels = [logging.WARNING, logging.INFO, logging.DEBUG]
+    level = levels[min(len(levels) - 1, verbosity)]  # capped to number of levels
+    root = logging.getLogger()
+    root.setLevel(level)
+    ch = TqdmLoggingHandler()
+    formatter = logging.Formatter(
+        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    )
+    ch.setFormatter(formatter)
+    root.addHandler(ch)
+
+    builders = [loadfn(b) for b in builders]
+    builders = [b if isinstance(b, list) else [b] for b in builders]
+    builders = list(chain.from_iterable(builders))
+
+    if num_workers == 1:
+        for builder in builders:
+            serial(builder)
+    else:
+        for builder in builders:
+            asyncio.run(multi(builder, num_workers))
diff --git a/maggma/cli/multiprocessing.py b/maggma/cli/multiprocessing.py
new file mode 100644
index 000000000..063528cf4
--- /dev/null
+++ b/maggma/cli/multiprocessing.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# coding utf-8
+
+import asyncio
+from asyncio import BoundedSemaphore
+from aioitertools import zip_longest
+from concurrent.futures import ProcessPoolExecutor
+from maggma.utils import tqdm
+
+
+class AsyncBackPressuredMap:
+    """
+    Wrapper for an iterator to provide
+    async access with backpressure
+    """
+
+    def __init__(self, iterator, builder, executor):
+        self.iterator = iter(iterator)
+        self.process = builder.process_item
+        self.executor = executor
+        self.back_pressure = BoundedSemaphore(builder.chunk_size)
+
+    async def __aiter__(self):
+        return self
+
+    async def __anext__(self):
+        await self.back_pressure.acquire()
+        loop = asyncio.get_running_loop()
+
+        try:
+            item = next(self.iterator)
+        except StopIteration:
+            raise StopAsyncIteration
+
+        async def process_and_release():
+            future = loop.run_in_executor(self.executor, self.process, item)
+            await future
+            self.back_pressure.release()
+            return future
+
+        return process_and_release()
+
+
+async def grouper(iterable, n, fillvalue=None):
+    """
+    Collect data into fixed-length chunks or blocks.
+    """
+    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
+    args = [iterable] * n
+    iterator = zip_longest(*args, fillvalue=fillvalue)
+
+    async for group in iterator:
+        group = [g for g in group if g is not None]
+        yield group
+
+
+async def multi(builder, num_workers):
+    builder.connect()
+    cursor = builder.get_items()
+    executor = ProcessPoolExecutor(num_workers)
+    mapper = AsyncBackPressuredMap(tqdm(cursor, desc="Get"), builder, executor)
+
+    async for chunk in grouper(mapper, builder.chunk_size, fillvalue=None):
+        chunk = await asyncio.gather(*chunk)
+        processed_items = [c.result() for c in chunk if chunk is not None]
+        builder.update_targets(processed_items)
diff --git a/maggma/cli/serial.py b/maggma/cli/serial.py
new file mode 100644
index 000000000..ca696f85d
--- /dev/null
+++ b/maggma/cli/serial.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# coding utf-8
+
+import logging
+from tqdm import tqdm
+
+from maggma.utils import grouper
+from maggma.core import Builder
+
+
+def serial(builder: Builder):
+    """
+    Runs the builders using a single process
+    """
+
+    logger = logging.getLogger("SerialProcessor")
+
+    builder.connect()
+
+    cursor = builder.get_items()
+
+    for chunk in grouper(tqdm(cursor), builder.chunk_size):
+        logger.info("Processing batch of {} items".format(builder.chunk_size))
+        processed_items = [
+            builder.process_item(item) for item in chunk if item is not None
+        ]
+        builder.update_targets(processed_items)
+
+    builder.finalize()
+
diff --git a/maggma/cli/utils.py b/maggma/cli/utils.py
new file mode 100644
index 000000000..8d4fcab2c
--- /dev/null
+++ b/maggma/cli/utils.py
@@ -0,0 +1,13 @@
+from typing import List
+from maggma.core import Builder
+
+
+def get_build_order(builders: List[Builder]) -> List[Builder]:
+    """
+    Returns a list of builders in the order they should run to satisfy
+    dependencies
+
+    TODO: For now just do dumb in order since builders should be 
+    written to just run over and over again
+    """
+    return builders
diff --git a/requirements.txt b/requirements.txt
index a8fae8da1..6464c5dd2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-pymongo==3.7.2
-mongomock==3.13.0
-monty==1.0.3
+pymongo==3.9.0
+mongomock==3.18.0
+monty==3.0.2
 pyyaml>=4.2b1
 pydash==4.7.3
 jsonschema==2.6.0

From c6862c579e15c41579d81ccc4b023ab500e937f4 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 09:58:00 -0800
Subject: [PATCH 61/99] remove old mrun

---
 maggma/cli/mrun.py | 61 ----------------------------------------------
 1 file changed, 61 deletions(-)
 delete mode 100644 maggma/cli/mrun.py

diff --git a/maggma/cli/mrun.py b/maggma/cli/mrun.py
deleted file mode 100644
index e923c1203..000000000
--- a/maggma/cli/mrun.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/env python
-# coding utf-8
-
-from maggma.runner import Runner
-from maggma.builders import Builder
-from monty.serialization import loadfn
-import argparse
-import logging
-import sys
-from maggma.utils import TqdmLoggingHandler
-
-
-def main():
-    parser = argparse.ArgumentParser(description="mrun is a script to run builders written using the Maggma framework.")
-    parser.add_argument(
-        "builder",
-        help="Builder file in either json or yaml format. Can contain a list of builders or a predefined Runner")
-    parser.add_argument(
-        "-n",
-        "--num_workers",
-        type=int,
-        default=0,
-        help="Number of worker processes. Defaults to use as many as available.")
-    parser.add_argument('-v', '--verbose', action='count', default=0, help="Controls logging level per number of v's")
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        default=False,
-        help="Dry run loading the builder file. Does not run the builders")
-    parser.add_argument("--mpi", action="store_true", default=False, help="Running under MPI")
-    args = parser.parse_args()
-
-    # Set Logging
-    levels = [logging.WARNING, logging.INFO, logging.DEBUG]
-    level = levels[min(len(levels) - 1, args.verbose)]  # capped to number of levels
-    root = logging.getLogger()
-    root.setLevel(level)
-    ch = TqdmLoggingHandler()
-    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
-    ch.setFormatter(formatter)
-    root.addHandler(ch)
-
-    objects = loadfn(args.builder)
-
-    if isinstance(objects, list):
-        # If this is a list of builders
-        runner = Runner(objects, max_workers=args.num_workers, mpi=args.mpi)
-    elif isinstance(objects, Runner):
-        # This is a runner:
-        root.info("Changing number of workers from default in input file")
-        runner = Runner(objects.builders, args.num_workers, mpi=args.mpi)
-    elif isinstance(objects, Builder):
-        runner = Runner([objects], args.num_workers, mpi=args.mpi)
-        root.error("Couldn't properly read the builder file.")
-
-    if not args.dry_run:
-        runner.run()
-
-
-if __name__ == "__main__":
-    main()

From bb01e8ef87ae9ce5ebd9300311a5e98ed21beac2 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 09:58:15 -0800
Subject: [PATCH 62/99] remove old tests

---
 maggma/tests/mpi_test.py    |  48 -------
 maggma/tests/test_runner.py | 241 ------------------------------------
 2 files changed, 289 deletions(-)
 delete mode 100644 maggma/tests/mpi_test.py
 delete mode 100644 maggma/tests/test_runner.py

diff --git a/maggma/tests/mpi_test.py b/maggma/tests/mpi_test.py
deleted file mode 100644
index 9af017f4e..000000000
--- a/maggma/tests/mpi_test.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# coding: utf-8
-"""
-MPI Tests for MPI Processor
-"""
-import sys
-import logging
-import numpy as np
-from maggma.builders import Builder
-from maggma.stores import MemoryStore
-from maggma.runner import MPIProcessor
-
-
-class DummyBuilder(Builder):
-
-    def __init__(self, temp_storage_store):
-        self.temp_storage_store = temp_storage_store
-        super(DummyBuilder, self).__init__(sources=[], targets=[temp_storage_store],chunk_size=100)
-
-    def get_items(self):
-        self.logger.info("Getting Items")
-        for i in range(1000):
-            yield {"val": i, "task_id": i}
-
-    def process_item(self, item):
-        if item["val"] % 10 == 0:
-            self.logger.debug("Processing: {}".format(item["val"]))
-        proc_val = np.sqrt(np.square(float(item["val"])))
-        item["proc_val"] = proc_val
-        return item
-
-    def update_targets(self, items):
-        self.logger.info("Updating {} items".format(len(items)))
-        self.temp_storage_store.update(items)
-
-if __name__ == "__main__":
-
-    root = logging.getLogger()
-    root.setLevel(logging.DEBUG)
-    ch = logging.StreamHandler(sys.stdout)
-    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
-    ch.setFormatter(formatter)
-    root.addHandler(ch)
-
-    mem = MemoryStore("processed")
-    bldr = DummyBuilder(mem)
-
-    mpi_proc = MPIProcessor([bldr])
-    mpi_proc.process(0)
diff --git a/maggma/tests/test_runner.py b/maggma/tests/test_runner.py
deleted file mode 100644
index 8a1729086..000000000
--- a/maggma/tests/test_runner.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# coding: utf-8
-"""
-Tests for the Runner class
-"""
-import unittest
-from unittest.mock import patch, MagicMock
-
-from maggma.runner import Runner, SerialProcessor, MultiprocProcessor, MPIProcessor
-
-__author__ = 'Shyam Dwaraknath'
-__email__ = 'shyamd@lbl.gov'
-
-
-class TestRunner(unittest.TestCase):
-    def test_1(self):
-        builder1 = MagicMock()
-        builder2 = MagicMock()
-
-        builder1.configure_mock(sources=[1, 2, 3], targets=[4])
-        builder2.configure_mock(sources=[3, 4, 5], targets=[6])
-        self.builders = [builder1, builder2]
-
-        rnr = Runner(self.builders)
-        self.assertEqual(rnr.dependency_graph, {1: [0]})
-
-
-class TestSerialProcessor(unittest.TestCase):
-    def test_process(self):
-
-        builder = MagicMock()
-        builder.configure_mock(chunk_size=10)
-        builder.get_items.return_value = range(10)
-        builder.process_item.side_effect = range(10, 20)
-
-        proc = SerialProcessor([builder])
-
-        proc.process(0)
-
-        builder.get_items.assert_called()
-        self.assertEqual(builder.process_item.call_count, 10)
-        builder.update_targets.assert_called()
-
-
-class TestMultiprocProcessor(unittest.TestCase):
-    def setUp(self):
-        builder = MagicMock()
-        builder.configure_mock(chunk_size=10)
-        builder.get_items.return_value = iter(range(10))
-        builder.process_item.side_effect = range(10, 20)
-        builder.from_dict.return_value = {}
-
-        self.builder = builder
-
-    def test_init(self):
-        proc = MultiprocProcessor([], 3)
-        self.assertEqual(proc.max_workers, 3)
-
-    def test_setup_multithreading(self):
-        
-        with patch("maggma.runner.Thread") as mock_thread:
-            proc = MultiprocProcessor([self.builder], max_workers=3)
-            proc.builder = proc.builders[0]
-            proc.setup_multithreading()
-            mock_thread.assert_called()
-
-    def test_update_targets(self):
-
-        proc = MultiprocProcessor([self.builder], max_workers=3)
-
-        proc.builder = self.builder
-        proc.update_data_condition = MagicMock()
-        proc.data = MagicMock()
-        proc.update_pbar = MagicMock()
-        proc.run_update_targets = MagicMock()
-        proc.run_update_targets.__bool__.side_effect = [True,True,True,False]
-
-        proc.update_targets()
-        proc.run_update_targets.__bool__.assert_called()
-        proc.data.clear.assert_called()
-        proc.update_data_condition.wait_for.assert_called()
-        proc.builder.update_targets.assert_called()
-
-    def test_update_data_callback(self):
-
-        proc = MultiprocProcessor([self.builder], max_workers=3)
-
-        future = MagicMock()
-        proc.data = MagicMock()
-        proc.task_count = MagicMock()
-        proc.update_data_condition = MagicMock()
-        proc.process_pbar = MagicMock()
-        proc.update_data_callback(future)
-
-        future.result.assert_called()
-        proc.update_data_condition.notify_all.assert_called()
-        proc.task_count.release.assert_called()
-
-    def test_clean_up_data(self):
-
-        proc = MultiprocProcessor([self.builder], max_workers=3)
-
-        proc.data = MagicMock()
-        proc.update_data_condition = MagicMock()
-        proc.builder = MagicMock()
-        proc.update_targets_thread = MagicMock()
-
-        proc.clean_up_data()
-
-        proc.update_data_condition.notify_all.assert_called()
-        proc.update_targets_thread.join.assert_called()
-
-    def test_put_tasks(self):
-
-        with patch("maggma.runner.ProcessPoolExecutor") as mock_executor:
-
-            mock_exec_obj = mock_executor()
-            proc = MultiprocProcessor([self.builder], max_workers=3)
-            proc.builder = MagicMock()
-            proc.task_count = MagicMock()
-            cursor = [True,True,True,False]
-            proc.get_pbar = cursor
-            proc.put_tasks()
-            proc.task_count.acquire.assert_called()
-
-
-class TestMPIProcessor(unittest.TestCase):
-    def setUp(self):
-        builder = MagicMock()
-        builder.configure_mock(chunk_size=10)
-        builder.get_items.return_value = iter(range(10))
-        builder.process_item.side_effect = range(10, 20)
-        builder.from_dict.return_value = {}
-        builder.setup_pbars([])
-
-        self.builder = builder
-        self.get_mpi_patcher = patch("maggma.runner.get_mpi")
-        self.get_mpi = self.get_mpi_patcher.start()
-        self.comm = MagicMock()
-        self.get_mpi.return_value = self.comm, 0, 2  # comm, rank , size
-
-    def tearDown(self):
-        self.get_mpi.stop()
-
-    def test_init(self):
-        proc = MPIProcessor([self.builder])
-        self.comm.barrier.assert_called()
-
-    def test_setup_multithreading(self):
-
-        with patch("maggma.runner.Thread") as mock_thread:
-            proc = MPIProcessor([self.builder])
-            proc.builder = proc.builders[0]
-            proc.setup_multithreading()
-            mock_thread.assert_called()
-
-    def test_update_targets(self):
-
-        proc = MPIProcessor([self.builder])
-
-        proc.builder = self.builder
-        proc.update_data_condition = MagicMock()
-        proc.data = MagicMock()
-        proc.run_update_targets = MagicMock()
-        proc.run_update_targets.__bool__.side_effect = [True,True,True,False]
-        proc.setup_pbars([])
-
-        proc.update_targets()
-        proc.run_update_targets.__bool__.assert_called()
-        proc.data.clear.assert_called()
-        proc.update_data_condition.wait_for.assert_called()
-        proc.builder.update_targets.assert_called()
-
-    def test_clean_up_data(self):
-
-        proc = MPIProcessor([self.builder])
-
-        proc.data = MagicMock()
-        proc.update_data_condition = MagicMock()
-        proc.builder = MagicMock()
-        proc.update_targets_thread = MagicMock()
-
-        proc.clean_up_data()
-
-        proc.update_data_condition.notify_all.assert_called()
-        proc.update_targets_thread.join.assert_called()
-
-    def test_clean_up_workers(self):
-        proc = MPIProcessor([self.builder])
-
-        proc.clean_up_workers()
-        self.comm.send.assert_called()
-        self.assertEqual(self.comm.send.call_count, 1)
-
-    def test_submit_item(self):
-
-        proc = MPIProcessor([self.builder])
-        proc.ranks = MagicMock()
-        proc.update_data_condition = MagicMock()
-        proc.data = MagicMock()
-        proc.task_count = MagicMock()
-        proc.setup_pbars([])
-        self.comm.recv.return_value = {"type": "return", "return": "data"}
-
-        proc.submit_item(0, {})
-
-        self.comm.recv.assert_called()
-        proc.update_data_condition.__enter__.assert_called()
-        proc.data.append.assert_called()
-        proc.update_data_condition.notify_all.assert_called()
-
-        proc.ranks.append.assert_called()
-
-    def test_put_tasks(self):
-        with patch("maggma.runner.ThreadPoolExecutor") as mock_executor:
-            proc = MPIProcessor([self.builder])
-            proc.builder = MagicMock()
-            proc.task_count = MagicMock()
-            cursor = [True,True,True,False]
-
-
-            proc.setup_pbars(cursor)
-            proc.put_tasks(0)
-            proc.task_count.acquire.assert_called()
-            mock_executor.return_value.__enter__.assert_called()
-            proc.task_count.acquire.assert_called()
-            mock_executor.return_value.__enter__.return_value.submit.assert_called()
-
-    def test_proccess_worker(self):
-        proc = MPIProcessor([self.builder])
-
-        self.comm.recv.side_effect = [{"type": "process", "builder_id": 0, "data": ""}, {"type": "shutdown"}]
-
-        proc.process_worker()
-
-        self.comm.recv.assert_called()
-        self.comm.send.assert_called()
-        self.builder.process_item.assert_called()
-
-
-if __name__ == "__main__":
-    unittest.main()

From 11d41c08c69b795f9b1e2c4adad12a6cbcc1c1a7 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 09:58:34 -0800
Subject: [PATCH 63/99] test prechunk

---
 maggma/tests/test_builders.py | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/maggma/tests/test_builders.py b/maggma/tests/test_builders.py
index 6c597c184..ee99ddcf9 100644
--- a/maggma/tests/test_builders.py
+++ b/maggma/tests/test_builders.py
@@ -103,17 +103,11 @@ def test_delete_orphans(source, target, old_docs, new_docs):
     assert target.query_one(criteria={"k": 10})["v"] == "old"
 
 
-def test_incremental_false(source, target, old_docs, new_docs):
-    tic = datetime.utcnow()
-    toc = tic + timedelta(seconds=1)
-    keys = list(range(20))
-    earlier = [{"lu": tic, "k": k, "v": "val"} for k in keys]
-    later = [{"lu": toc, "k": k, "v": "val"} for k in keys]
-    source.update(earlier)
-    target.update(later)
-    query = {"k": {"$gt": 5}}
-    builder = CopyBuilder(source, target, incremental=False, query=query)
-    builder.run()
-    docs = sorted(target.query(), key=lambda d: d["k"])
-    assert (all(d["lu"] == tic) for d in docs[5:])
-    assert (all(d["lu"] == toc) for d in docs[:5])
+def test_prechunk(source, target, old_docs,new_docs):
+    builder = CopyBuilder(source, target, delete_orphans=True)
+    source.update(old_docs)
+    source.update(new_docs)
+
+    chunk_queries = list(builder.prechunk(2))
+    assert len(chunk_queries) == 2
+    assert chunk_queries[0] ==  {'k': {'$in': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}}

From 083e5b1af52506b255ee0c276699f70d0ee4dcf6 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 09:58:42 -0800
Subject: [PATCH 64/99] fix prechunk

---
 maggma/builders.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 5c4321937..91bd68596 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -2,11 +2,12 @@
 """
 Base Builder class to define how builders need to be defined
 """
-from abc import ABCMeta, abstractmethod
 import traceback
+from abc import ABCMeta, abstractmethod
+from time import time
+from math import ceil
 from datetime import datetime
 from maggma.utils import source_keys_updated, grouper, Timeout
-from time import time
 from maggma.core import Builder, Store
 from typing import Optional, Dict, List, Iterator, Union
 
@@ -26,7 +27,6 @@ def __init__(
         source: Store,
         target: Store,
         query: Optional[Dict] = None,
-        incremental: bool = True,
         projection: Optional[List] = None,
         delete_orphans: bool = False,
         timeout: int = 0,
@@ -91,7 +91,8 @@ def prechunk(self, number_splits: int) -> Iterator[Dict]:
         self.ensure_indexes()
         keys = self.target.newer_in(self.source, criteria=self.query, exhaustive=True)
 
-        for split in grouper(keys, number_splits):
+        N = ceil(len(keys) / number_splits)
+        for split in grouper(keys, N):
             yield {self.source.key: {"$in": list(filter(None.__ne__, split))}}
 
     def get_items(self):

From ddc91f682725e5d91c9530ddc534f97ddc53f9ca Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 09:58:56 -0800
Subject: [PATCH 65/99] setup click CLI

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6cc2ee523..5be7d8549 100644
--- a/setup.py
+++ b/setup.py
@@ -46,7 +46,7 @@
             "Topic :: Database :: Front-Ends",
             "Topic :: Scientific/Engineering",
         ],
-        entry_points={"console_scripts": ["mrun = maggma.cli.mrun:main"]},
+        entry_points={"console_scripts": ["mrun = maggma.cli:run"]},
         tests_require=["pytest"],
         python_requires=">=3.7",
     )

From 6ca35d552aa2387ac9965f55f8cdf80305f4a992 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 09:59:44 -0800
Subject: [PATCH 66/99] remove runner

---
 maggma/runner.py | 516 -----------------------------------------------
 1 file changed, 516 deletions(-)
 delete mode 100644 maggma/runner.py

diff --git a/maggma/runner.py b/maggma/runner.py
deleted file mode 100644
index 17499ba87..000000000
--- a/maggma/runner.py
+++ /dev/null
@@ -1,516 +0,0 @@
-# coding: utf-8
-"""
-Module defining objects to run builders in various modes
-including serial processing, multiprocessing on a single computer,
-and processing via MPI
-"""
-
-import abc
-import logging
-import multiprocessing
-import types
-from collections import defaultdict, deque
-from threading import Thread, Condition, BoundedSemaphore
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
-from monty.json import MSONable
-from maggma.utils import get_mpi, grouper, primed
-
-# import tqdm Jupyter widget if running inside Jupyter
-try:
-    # noinspection PyUnresolvedReferences
-    if get_ipython().__class__.__name__ == 'ZMQInteractiveShell':
-        from tqdm import tqdm_notebook as tqdm
-    else: # likely 'TerminalInteractiveShell'
-        from tqdm import tqdm
-except NameError:
-    from tqdm import tqdm
-
-class BaseProcessor(MSONable, metaclass=abc.ABCMeta):
-    """
-    Base processor class for multiprocessing paradigms
-    """
-
-    def __init__(self, builders):
-        """
-        Initialize with a list of builders
-
-        Args:
-            builders(list): list of builders
-        """
-        self.builders = builders
-
-        self.logger = logging.getLogger(type(self).__name__)
-        self.logger.addHandler(logging.NullHandler())
-
-    @abc.abstractmethod
-    def process(self, builder_id):
-        """
-        Does the processing. e.g. send work to workers(in MPI) or start the processes in
-        multiprocessing.
-
-        Args:
-            builder_id (int): process the builder_id th builder i.e
-                process_item --> update_targets --> finalize
-        """
-        pass
-
-
-class SerialProcessor(BaseProcessor):
-    """
-    Simple serial processor. Usefull for debugging or example code
-    """
-
-    def process(self, builder_id):
-        """
-        Run the builder serially
-
-        Args:
-            builder_id (int): the index of the builder in the builders list
-        """
-        builder = self.builders[builder_id]
-        chunk_size = builder.chunk_size
-
-        # establish connection to the sources and targets
-        builder.connect()
-
-        cursor = builder.get_items()
-
-        for chunk in grouper(cursor, chunk_size):
-            self.logger.info("Processing batch of {} items".format(chunk_size))
-            processed_items = [builder.process_item(item) for item in chunk if item is not None]
-            builder.update_targets(processed_items)
-
-        builder.finalize(cursor)
-
-
-class MPIProcessor(BaseProcessor):
-    """
-    Processor to distribute work using MPI
-    """
-
-    def __init__(self, builders):
-        (self.comm, self.rank, self.size) = get_mpi()
-        if not self.comm:
-            raise Exception(
-                "MPI not working properly, check your mpi4py installation and ensure this is running under mpi")
-        self.comm.barrier()
-        super(MPIProcessor, self).__init__(builders)
-
-    def process(self, builder_id):
-        """
-        Run the builder using MPI protocol.
-
-        Args:
-            builder_id (int): the index of the builder in the builders list
-        """
-        self.comm.barrier()
-        if self.rank == 0:
-            self.process_master(builder_id)
-        else:
-            self.process_worker()
-
-    def setup_multithreading(self):
-        """
-        Setup structures for managing data to/from MPI Workers
-        """
-        self.data = deque()
-        self.ranks = deque([i + 1 for i in range(self.size - 1)])
-        self.task_count = BoundedSemaphore(self.builder.chunk_size)
-        self.update_data_condition = Condition()
-
-        self.run_update_targets = True
-        self.update_targets_thread = Thread(target=self.update_targets)
-        self.update_targets_thread.start()
-
-    def process_master(self, builder_id):
-        """
-        Master process for MPI processing
-        Handles Data IO to Stores and to MPI Workers
-        """
-        self.builder = self.builders[builder_id]
-        self.builder.connect()
-
-        cursor = self.builder.get_items()
-
-        self.setup_pbars(cursor)
-        self.setup_multithreading()
-        self.put_tasks(builder_id)
-        self.clean_up_workers()
-        self.clean_up_data()
-        self.builder.finalize(cursor)
-        self.cleanup_pbars()
-
-    def process_worker(self):
-        """
-        MPI Worker process
-        """
-        is_valid = True
-
-        while is_valid:
-            packet = self.comm.recv(source=0)
-            if packet["type"] == "process":
-                builder_id = packet["builder_id"]
-                data = packet["data"]
-                try:
-                    result = self.builders[builder_id].process_item(data)
-                    self.comm.send({"type": "return", "return": result}, dest=0)
-                except e:
-                    self.comm.send({"type": "error", "error": e})
-            elif packet["type"] == "shutdown":
-                is_valid = False
-
-    def setup_pbars(self, cursor):
-        """
-        Sets up progress bars
-        """
-        total = None
-        if isinstance(cursor, types.GeneratorType):
-            cursor = primed(cursor)
-            if hasattr(self.builder, "total"):
-                total = self.builder.total
-        elif hasattr(cursor, "__len__"):
-            total = len(cursor)
-        elif hasattr(cursor, "count"):
-            total = cursor.count()
-
-        self.get_pbar = tqdm(cursor, desc="Get Items", total=total)
-        self.process_pbar = tqdm(desc="Processing Item", total=total)
-        self.update_pbar = tqdm(desc="Updating Targets", total=total)
-
-    def cleanup_pbars(self):
-        """
-        Cleans up the TQDM bars
-        """
-        self.get_pbar.close()
-        self.process_pbar.close()
-        self.update_pbar.close()
-
-    def put_tasks(self, builder_id):
-        """
-        Submit tasks from cursor to MPI workers
-        """
-        # 1.) Setup thread pool
-        with ThreadPoolExecutor(max_workers=self.size - 1) as executor:
-            # 2.) Loop over every item wrapped in a tqdm bar
-            for item in self.get_pbar:
-                # 3.) Limit total number of queued tasks using a semaphore
-                self.task_count.acquire()
-                # 4.) Submit the item to a worker
-                f = executor.submit(self.submit_item, builder_id, item)
-
-    def submit_item(self, builder_id, data):
-        """
-        Thread to submit an item to MPI Workers and get data back
-
-        """
-
-        # 1.) Find free rank and take it
-        mpi_rank = self.ranks.pop()
-        # 2.) Submit the job to that rank
-        self.comm.send({"type": "process", "builder_id": builder_id, "data": data}, dest=mpi_rank)
-        # 3.) Periodically poll for data back
-        result = None
-        while not result:
-            packet = self.comm.recv(source=mpi_rank)
-            if packet["type"] == "return":
-                result = packet["return"]
-                self.task_count.release()
-            elif packet["type"] == "error":
-                self.logger.error("MPI Rank {} Errored on Builder ID {}:\n{}".format(
-                    mpi_rank, builder_id, packet["error"]))
-                self.task_count.release()
-                return
-            else:
-                self.task_count.release()
-                return  # don't know what happened here, just quit
-
-        # 6.) Update process progress bar
-        self.process_pbar.update(1)
-
-        # 7.) Save data
-        with self.update_data_condition:
-            self.data.append(result)
-            self.update_data_condition.notify_all()
-        # 8.) Return rank
-        self.ranks.append(mpi_rank)
-
-    def clean_up_workers(self):
-        """
-        Sends shutdown signal to all MPI workers
-        """
-        for i in range(self.size - 1):
-            self.comm.send({"type": "shutdown"}, dest=i + 1)
-
-    def clean_up_data(self):
-        """
-        Call back to add data into a list in thread safe manner and signal other threads to add more tasks or update_targets
-        """
-        self.logger.debug("Cleaning up data queue")
-        try:
-            with self.update_data_condition:
-                self.run_update_targets = False
-                self.update_data_condition.notify_all()
-        except Exception as e:
-            self.logger.debug("Problem in updating targets at end of builder run: {}".format(e))
-
-        self.update_targets_thread.join()
-
-    def update_targets(self):
-        """
-        Thread to update targets periodically
-        """
-        while self.run_update_targets:
-            with self.update_data_condition:
-                self.update_data_condition.wait_for(
-                    lambda: not self.run_update_targets or len(self.data) > self.builder.chunk_size)
-                try:
-                    self.builder.update_targets(self.data)
-                    self.update_pbar.update(len(self.data))
-                    self.data.clear()
-                except Exception as e:
-                    self.logger.exception("Problem in updating targets in builder run: {}".format(e))
-
-
-class MultiprocProcessor(BaseProcessor):
-    """
-    Processor to run builders using python multiprocessing
-    """
-
-    def __init__(self, builders, max_workers=None):
-        self.max_workers = max_workers
-        super(MultiprocProcessor, self).__init__(builders)
-        self.logger.info(
-            "Building with multiprocessing, {} workers in the pool".format(
-                "{} max".format(multiprocessing.cpu_count())
-                if self.max_workers is None else self.max_workers))
-
-    def process(self, builder_id):
-        """
-        Run the builder using the builtin multiprocessing.
-
-        Args:
-            builder_id (int): the index of the builder in the builders list
-        """
-        self.builder = self.builders[builder_id]
-        self.builder.connect()
-
-        cursor = self.builder.get_items()
-
-        self.setup_pbars(cursor)
-
-        self.setup_multithreading()
-        self.put_tasks()
-        self.clean_up_data()
-        self.builder.finalize(cursor)
-        self.cleanup_pbars()
-
-    def setup_pbars(self, cursor):
-        """
-        Sets up progress bars
-        """
-        total = None
-
-        if isinstance(cursor, types.GeneratorType):
-            try:
-                cursor = primed(cursor)
-                if hasattr(self.builder, "total"):
-                    total = self.builder.total
-            except StopIteration:
-                self.logger.debug("Get items returned empty iterator")
-
-        elif hasattr(cursor, "__len__"):
-            total = len(cursor)
-        elif hasattr(cursor, "count"):
-            total = cursor.count()
-
-        self.get_pbar = tqdm(cursor, desc="Get Items", total=total)
-        self.process_pbar = tqdm(desc="Processing Item", total=total)
-        self.update_pbar = tqdm(desc="Updating Targets", total=total)
-
-    def cleanup_pbars(self):
-        """
-        Cleans up the TQDM bars
-        """
-        self.get_pbar.close()
-        self.process_pbar.close()
-        self.update_pbar.close()
-
-    def setup_multithreading(self):
-        """
-        Sets up objects necessary to store and synchronize data in multiprocessing
-        """
-        self.data = deque()
-        self.task_count = BoundedSemaphore(self.builder.chunk_size)
-        self.update_data_condition = Condition()
-
-        self.run_update_targets = True
-        self.update_targets_thread = Thread(target=self.update_targets)
-        self.update_targets_thread.start()
-
-    def put_tasks(self):
-        """
-        Processes all items from builder using a pool of processes
-        """
-        # 1.) setup a process pool
-        with ProcessPoolExecutor(self.max_workers) as executor:
-            # 2.) Loop over every item wrapped in a tqdm bar
-            for item in self.get_pbar:
-                # 3.) Limit total number of queues tasks using a semaphore
-                self.task_count.acquire()
-                # 4.) Submit a task to processing pool
-                f = executor.submit(self.builder.process_item, item)
-                # 5.) Add call back to update our data list
-                f.add_done_callback(self.update_data_callback)
-
-    def clean_up_data(self):
-        """
-        Updates targets with remaining data and then cleans up the data collection
-        """
-        try:
-            with self.update_data_condition:
-                self.run_update_targets = False
-                self.update_data_condition.notify_all()
-        except Exception as e:
-            self.logger.debug("Problem in updating targets at end of builder run: {}".format(e))
-
-        self.update_targets_thread.join()
-
-    def update_data_callback(self, future):
-        """
-        Call back to add data into a list in thread safe manner and signal other threads to add more tasks or update_targets
-        """
-        with self.update_data_condition:
-            self.process_pbar.update(1)
-            self.data.append(future.result())
-            self.update_data_condition.notify_all()
-
-        self.task_count.release()
-
-    def update_targets(self):
-        """
-        Thread to update targets periodically
-        """
-
-        while self.run_update_targets:
-            with self.update_data_condition:
-                self.update_data_condition.wait_for(
-                    lambda: not self.run_update_targets or len(self.data) > self.builder.chunk_size)
-                try:
-                    if self.data is not None:
-                        self.update_pbar.unpause()
-                        self.builder.update_targets(self.data)
-                        self.update_pbar.update(len(self.data))
-                        self.data.clear()
-                except Exception as e:
-                    self.logger.exception("Problem in updating targets in builder run: {}".format(e))
-
-
-class Runner(MSONable):
-    def __init__(self, builders, max_workers=1, mpi=False):
-        """
-        Initialize with a list of builders
-
-        Args:
-            builders(list): list of builders
-            max_workers (int): number of processes. Ignored if mpi is True.
-                Uses multiprocessing if not set to 1. Set to 0 for no maximum.
-            mpi (bool): Run with MPI
-        """
-        self.builders = builders
-        self.max_workers = max_workers
-        self.mpi = mpi
-        self.logger = logging.getLogger(type(self).__name__)
-        self.logger.addHandler(logging.NullHandler())
-
-        self.dependency_graph = self._get_builder_dependency_graph()
-        self.has_run = []  # for bookkeeping builder runs
-        if self.mpi:
-            self.processor = MPIProcessor(self.builders)
-        elif self.max_workers == 1:
-            self.processor = SerialProcessor(self.builders)
-        else:
-            max_workers = None if self.max_workers == 0 else self.max_workers
-            self.processor = MultiprocProcessor(self.builders, max_workers)
-
-
-    # TODO: make it efficient, O(N^2) complexity at the moment,
-    # might be ok(not many builders)? - KM
-    def _get_builder_dependency_graph(self):
-        """
-        Does the following:
-        1.) use targets and sources of builders to determine interdependencies
-        2.) order builders according to interdependencies
-
-        Returns:
-            dict
-        """
-        # key = index of the builder in the self.builders list
-        # value = list of indices of builders that the key depends on i.e these must run before
-        # the builder corresponding to the key.
-        links_dict = defaultdict(list)
-        for i, bi in enumerate(self.builders):
-            for j, bj in enumerate(self.builders):
-                if i != j:
-                    for s in bi.sources:
-                        if s in bj.targets:
-                            links_dict[i].append(j)
-        return links_dict
-
-    def run(self):
-        """
-        Does the following:
-            - traverse through the builder dependency graph and does the following to
-              each builder
-                - connect to sources
-                - get items and feed it to the processing pipeline
-                - process each item
-                    - supported options: serial, MPI or the builtin multiprocessing
-                - collect all processed items
-                - connect to the targets
-                - update targets
-                - finalize aka cleanup(close all connections etc)
-        """
-        if isinstance(self.processor, MPIProcessor):
-            self.logger.info(
-                "Running with MPI Rank {} (Size: {})".format(
-                    self.processor.rank, self.processor.size))
-        elif isinstance(self.processor, MultiprocProcessor):
-            self.logger.info(
-                "Running with Multiprocessing (up to {} workers)".format(
-                    multiprocessing.cpu_count()
-                    if self.max_workers == 0 else self.max_workers))
-        else:
-            self.logger.info("Running with {}".format(
-                str(self.processor.__class__.__name__)))
-
-        for i in range(len(self.builders)):
-            self._build_dependencies(i)
-
-    def _build_dependencies(self, builder_id):
-        """
-        Run the builders by recursively traversing through the dependency graph.
-
-        Args:
-            builder_id (int): builder index
-        """
-        if builder_id in self.has_run:
-            return
-        else:
-            if self.dependency_graph[builder_id]:
-                for j in self.dependency_graph[builder_id]:
-                    self._build_dependencies(j)
-            self._run_builder(builder_id)
-            self.has_run.append(builder_id)
-
-    def _run_builder(self, builder_id):
-        """
-        Run builder: self.builders[builder_id]
-
-        Args:
-            builder_id (int): builder index
-
-        Returns:
-
-        """
-        self.logger.debug("Building: {}".format(builder_id))
-        self.processor.process(builder_id)

From 8441186bfdd3aa2ff0c197651c6dd5ad0fa57af9 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 15:40:53 -0800
Subject: [PATCH 67/99] remove more defunct code

---
 maggma/cli/tests/test_mrun.py     | 48 ----------------
 maggma/examples/__init__.py       |  0
 maggma/examples/builders.py       | 10 ----
 maggma/examples/runner_sample.py  | 93 -------------------------------
 maggma/examples/tests/__init__.py |  0
 5 files changed, 151 deletions(-)
 delete mode 100644 maggma/cli/tests/test_mrun.py
 delete mode 100644 maggma/examples/__init__.py
 delete mode 100644 maggma/examples/builders.py
 delete mode 100755 maggma/examples/runner_sample.py
 delete mode 100644 maggma/examples/tests/__init__.py

diff --git a/maggma/cli/tests/test_mrun.py b/maggma/cli/tests/test_mrun.py
deleted file mode 100644
index 6abfafaef..000000000
--- a/maggma/cli/tests/test_mrun.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import os
-
-import subprocess
-from maggma.runner import Runner
-from monty.serialization import dumpfn
-import unittest
-from unittest import TestCase
-from uuid import uuid4
-
-from maggma.builders import CopyBuilder
-from maggma.stores import MongoStore
-
-
-@unittest.skip("Just don't")
-class TestMRun(TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.dbname = "test_" + uuid4().hex
-        cls.source = MongoStore(cls.dbname, "source")
-        cls.target = MongoStore(cls.dbname, "target")
-        cls.stores = [cls.source, cls.target]
-        for store in cls.stores:
-            store.connect()
-            store.ensure_index(store.key)
-            store.ensure_index([(store.lu_field, -1), (store.key, 1)])
-        cls.client = cls.stores[0].collection.database.client
-
-    @classmethod
-    def tearDownClass(cls):
-        cls.client.drop_database(cls.dbname)
-
-    def setUp(self):
-        self.runner_filename = "runner_" + uuid4().hex + ".json"
-
-    def tearDown(self):
-        os.remove(self.runner_filename)
-
-    def test_simple_runner(self):
-        builder = CopyBuilder(self.source, self.target)
-        runner = Runner([builder])
-        dumpfn(runner, self.runner_filename)
-        p = subprocess.run("python -m maggma.cli.mrun {}".format(
-            self.runner_filename).split(), timeout=15)
-        self.assertEqual(p.returncode, 0)
-
-
-if __name__ == "__main__":
-    unittest.main()
\ No newline at end of file
diff --git a/maggma/examples/__init__.py b/maggma/examples/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/maggma/examples/builders.py b/maggma/examples/builders.py
deleted file mode 100644
index e50c93dda..000000000
--- a/maggma/examples/builders.py
+++ /dev/null
@@ -1,10 +0,0 @@
-"""
-Example builders for testing and general use.
-"""
-from maggma.builders import MapBuilder, CopyBuilder
-
-import warnings
-
-warnings.warn("maggma.examples.builder is now deprecated. "
-              "MapBuilder and CopyBuilder have been moved to the "
-              "main builders module")
diff --git a/maggma/examples/runner_sample.py b/maggma/examples/runner_sample.py
deleted file mode 100755
index b8bf85f47..000000000
--- a/maggma/examples/runner_sample.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""
- Example Usage:
- with serial processing:
-    python runner_sample.py
- with multiprocessing (use max cores available):
-    python runner_sample.py -n 0
- with multiprocessing (use up to 3 cores):
-    python runner_sample.py -n 3
- with mpi(need mpi4py package) size 3:
-    mpiexec -n 3 python runner_sample.py --mpi
-"""
-
-import argparse
-import logging
-
-from maggma.stores import MemoryStore
-from maggma.builders import Builder
-from maggma.runner import Runner
-
-__author__ = "Kiran Mathew, Donny Winston"
-
-
-class MyDumbBuilder(Builder):
-    """This builder builds."""
-    def __init__(self, N, sources, targets, chunk_size=1):
-        super().__init__(sources, targets, chunk_size)
-        self.N = N
-
-    def get_items(self):
-        for i in range(self.N):
-            yield i
-
-    def process_item(self, item):
-        self.logger.info("processing item: {}".format(item))
-        return {item: "processed"}
-
-    def update_targets(self, items):
-        self.logger.info("Updating targets ...")
-        self.logger.info("Received {} processed items".format(len(items)))
-        self.logger.info("Updated items: {}".format(list(items)))
-
-    def finalize(self, cursor=None):
-        self.logger.info("Finalizing ...")
-        self.logger.info("DONE!")
-
-
-def logstreamhandle(runner, level=logging.INFO, stream=None):
-    """
-    Log output of runner and its processors and builders to stream at level.
-
-    Defaults: output to sys.stderr at INFO level.
-
-    Args:
-        runner (Runner): the runner.
-        level (int): logging level. DEBUG, INFO, WARNING, ERROR, or CRITICAL.
-        stream: any stream (sys.stdout, sys.stderr, etc.) or file-like object.
-    """
-    loggers = [runner.logger, runner.processor.logger]
-    loggers.extend(b.logger for b in runner.builders)
-    for l in loggers:
-        l.setLevel(level)
-        ch = logging.StreamHandler(stream=stream)
-        ch.setLevel(level)
-        formatter = logging.Formatter(
-            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-        ch.setFormatter(formatter)
-        l.addHandler(ch)
-
-
-if __name__ == '__main__':
-    N = 10
-    chunk_size = 3
-    stores = [MemoryStore(str(i)) for i in range(7)]
-
-    sources = [stores[0], stores[1], stores[3]]
-    targets = [stores[3], stores[6]]
-
-    mdb = MyDumbBuilder(N, sources, targets, chunk_size=chunk_size)
-
-    builders = [mdb]
-
-    parser = argparse.ArgumentParser(description='Run a sample runner.')
-    parser.add_argument('--nworkers', '-n', type=int, default=1,
-                        help='number of workers (0 for max available)')
-    parser.add_argument('--mpi', dest='mpi', action='store_true')
-    parser.add_argument('--no-mpi', dest='mpi', action='store_false')
-    parser.set_defaults(mpi=False)
-
-    args = parser.parse_args()
-    runner = Runner(builders, max_workers=args.nworkers, mpi=args.mpi)
-
-    logstreamhandle(runner)
-    runner.run()
diff --git a/maggma/examples/tests/__init__.py b/maggma/examples/tests/__init__.py
deleted file mode 100644
index e69de29bb..000000000

From 3120d2f1656cec1b01fe8f616a5f665259c6e812 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 18:04:32 -0800
Subject: [PATCH 68/99] more tests

---
 maggma/stores/tests/test_advanced_stores.py | 19 ++++++++
 maggma/stores/tests/test_mongolike.py       |  6 +--
 maggma/tests/test_builders.py               |  4 +-
 maggma/tests/test_utils.py                  | 52 ++++++++++++++++++---
 4 files changed, 70 insertions(+), 11 deletions(-)

diff --git a/maggma/stores/tests/test_advanced_stores.py b/maggma/stores/tests/test_advanced_stores.py
index ecec08375..0de1e94ce 100644
--- a/maggma/stores/tests/test_advanced_stores.py
+++ b/maggma/stores/tests/test_advanced_stores.py
@@ -19,6 +19,7 @@
 from unittest.mock import patch
 from uuid import uuid4
 
+from maggma.core import StoreError
 from maggma.stores import (
     MongoStore,
     MongograntStore,
@@ -100,6 +101,15 @@ def connected_user(store):
     ][0]["user"]
 
 
+def test_mgrant_init():
+    with pytest.raises(StoreError):
+        store = MongograntStore("", "", username="")
+
+    with pytest.raises(ValueError):
+        store = MongograntStore("", "")
+        store.connect()
+
+
 def test_mgrant_connect(mgrant_server, mgrant_user):
     config_path, mdport, dbname = mgrant_server
     assert mgrant_user is not None
@@ -255,6 +265,15 @@ def test_aliasing_substitute(alias_store):
     assert d is None
 
 
+def test_aliasing_distinct(alias_store):
+    d = [{"b": 1}, {"e": 2}, {"g": {"h": 3}}]
+    alias_store.store._collection.insert_many(d)
+
+    assert alias_store.distinct("a") == [1]
+    assert alias_store.distinct("c.d") == [2]
+    assert alias_store.distinct("f") == [3]
+
+
 @pytest.fixture
 def sandbox_store():
     memstore = MemoryStore()
diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
index b1af29592..bb4f13eaa 100644
--- a/maggma/stores/tests/test_mongolike.py
+++ b/maggma/stores/tests/test_mongolike.py
@@ -56,10 +56,10 @@ def test_mongostore_distinct(mongostore):
 
     # Test distinct subdocument functionality
     ghs = mongostore.distinct("g.h")
-    assert set(ghs), {1 == 2}
+    assert set(ghs) == {1, 2, None}
     ghs_ds = mongostore.distinct(["d", "g.h"], all_exist=True)
-    assert {s["g"]["h"] for s in ghs_ds}, {1 == 2}
-    assert {s["d"] for s in ghs_ds}, {5 == 6}
+    assert {s["g"]["h"] for s in ghs_ds} == {1, 2}
+    assert {s["d"] for s in ghs_ds}, {5, 6}
 
 
 def test_mongostore_update(mongostore):
diff --git a/maggma/tests/test_builders.py b/maggma/tests/test_builders.py
index ee99ddcf9..ac4838382 100644
--- a/maggma/tests/test_builders.py
+++ b/maggma/tests/test_builders.py
@@ -103,11 +103,11 @@ def test_delete_orphans(source, target, old_docs, new_docs):
     assert target.query_one(criteria={"k": 10})["v"] == "old"
 
 
-def test_prechunk(source, target, old_docs,new_docs):
+def test_prechunk(source, target, old_docs, new_docs):
     builder = CopyBuilder(source, target, delete_orphans=True)
     source.update(old_docs)
     source.update(new_docs)
 
     chunk_queries = list(builder.prechunk(2))
     assert len(chunk_queries) == 2
-    assert chunk_queries[0] ==  {'k': {'$in': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}}
+    assert chunk_queries[0] == {"k": {"$in": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}}
diff --git a/maggma/tests/test_utils.py b/maggma/tests/test_utils.py
index 34b332113..cbae39dcd 100644
--- a/maggma/tests/test_utils.py
+++ b/maggma/tests/test_utils.py
@@ -4,30 +4,70 @@
 """
 import pytest
 
-from maggma.utils import recursive_update, Timeout
+from maggma.utils import (
+    recursive_update,
+    Timeout,
+    primed,
+    dt_to_isoformat_ceil_ms,
+    isostr_to_dt,
+)
 from time import sleep
+from datetime import datetime
 
 
 def test_recursiveupdate():
     d = {"a": {"b": 3}, "c": [4]}
 
     recursive_update(d, {"c": [5]})
-    assert d["c"] ==  [5]
+    assert d["c"] == [5]
 
     recursive_update(d, {"a": {"b": 5}})
-    assert d["a"]["b"] ==  5
+    assert d["a"]["b"] == 5
 
     recursive_update(d, {"a": {"b": [6]}})
-    assert d["a"]["b"] ==  [6]
+    assert d["a"]["b"] == [6]
 
     recursive_update(d, {"a": {"b": [7]}})
-    assert d["a"]["b"] ==  [7]
+    assert d["a"]["b"] == [7]
 
-def test_timeout():
 
+def test_timeout():
     def takes_too_long():
         with Timeout(seconds=1):
             sleep(2)
+
     with pytest.raises(TimeoutError):
         takes_too_long()
 
+
+def test_primed():
+
+    global is_primed
+    is_primed = False
+
+    def unprimed_iter():
+        global is_primed
+        is_primed = True
+        for i in range(10):
+            yield i
+
+    iterator = unprimed_iter()
+
+    # iterator is still unprimed
+    assert is_primed is False
+
+    iterator = primed(iterator)
+    assert is_primed is True
+    assert list(iterator) == list(range(10))
+
+
+def test_datetime_utils():
+
+    assert (
+        dt_to_isoformat_ceil_ms(datetime(2019, 12, 13, 0, 23, 11, 9515))
+        == "2019-12-13T00:23:11.010"
+    )
+
+    assert isostr_to_dt("2019-12-13T00:23:11.010") == datetime(
+        2019, 12, 13, 0, 23, 11, 10000
+    )

From bacc9fe7e723d468475ee16799154f07dfe6d05c Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 18:04:45 -0800
Subject: [PATCH 69/99] remove more mpi

---
 maggma/utils.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/maggma/utils.py b/maggma/utils.py
index 4d7db351e..4e7c02a5c 100644
--- a/maggma/utils.py
+++ b/maggma/utils.py
@@ -129,24 +129,6 @@ def grouper(iterable, n, fillvalue=None):
     return iterator
 
 
-def get_mpi():
-    """
-    Helper that returns the mpi communicator, rank and size.
-    """
-    try:
-        from mpi4py import MPI
-
-        comm = MPI.COMM_WORLD
-        rank = comm.Get_rank()
-        size = comm.Get_size()
-    except Exception:
-        comm = None
-        rank = -1
-        size = 0
-
-    return comm, rank, size
-
-
 def lazy_substitute(d, aliases):
     """
     Simple top level substitute that doesn't dive into mongo like strings

From eade629657476c0d6c7d35ba0645737cbf1fd8bf Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 12 Dec 2019 18:04:55 -0800
Subject: [PATCH 70/99] fix memory store

---
 maggma/stores/mongolike.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index 6a8f0be54..b727aa0fd 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -13,9 +13,8 @@
 import mongomock
 
 from itertools import groupby
-from operator import itemgetter
 from pymongo import MongoClient
-from pydash import set_
+from pydash import set_, get, has
 
 from pymongo import ReplaceOne
 
@@ -307,18 +306,20 @@ def groupby(
             generator returning tuples of (key, list of elemnts)
         """
         keys = keys if isinstance(keys, list) else [keys]
-
-        input_data = list(self.query(properties=keys, criteria=criteria))
-
-        if len(keys) > 1:
-            grouper = itemgetter(*keys)
-            for vals, grp in groupby(sorted(input_data, key=grouper), grouper):
-                yield {k: v for k, v in zip(keys, vals)}, list(grp)
-        else:
-            grouper = itemgetter(*keys)
-            for val, group in groupby(sorted(input_data, key=grouper), grouper):
-                yield {keys[0]: val}, list(group)
-
+        data = [
+            doc
+            for doc in self.query(properties=keys, criteria=criteria)
+            if all(has(doc, k) for k in keys)
+        ]
+
+        def grouper(doc):
+            return tuple(get(doc, k) for k in keys)
+
+        for vals, group in groupby(sorted(data, key=grouper), grouper):
+            doc = {}
+            for k, v in zip(keys, vals):
+                set_(doc, k, v)
+            yield doc, list(group)
 
 
 class JSONStore(MemoryStore):

From 04a9d3e7d03d5522d6a3ca029fc6b7c56b7a2a12 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 13 Dec 2019 09:35:22 -0800
Subject: [PATCH 71/99] fix last_updated

---
 maggma/core/store.py       | 21 +++++++++++----------
 maggma/stores/mongolike.py |  4 +++-
 2 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/maggma/core/store.py b/maggma/core/store.py
index 1e4fc3ed1..bb31cead5 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -12,7 +12,7 @@
 from enum import Enum
 from typing import Union, Optional, Dict, List, Iterator, Tuple
 
-from pydash import identity, get
+from pydash import identity, get, has
 
 from monty.dev import deprecated
 from monty.json import MSONable, MontyDecoder
@@ -22,7 +22,7 @@
 
 class Sort(Enum):
     Ascending = 1
-    Descending = 2
+    Descending = -1
 
 
 class DateTimeFormat(Enum):
@@ -240,18 +240,17 @@ def last_updated(self) -> datetime:
             ),
             None,
         )
-        if doc and self.last_updated_field not in doc:
+        if doc and not has(doc, self.last_updated_field):
             raise StoreError(
                 f"No field '{self.last_updated_field}' in store document. Please ensure Store.last_updated_field "
                 "is a datetime field in your store that represents the time of "
                 "last update to each document."
             )
-        # Handle when collection has docs but `NoneType` last_updated_field.
-        return (
-            self._lu_func[0](doc[self.last_updated_field])
-            if (doc and doc[self.last_updated_field])
-            else datetime.min
-        )
+        elif not doc or get(doc, self.last_updated_field) is None:
+            # Handle when collection has docs but `NoneType` last_updated_field.
+            return datetime.min
+        else:
+            return self._lu_func[0](get(doc, self.last_updated_field))
 
     def newer_in(
         self,
@@ -291,7 +290,9 @@ def newer_in(
 
             new_keys = set(target_dates.keys()) - set(dates.keys())
             updated_keys = {
-                key for key, date in dates.items() if target_dates.get(key,datetime.min) > date
+                key
+                for key, date in dates.items()
+                if target_dates.get(key, datetime.min) > date
             }
 
             return list(new_keys | updated_keys)
diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index b727aa0fd..5c220df85 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -178,8 +178,10 @@ def query(
         """
         if isinstance(properties, list):
             properties = {p: 1 for p in properties}
+
+        sort = [(k, v.value) for k, v in sort.items()] if sort else None
         for d in self._collection.find(
-            filter=criteria, projection=properties, skip=skip, limit=limit
+            filter=criteria, projection=properties, skip=skip, limit=limit, sort=sort
         ):
             yield d
 

From 7ec411aa5db834ccae92424e2d855657ade35df0 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 13 Dec 2019 11:02:13 -0800
Subject: [PATCH 72/99] new tests and cleanup

---
 maggma/stores/tests/test_compound_stores.py |  2 -
 maggma/stores/tests/test_gridfs.py          | 75 ++++++++++++++++++++-
 2 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/maggma/stores/tests/test_compound_stores.py b/maggma/stores/tests/test_compound_stores.py
index f2d38762b..73fe1af4d 100644
--- a/maggma/stores/tests/test_compound_stores.py
+++ b/maggma/stores/tests/test_compound_stores.py
@@ -130,7 +130,6 @@ def test_joint_store_groupby(jointstore):
     assert len(docs[0][1]) == 5
     assert len(docs[1][1]) == 5
     docs = list(jointstore.groupby("test2.category2"))
-    print([d[0] for d in docs])
 
     none_docs = next(d for d in docs if get(d[0], "test2.category2") == [])
     one_docs = next(d for d in docs if get(d[0], "test2.category2") == [1])
@@ -170,7 +169,6 @@ def concat_store():
 
 
 def test_concat_store_distinct(concat_store):
-    print(type(concat_store))
     docs = list(concat_store.distinct("task_id"))
     actual_docs = list(
         chain.from_iterable(
diff --git a/maggma/stores/tests/test_gridfs.py b/maggma/stores/tests/test_gridfs.py
index 13167a38f..7d6b15173 100644
--- a/maggma/stores/tests/test_gridfs.py
+++ b/maggma/stores/tests/test_gridfs.py
@@ -3,6 +3,7 @@
 import numpy.testing.utils as nptu
 from datetime import datetime
 from maggma.stores import GridFSStore
+from maggma.core import Sort
 
 
 @pytest.fixture
@@ -89,7 +90,75 @@ def test_query(gridfsstore):
     assert gridfsstore.query_one(criteria={"task_id": "mp-3"}) is None
 
 
-@pytest.mark.skip("Not Done")
+def test_last_updated(gridfsstore):
+    data1 = np.random.rand(256)
+    data2 = np.random.rand(256)
+    tic = datetime(2018, 4, 12, 16)
+
+    gridfsstore.update(
+        [{"task_id": "mp-1", "data": data1, gridfsstore.last_updated_field: tic}]
+    )
+    gridfsstore.update(
+        [{"task_id": "mp-2", "data": data2, gridfsstore.last_updated_field: tic}]
+    )
+
+    assert gridfsstore.last_updated == tic
+
+    toc = datetime(2019, 6, 12, 16)
+    gridfsstore.update(
+        [{"task_id": "mp-3", "data": data2, gridfsstore.last_updated_field: toc}]
+    )
+
+    assert gridfsstore.last_updated == toc
+
+    tic = datetime(2017, 6, 12, 16)
+    gridfsstore.update(
+        [{"task_id": "mp-4", "data": data2, gridfsstore.last_updated_field: tic}]
+    )
+
+    assert gridfsstore.last_updated == toc
+
+
+def test_groupby(gridfsstore):
+    tic = datetime(2018, 4, 12, 16)
+
+    for i in range(3):
+        gridfsstore.update(
+            [{"task_id": f"mp-{i}", "a": 1, gridfsstore.last_updated_field: tic}],
+            key=["task_id", "a"],
+        )
+
+    for i in range(3, 7):
+        gridfsstore.update(
+            [{"task_id": f"mp-{i}", "a": 2, gridfsstore.last_updated_field: tic}],
+            key=["task_id", "a"],
+        )
+
+    groups = list(gridfsstore.groupby("a"))
+    assert len(groups) == 2
+    assert {g[0]["a"] for g in groups} == {1, 2}
+
+    by_group = {}
+    for group, docs in groups:
+        by_group[group["a"]] = {d["task_id"] for d in docs}
+    assert by_group[1] == {"mp-0", "mp-1", "mp-2"}
+    assert by_group[2] == {"mp-3", "mp-4", "mp-5", "mp-6"}
+
+
 def test_distinct(gridfsstore):
-    # TODO
-    pass
+    tic = datetime(2018, 4, 12, 16)
+
+    for i in range(3):
+        gridfsstore.update(
+            [{"task_id": f"mp-{i}", "a": 1, gridfsstore.last_updated_field: tic}],
+            key=["task_id", "a"],
+        )
+
+    for i in range(3, 7):
+        gridfsstore.update(
+            [{"task_id": f"mp-{i}", "a": 2, gridfsstore.last_updated_field: tic}],
+            key=["task_id", "a"],
+        )
+
+    assert set(gridfsstore.distinct("a")) == {1, 2}
+

From ba3f5eb928199e337341e84afded71dd586952a4 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 13 Dec 2019 11:02:43 -0800
Subject: [PATCH 73/99] fix for grouping by sub-fields

---
 maggma/stores/mongolike.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index 5c220df85..14f00497a 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -119,22 +119,24 @@ def groupby(
             generator returning tuples of (key, list of docs)
         """
         pipeline = []
+        if isinstance(keys, str):
+            keys = [keys]
+
         if criteria is not None:
             pipeline.append({"$match": criteria})
 
         if properties is not None:
-            pipeline.append({"$project": {p: 1 for p in properties}})
+            pipeline.append({"$project": {p: 1 for p in properties + keys}})
 
-        if isinstance(keys, str):
-            keys = [keys]
-
-        group_id = {}
-        for key in keys:
-            set_(group_id, key, "${}".format(key))
+        alpha = "abcdefghijklmnopqrstuvwxyz"
+        group_id = {letter: f"${key}" for letter, key in zip(alpha, keys)}
         pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
-
         for d in self._collection.aggregate(pipeline, allowDiskUse=True):
-            yield (d["_id"], d["docs"])
+            id_doc = {}
+            for letter, key in group_id.items():
+                if has(d["_id"], letter):
+                    set_(id_doc, key[1:], d["_id"][letter])
+            yield (id_doc, d["docs"])
 
     @classmethod
     def from_collection(cls, collection):

From 8bd90f7d22f5cfe9f0b7825d54cc9f39326b768d Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 13 Dec 2019 11:03:32 -0800
Subject: [PATCH 74/99] fix distinct and groupby

---
 maggma/stores/gridfs.py | 96 +++++++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 47 deletions(-)

diff --git a/maggma/stores/gridfs.py b/maggma/stores/gridfs.py
index 2879d7122..acf49daed 100644
--- a/maggma/stores/gridfs.py
+++ b/maggma/stores/gridfs.py
@@ -12,17 +12,17 @@
 from datetime import datetime
 import json
 import zlib
-import pymongo
 import gridfs
+from pydash import get, has
 
 from pymongo import MongoClient
 from monty.json import jsanitize
 from monty.dev import deprecated
 from maggma.utils import confirm_field_index
 from maggma.core import Store, Sort
+from maggma.stores import MongoStore
 
 
-# TODO: Make arguments more specific for this
 class GridFSStore(Store):
     """
     A Store for GrdiFS backend. Provides a common access method consistent with other stores
@@ -99,6 +99,8 @@ def connect(self, force_reset: bool = False):
 
             self._collection = gridfs.GridFS(db, self.collection_name)
             self._files_collection = db["{}.files".format(self.collection_name)]
+            self._files_store = MongoStore.from_collection(self._files_collection)
+            self._files_store.last_updated_field = f"metadata.{self.last_updated_field}"
             self._chunks_collection = db["{}.chunks".format(self.collection_name)]
 
     @property
@@ -112,24 +114,7 @@ def last_updated(self) -> datetime:
         Provides the most recent last_updated date time stamp from
         the documents in this Store
         """
-        doc = next(
-            self._files_collection.find(projection=[self.last_updated_field])
-            .sort([(self.last_updated_field, pymongo.DESCENDING)])
-            .limit(1),
-            None,
-        )
-        if doc and self.last_updated_field not in doc:
-            raise StoreError(
-                "No field '{}' in store document. Please ensure Store.last_updated_field "
-                "is a datetime field in your store that represents the time of "
-                "last update to each document.".format(self.last_updated_field)
-            )
-        # Handle when collection has docs but `NoneType` last_updated_field.
-        return (
-            self._lu_func[0](doc[self.last_updated_field])
-            if (doc and doc[self.last_updated_field])
-            else datetime.min
-        )
+        return self._files_store.last_updated
 
     @classmethod
     def transform_criteria(cls, criteria: Dict) -> Dict:
@@ -187,13 +172,16 @@ def query(
                 pass
             yield data
 
-    def distinct(self, key, criteria=None, all_exist=False, **kwargs):
+    def distinct(
+        self,
+        field: Union[List[str], str],
+        criteria: Optional[Dict] = None,
+        all_exist: bool = False,
+    ) -> Union[List[Dict], List]:
         """
         Function get to get all distinct values of a certain key in
         a GridFs store.
 
-        Currently not implemented
-        TODO: If key in metadata or transform to metadata field
 
         Args:
             key (mongolike key or list of mongolike keys): key or keys
@@ -203,7 +191,21 @@ def distinct(self, key, criteria=None, all_exist=False, **kwargs):
                 in each document, defaults to False
             **kwargs (kwargs): kwargs corresponding to collection.distinct
         """
-        raise Exception("Can't get distinct values of GridFS Store")
+        criteria = (
+            self.transform_criteria(criteria)
+            if isinstance(criteria, dict)
+            else criteria
+        )
+        field = [field] if not isinstance(field, list) else field
+        field = [
+            f"metadata.{k}"
+            if k not in self.files_collection_fields and not k.startswith("metadata.")
+            else k
+            for k in field
+        ]
+        return self._files_store.distinct(
+            field=field, criteria=criteria, all_exist=all_exist
+        )
 
     def groupby(
         self,
@@ -216,7 +218,8 @@ def groupby(
     ) -> Iterator[Tuple[Dict, List[Dict]]]:
         """
         Simple grouping function that will group documents
-        by keys.
+        by keys. Will only work if the keys are included in the files
+        collection for GridFS
 
         Args:
             keys: fields to group documents
@@ -229,32 +232,33 @@ def groupby(
         Returns:
             generator returning tuples of (dict, list of docs)
         """
-        pipeline = []
-        if criteria is not None:
-            criteria = self.transform_criteria(criteria)
-            pipeline.append({"$match": criteria})
 
-        if properties is not None:
-            properties = [
-                p if p in self.files_collection_fields else "metadata.{}".format(p)
-                for p in properties
-            ]
-            pipeline.append({"$project": {p: 1 for p in properties}})
-
-        if isinstance(keys, str):
-            keys = [keys]
-
-        # ensure propper naming for keys in and outside of metadata
+        criteria = (
+            self.transform_criteria(criteria)
+            if isinstance(criteria, dict)
+            else criteria
+        )
+        keys = [keys] if not isinstance(keys, list) else keys
         keys = [
-            k if k in self.files_collection_fields else "metadata.{}".format(k)
+            f"metadata.{k}"
+            if k not in self.files_collection_fields and not k.startswith("metadata.")
+            else k
             for k in keys
         ]
+        for group, ids in self._files_store.groupby(
+            keys, criteria=criteria, properties=[f"metadata.{self.key}"]
+        ):
+            ids = [
+                get(doc, f"metadata.{self.key}")
+                for doc in ids
+                if has(doc, f"metadata.{self.key}")
+            ]
 
-        group_id = {key: "${}".format(key) for key in keys}
-        pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
+            group = {
+                k.replace("metadata.", ""): get(group, k) for k in keys if has(group, k)
+            }
 
-        for doc in self._collection.aggregate(pipeline, allowDiskUse=True):
-            yield (doc["_id"], doc["docs"])
+            yield group, list(self.query(criteria={self.key: {"$in": ids}}))
 
     def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
         """
@@ -301,13 +305,11 @@ def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = No
             key = [self.key]
 
         key = list(set(key) | self.meta_keys - set(self.files_collection_fields))
-
         for d in docs:
             search_doc = {k: d[k] for k in key}
 
             metadata = {k: d[k] for k in [self.last_updated_field] if k in d}
             metadata.update(search_doc)
-
             data = json.dumps(jsanitize(d)).encode("UTF-8")
             if self.compression:
                 data = zlib.compress(data)

From 98cd5e09399111b19dcdbc5c2e77b08275dba058 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 13 Dec 2019 11:03:49 -0800
Subject: [PATCH 75/99] remove unused utils

---
 maggma/utils.py | 80 -------------------------------------------------
 1 file changed, 80 deletions(-)

diff --git a/maggma/utils.py b/maggma/utils.py
index 4e7c02a5c..71828835a 100644
--- a/maggma/utils.py
+++ b/maggma/utils.py
@@ -161,86 +161,6 @@ def unset(d, key):
             unset(d, path[:i])
 
 
-def total_size(o, handlers=None, verbose=False):
-    """
-    Returns the approximate memory footprint (in bytes) of an object.
-
-    Automatically finds the contents of the following builtin containers and
-    their subclasses:  tuple, list, deque, dict, set and frozenset.
-
-    To search other containers, add handlers to iterate over their contents:
-
-        handlers = {SomeContainerClass: iter,
-                    OtherContainerClass: OtherContainerClass.get_elements}
-
-    Example usage:
-    >>> d = dict(a=1, b=2, c=3, d=[4,5,6,7], e='a string of chars')
-    >>> print(total_size(d, verbose=True))
-
-    Based on: https://github.com/ActiveState/code/blob
-    /73b09edc1b9850c557a79296655f140ce5e853db
-    /recipes/Python/577504_Compute_Memory_footprint_object_its/recipe-577504.py
-    """
-    all_handlers = {
-        tuple: iter,
-        list: iter,
-        deque: iter,
-        dict: (lambda d: itertools.chain.from_iterable(d.items())),
-        set: iter,
-        frozenset: iter,
-    }
-    if handlers:
-        all_handlers.update(handlers)  # user handlers take precedence
-    seen = set()  # track which object id's have already been seen
-    default_size = getsizeof(0)  # estimate sizeof object without __sizeof__
-
-    def sizeof(o):
-        """Recursively determine size (in bytes) of object."""
-        if id(o) in seen:  # do not double count the same object
-            return 0
-        seen.add(id(o))
-        s = getsizeof(o, default_size)
-
-        if verbose:
-            print(s, type(o), repr(o), file=stderr)
-
-        for typ, handler in all_handlers.items():
-            if isinstance(o, typ):
-                s += sum(map(sizeof, handler(o)))
-                break
-        return s
-
-    return sizeof(o)
-
-
-def source_keys_updated(source, target, query=None):
-    """
-    Utility for incremental building. Gets a list of source.key values.
-
-    Get key values for source documents that have been updated with respect to
-    corresponding target documents.
-    """
-
-    keys_updated = set()  # Handle non-unique keys, e.g. for GroupBuilder.
-
-    props = {target.key: 1, target.last_updated_field: 1, "_id": 0}
-    target_dates = {
-        d[target.key]: target._lu_func[0](d[target.last_updated_field])
-        for d in target.query(properties=props)
-    }
-
-    props = {source.key: 1, source.last_updated_field: 1, "_id": 0}
-    cursor_source = source.query(criteria=query, properties=props)
-    for sdoc in cursor_source:
-        key, lu = sdoc[source.key], source._lu_func[0](sdoc[source.last_updated_field])
-        if key not in target_dates:
-            keys_updated.add(key)
-        elif lu > target_dates[key]:
-            keys_updated.add(key)
-
-    return list(keys_updated)
-
-
 class Timeout:
     # implementation courtesy of https://stackoverflow.com/a/22348885/637562
 

From de911f8a592a828a62814845122ecf854d2ee3d8 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Fri, 13 Dec 2019 11:04:40 -0800
Subject: [PATCH 76/99] update for newer maggma store capabillities

---
 maggma/builders.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 91bd68596..b480f1e91 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -7,7 +7,7 @@
 from time import time
 from math import ceil
 from datetime import datetime
-from maggma.utils import source_keys_updated, grouper, Timeout
+from maggma.utils import  grouper, Timeout
 from maggma.core import Builder, Store
 from typing import Optional, Dict, List, Iterator, Union
 
@@ -220,15 +220,13 @@ class GroupBuilder(MapBuilder, metaclass=ABCMeta):
     """
 
     def get_items(self) -> Iterator[Dict]:
-        criteria = source_keys_updated(self.source, self.target, query=self.query)
-        if all(isinstance(entry, str) for entry in self.grouping_properties()):
-            properties = {entry: 1 for entry in self.grouping_properties()}
-            if "_id" not in properties:
-                properties.update({"_id": 0})
-        else:
-            properties = {
-                entry: include for entry, include in self.grouping_properties()
+        criteria = {
+            self.source.key: {
+                "$in": self.target.newer_in(self.source, criteria=self.query)
             }
+        }
+
+        properties = self.grouping_properties()
         groups = self.docs_to_groups(
             self.source.query(criteria=criteria, properties=properties)
         )
@@ -289,5 +287,5 @@ def group_to_items(self, group: Dict) -> Iterator:
 class CopyBuilder(MapBuilder):
     """Sync a source store with a target store."""
 
-    def unary_function(self,item):
+    def unary_function(self, item):
         return item

From d3c44e6bc4cfde58485394c3c31204ee64d1b3d5 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Sun, 15 Dec 2019 09:27:44 -0800
Subject: [PATCH 77/99] test for sub-fields

---
 maggma/stores/tests/test_mongolike.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
index bb4f13eaa..dde555ccc 100644
--- a/maggma/stores/tests/test_mongolike.py
+++ b/maggma/stores/tests/test_mongolike.py
@@ -188,6 +188,19 @@ def test_groupby(memorystore):
     data = list(memorystore.groupby(["e", "d"]))
     assert len(data) == 3
 
+    memorystore.update(
+        [
+            {"e": { "d": 9}, "f": 9},
+            {"e": { "d": 9}, "f": 10},
+            {"e": { "d": 9}, "f": 11},
+            {"e": { "d": 10}, "f": 12},
+        ],
+        key="f",
+    )
+    data = list(memorystore.groupby("e.d"))
+    assert len(data) == 2
+    
+
 
 def test_json_store_load(test_dir):
     files = []

From 04a413a9e135a0886b37399af8bcf7e441ec66c7 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Mon, 23 Dec 2019 16:47:03 -0800
Subject: [PATCH 78/99] tests for simple processing

---
 maggma/cli/tests/test_multiprocessing.py | 56 ++++++++++++++++++++++++
 maggma/cli/tests/test_serial.py          | 34 ++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 maggma/cli/tests/test_multiprocessing.py
 create mode 100644 maggma/cli/tests/test_serial.py

diff --git a/maggma/cli/tests/test_multiprocessing.py b/maggma/cli/tests/test_multiprocessing.py
new file mode 100644
index 000000000..4b1c6367b
--- /dev/null
+++ b/maggma/cli/tests/test_multiprocessing.py
@@ -0,0 +1,56 @@
+import pytest
+import time
+import asyncio
+from maggma.core import Builder
+from maggma.cli.multiprocessing import AsyncBackPressuredMap, grouper, multi
+from concurrent.futures import ThreadPoolExecutor
+
+
+@pytest.mark.asyncio
+async def test_grouper():
+    async def arange(count):
+        for i in range(count):
+            yield (i)
+
+    async for group in grouper(arange(100), n=10):
+        assert len(group) == 10
+
+    async for group in grouper(arange(9), n=10, fillvalue="s"):
+        assert len(group) == 10
+
+    async for group in grouper(arange(9), n=10):
+        assert len(group) == 9
+
+
+def wait_and_return(x):
+    time.sleep(1)
+    return x * x
+
+
+@pytest.mark.asyncio
+async def test_backpressure_map():
+
+    executor = ThreadPoolExecutor(1)
+    mapper = AsyncBackPressuredMap(
+        iterator=range(3), func=wait_and_return, max_run=2, executor=executor
+    )
+
+    true_values = [x * x for x in range(3)]
+    async for finished_val in mapper:
+        finished_val = await finished_val
+        assert finished_val.result() == true_values.pop(0)
+
+    mapper = AsyncBackPressuredMap(
+        iterator=range(3), func=wait_and_return, max_run=2, executor=executor
+    )
+
+    # Put two items into the process queue
+    futures = [await mapper.__anext__(), await mapper.__anext__()]
+    # Ensure back_pressure enabled
+    assert mapper.back_pressure.locked()
+    await asyncio.sleep(2)
+    # Ensure back_pressure enabled till data is dequeued from process_pipeline
+    assert mapper.back_pressure.locked()
+    # Dequeue futures and ensure back_pressure is gone
+    await asyncio.gather(*futures)
+    assert not mapper.back_pressure.locked()
diff --git a/maggma/cli/tests/test_serial.py b/maggma/cli/tests/test_serial.py
new file mode 100644
index 000000000..3b87c26cf
--- /dev/null
+++ b/maggma/cli/tests/test_serial.py
@@ -0,0 +1,34 @@
+import pytest
+from maggma.core import Builder
+from maggma.cli.serial import serial
+
+
+class TestBuilder(Builder):
+    def __init__(self, total=10):
+        self.get_called = 0
+        self.process_called = 0
+        self.update_called = 0
+        self.total = total
+        super().__init__(sources=[], targets=[])
+
+    def get_items(self):
+        for i in range(self.total):
+            self.get_called += 1
+            yield self.get_called
+
+    def process_item(self, item):
+        self.process_called += 1
+        return item
+
+    def update_targets(self, items):
+        self.update_called += 1
+
+
+def test_serial():
+
+    builder = TestBuilder()
+
+    serial(builder)
+    assert builder.get_called == 10
+    assert builder.process_called == 10
+    assert builder.update_called == 1

From cd6dfb1fdd938de7556e718f1c6d68e3f689ba89 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Mon, 23 Dec 2019 16:49:28 -0800
Subject: [PATCH 79/99] add asyncio testing

---
 requirements-optional.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements-optional.txt b/requirements-optional.txt
index 6702ca1fd..661749e60 100644
--- a/requirements-optional.txt
+++ b/requirements-optional.txt
@@ -1,6 +1,7 @@
 invoke==1.0.0
 pytest==5.2.2
 pytest-cov==2.8.1
+pytest-asyncio==0.10.0
 mpi4py==3.0.0
 numpy==1.15.3
 python-coveralls==2.9.1

From 730554329488682bb6d14b889de46e026618f0a7 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Mon, 23 Dec 2019 16:49:44 -0800
Subject: [PATCH 80/99] update memorystore name

---
 maggma/stores/mongolike.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index 14f00497a..0331f6049 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -61,6 +61,7 @@ def __init__(
         self.kwargs = kwargs
         super().__init__(**kwargs)
 
+    @property
     def name(self) -> str:
         """
         Return a string representing this data source
@@ -269,8 +270,8 @@ class MemoryStore(MongoStore):
     to a MongoStore
     """
 
-    def __init__(self, name: str = "memory_db", **kwargs):
-        self.name = name
+    def __init__(self, collection_name: str = "memory_db", **kwargs):
+        self.collection_name = collection_name
         self._collection = None
         self.kwargs = kwargs
         super(MongoStore, self).__init__(**kwargs)
@@ -282,6 +283,10 @@ def connect(self, force_reset: bool = False):
         if not self._collection or force_reset:
             self._collection = mongomock.MongoClient().db[self.name]
 
+    @property
+    def name(self):
+        return self.collection_name
+
     def __hash__(self):
         return hash((self.name, self.last_updated_field))
 

From b2feee9e21ee070a676a338989b8062bd4979955 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Mon, 23 Dec 2019 16:49:58 -0800
Subject: [PATCH 81/99] clean up multiprocessing

---
 maggma/cli/multiprocessing.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/maggma/cli/multiprocessing.py b/maggma/cli/multiprocessing.py
index 063528cf4..a9982fad2 100644
--- a/maggma/cli/multiprocessing.py
+++ b/maggma/cli/multiprocessing.py
@@ -14,13 +14,13 @@ class AsyncBackPressuredMap:
     async access with backpressure
     """
 
-    def __init__(self, iterator, builder, executor):
+    def __init__(self, iterator, func, max_run, executor):
         self.iterator = iter(iterator)
-        self.process = builder.process_item
+        self.func = func
         self.executor = executor
-        self.back_pressure = BoundedSemaphore(builder.chunk_size)
+        self.back_pressure = BoundedSemaphore(max_run)
 
-    async def __aiter__(self):
+    def __aiter__(self):
         return self
 
     async def __anext__(self):
@@ -33,7 +33,7 @@ async def __anext__(self):
             raise StopAsyncIteration
 
         async def process_and_release():
-            future = loop.run_in_executor(self.executor, self.process, item)
+            future = loop.run_in_executor(self.executor, self.func, item)
             await future
             self.back_pressure.release()
             return future
@@ -58,7 +58,12 @@ async def multi(builder, num_workers):
     builder.connect()
     cursor = builder.get_items()
     executor = ProcessPoolExecutor(num_workers)
-    mapper = AsyncBackPressuredMap(tqdm(cursor, desc="Get"), builder, executor)
+    mapper = AsyncBackPressuredMap(
+        iterator=tqdm(cursor, desc="Get"),
+        func=builder.process_items,
+        max_run=builder.chunk_size,
+        executor=executor,
+    )
 
     async for chunk in grouper(mapper, builder.chunk_size, fillvalue=None):
         chunk = await asyncio.gather(*chunk)

From 413817db3105079171834b3762349a0c9d745c71 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Mon, 23 Dec 2019 17:02:09 -0800
Subject: [PATCH 82/99] update requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 6464c5dd2..37ad6899c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@ hvac==0.3.0
 tqdm==4.28.1
 mongogrant==0.2.2
 boto3==1.6.9
+aioitertools==0.5.1

From 9a53b0e705bdb8a73be803ca69ce6b3be8c032d6 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 24 Dec 2019 21:05:34 -0800
Subject: [PATCH 83/99] more aws tests

---
 maggma/stores/aws.py            |  2 ++
 maggma/stores/tests/test_aws.py | 40 +++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/maggma/stores/aws.py b/maggma/stores/aws.py
index 9023fda4e..755a9191b 100644
--- a/maggma/stores/aws.py
+++ b/maggma/stores/aws.py
@@ -120,6 +120,8 @@ def query(
                         "Could not find S3 object {}".format(doc[self.key])
                     )
                     break
+                else:
+                    raise e
 
             if doc.get("compression", "") == "zlib":
                 data = zlib.decompress(data)
diff --git a/maggma/stores/tests/test_aws.py b/maggma/stores/tests/test_aws.py
index 00b57398c..f24fa029c 100644
--- a/maggma/stores/tests/test_aws.py
+++ b/maggma/stores/tests/test_aws.py
@@ -4,6 +4,8 @@
 import zlib
 from moto import mock_s3
 from maggma.stores import MemoryStore, AmazonS3Store
+import maggma.stores.aws
+from botocore.exceptions import ClientError
 
 
 @pytest.fixture
@@ -59,3 +61,41 @@ def test_remove(s3store):
 
     assert s3store.query_one({"task_id": "mp-2"}) is None
     assert s3store.query_one({"task_id": "mp-4"}) is not None
+
+
+def test_close(s3store):
+    list(s3store.query())
+    s3store.close()
+    with pytest.raises(AttributeError):
+        list(s3store.query())
+
+
+@pytest.fixture
+def bad_import():
+    maggma.stores.aws.boto_import = False
+    yield
+    maggma.stores.aws.boto_import = True
+
+
+def test_bad_impot(bad_import):
+    with pytest.raises(ValueError):
+        index = MemoryStore("index'")
+        AmazonS3Store(index, "bucket1")
+
+
+def test_aws_error(s3store):
+    def raise_exception_404(data):
+        error_response = {"Error": {"Code": 404}}
+        raise ClientError(error_response, "raise_exception")
+
+    def raise_exception_other(data):
+        error_response = {"Error": {"Code": 405}}
+        raise ClientError(error_response, "raise_exception")
+
+    s3store.s3_bucket.Object = raise_exception_other
+    with pytest.raises(ClientError):
+        s3store.query_one()
+
+    # Should just pass
+    s3store.s3_bucket.Object = raise_exception_404
+    s3store.query_one()

From 94ee33f25000a3159e8a4c96eddfacc751c50876 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 24 Dec 2019 21:05:52 -0800
Subject: [PATCH 84/99] remove DateTime Store

---
 maggma/stores/mongolike.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index 0331f6049..3bd925286 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -359,20 +359,3 @@ def connect(self, force_reset=False):
 
     def __hash__(self):
         return hash((*self.paths, self.last_updated_field))
-
-
-class DatetimeStore(MemoryStore):
-    """Utility store intended for use with `Store.lu_filter`."""
-
-    def __init__(self, dt, **kwargs):
-        """
-        Args:
-            dt (Datetime): Datetime to set
-        """
-        self.__dt = dt
-        self.kwargs = kwargs
-        super().__init__("date", **kwargs)
-
-    def connect(self, force_reset=False):
-        super().connect(force_reset)
-        self._collection.insert_one({self.last_updated_field: self.__dt})

From f035c209bc76fa296d98cd60e10edf81165493c1 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 24 Dec 2019 21:06:12 -0800
Subject: [PATCH 85/99] test projection in map builder

---
 maggma/tests/test_builders.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/maggma/tests/test_builders.py b/maggma/tests/test_builders.py
index ac4838382..4d15e6a20 100644
--- a/maggma/tests/test_builders.py
+++ b/maggma/tests/test_builders.py
@@ -47,9 +47,15 @@ def test_get_items(source, target, old_docs):
     builder = CopyBuilder(source, target)
     source.update(old_docs)
     assert len(list(builder.get_items())) == len(old_docs)
+
     target.update(old_docs)
     assert len(list(builder.get_items())) == 0
 
+    builder = CopyBuilder(source, target, projection=["k"])
+    target.remove_docs({})
+    assert len(list(builder.get_items())) == len(old_docs)
+    assert all("v" not in d for d in builder.get_items())
+
 
 def test_process_item(source, target, old_docs):
     builder = CopyBuilder(source, target)

From e7f8d486a7f2b5b287a736879b98628543781c3b Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 24 Dec 2019 21:06:24 -0800
Subject: [PATCH 86/99] more mongo tests

---
 maggma/stores/tests/test_mongolike.py | 32 ++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
index dde555ccc..7af80cb7f 100644
--- a/maggma/stores/tests/test_mongolike.py
+++ b/maggma/stores/tests/test_mongolike.py
@@ -4,6 +4,7 @@
 from datetime import datetime
 from maggma.core import StoreError
 from maggma.stores import MongoStore, MemoryStore, JSONStore
+from maggma.validators import JSONSchemaValidator
 
 
 @pytest.fixture
@@ -63,7 +64,7 @@ def test_mongostore_distinct(mongostore):
 
 
 def test_mongostore_update(mongostore):
-    mongostore.update([{"e": 6, "d": 4}], key="e")
+    mongostore.update({"e": 6, "d": 4}, key="e")
     assert (
         mongostore.query_one(criteria={"d": {"$exists": 1}}, properties=["d"])["d"] == 4
     )
@@ -74,6 +75,18 @@ def test_mongostore_update(mongostore):
     mongostore.update([{"e": 11, "d": 8, "f": 9}], key=["d", "f"])
     assert mongostore.query_one(criteria={"d": 8, "f": 9}, properties=["e"])["e"] == 11
 
+    test_schema = {
+        "type": "object",
+        "properties": {"e": {"type": "integer"}},
+        "required": ["e"],
+    }
+    mongostore.validator = JSONSchemaValidator(schema=test_schema)
+    mongostore.update({"e": 100, "d": 3}, key="e")
+
+    # Non strict update
+    mongostore.update({"e": "abc", "d": 3}, key="e")
+
+
 
 def test_mongostore_groupby(mongostore):
     mongostore.update(
@@ -119,6 +132,14 @@ def test_mongostore_from_collection(mongostore, db_json):
     assert ms.database == other_ms.database
 
 
+def test_mongostore_name(mongostore):
+    assert mongostore.name == "test"
+
+def test_ensure_index(mongostore):
+    assert mongostore.ensure_index("test_key")
+    # TODO: How to check for exception?
+
+
 def test_mongostore_last_updated(mongostore):
     assert mongostore.last_updated == datetime.min
     start_time = datetime.utcnow()
@@ -190,16 +211,15 @@ def test_groupby(memorystore):
 
     memorystore.update(
         [
-            {"e": { "d": 9}, "f": 9},
-            {"e": { "d": 9}, "f": 10},
-            {"e": { "d": 9}, "f": 11},
-            {"e": { "d": 10}, "f": 12},
+            {"e": {"d": 9}, "f": 9},
+            {"e": {"d": 9}, "f": 10},
+            {"e": {"d": 9}, "f": 11},
+            {"e": {"d": 10}, "f": 12},
         ],
         key="f",
     )
     data = list(memorystore.groupby("e.d"))
     assert len(data) == 2
-    
 
 
 def test_json_store_load(test_dir):

From 9942af7831096e66934f356167e6dad64b5cb227 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Tue, 24 Dec 2019 21:06:57 -0800
Subject: [PATCH 87/99] set total in main builder

---
 maggma/builders.py              | 1 -
 maggma/cli/tests/test_serial.py | 2 +-
 maggma/core/builder.py          | 1 +
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index b480f1e91..4db4ff036 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -58,7 +58,6 @@ def __init__(
         self.projection = projection
         self.delete_orphans = delete_orphans
         self.kwargs = kwargs
-        self.total = None
         self.timeout = timeout
         self.store_process_time = store_process_time
         self.retry_failed = retry_failed
diff --git a/maggma/cli/tests/test_serial.py b/maggma/cli/tests/test_serial.py
index 3b87c26cf..6fd1f1d7f 100644
--- a/maggma/cli/tests/test_serial.py
+++ b/maggma/cli/tests/test_serial.py
@@ -8,8 +8,8 @@ def __init__(self, total=10):
         self.get_called = 0
         self.process_called = 0
         self.update_called = 0
-        self.total = total
         super().__init__(sources=[], targets=[])
+        self.total = total
 
     def get_items(self):
         for i in range(self.total):
diff --git a/maggma/core/builder.py b/maggma/core/builder.py
index b4a46d777..3bf472e04 100644
--- a/maggma/core/builder.py
+++ b/maggma/core/builder.py
@@ -45,6 +45,7 @@ def __init__(
         self.targets = targets if isinstance(targets, list) else [targets]
         self.chunk_size = chunk_size
         self.query = query
+        self.total = None
         self.logger = logging.getLogger(type(self).__name__)
         self.logger.addHandler(logging.NullHandler())
 

From 26c61286b571b6be7020d3ae4e42876856fbb9d2 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 25 Dec 2019 11:28:33 -0800
Subject: [PATCH 88/99] remove non-default key for newer_in

---
 maggma/core/store.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/maggma/core/store.py b/maggma/core/store.py
index bb31cead5..72a842bd9 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -253,11 +253,7 @@ def last_updated(self) -> datetime:
             return self._lu_func[0](get(doc, self.last_updated_field))
 
     def newer_in(
-        self,
-        target: Store,
-        key: Union[str, None] = None,
-        criteria: Optional[Dict] = None,
-        exhaustive: bool = False,
+        self, target: Store, criteria: Optional[Dict] = None, exhaustive: bool = False
     ) -> List[str]:
         """
         Returns the keys of documents that are newer in the target
@@ -272,8 +268,8 @@ def newer_in(
         """
         self.ensure_index(self.key)
         self.ensure_index(self.last_updated_field)
-        if exhaustive:
 
+        if exhaustive:
             # Get our current last_updated dates for each key value
             props = {self.key: 1, self.last_updated_field: 1, "_id": 0}
             dates = {
@@ -281,7 +277,7 @@ def newer_in(
                 for d in self.query(properties=props)
             }
 
-            # Get the
+            # Get the last_updated for the store we're comparing with
             props = {target.key: 1, target.last_updated_field: 1, "_id": 0}
             target_dates = {
                 d[target.key]: target._lu_func[0](d[target.last_updated_field])
@@ -298,11 +294,10 @@ def newer_in(
             return list(new_keys | updated_keys)
 
         else:
-            key = key if key is not None else self.key  # Default value
             criteria = {
                 self.last_updated_field: {"$gt": self._lu_func[1](self.last_updated)}
             }
-            return target.distinct(field=key, criteria=criteria)
+            return target.distinct(field=self.key, criteria=criteria)
 
     @deprecated(message="Please use Store.newer_in")
     def lu_filter(self, targets):

From d827328f3430116ac65c23af2304363fe79c8fb2 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 25 Dec 2019 11:28:53 -0800
Subject: [PATCH 89/99] no need for error on abstractmethod

---
 maggma/core/validator.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/maggma/core/validator.py b/maggma/core/validator.py
index f5d763882..d3ea65cdf 100644
--- a/maggma/core/validator.py
+++ b/maggma/core/validator.py
@@ -23,7 +23,6 @@ def is_valid(self, doc: Dict) -> bool:
         Returns (bool): True if document valid, False if document
         invalid
         """
-        return NotImplementedError
 
     @abstractmethod
     def validation_errors(self, doc: Dict) -> bool:
@@ -31,4 +30,3 @@ def validation_errors(self, doc: Dict) -> bool:
         Returns (bool): if document is not valid, provide a list of
         strings to display for why validation has failed
         """
-        return NotImplementedError

From ea596f4943031b584941e3365e97f306dfeeafbd Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 25 Dec 2019 15:48:08 -0800
Subject: [PATCH 90/99] mypy updates

---
 maggma/builders.py               |  4 ++--
 maggma/core/builder.py           | 11 +++++++----
 maggma/core/store.py             | 12 ++++++------
 maggma/stores/aws.py             | 26 +++++++++++---------------
 maggma/stores/compound_stores.py | 26 ++++++++++++++------------
 maggma/stores/gridfs.py          |  8 ++++----
 maggma/stores/mongolike.py       | 27 ++++++++++++++++++---------
 maggma/utils.py                  |  2 +-
 8 files changed, 63 insertions(+), 53 deletions(-)

diff --git a/maggma/builders.py b/maggma/builders.py
index 4db4ff036..76c0c3ff1 100644
--- a/maggma/builders.py
+++ b/maggma/builders.py
@@ -7,7 +7,7 @@
 from time import time
 from math import ceil
 from datetime import datetime
-from maggma.utils import  grouper, Timeout
+from maggma.utils import grouper, Timeout
 from maggma.core import Builder, Store
 from typing import Optional, Dict, List, Iterator, Union
 
@@ -254,7 +254,7 @@ def grouping_properties() -> Union[List, Dict]:
 
     @staticmethod
     @abstractmethod
-    def docs_to_groups(docs: List[Dict]) -> Iterator:
+    def docs_to_groups(docs: Iterator[Dict]) -> List:
         """
         Yield groups from (minimally-projected) documents.
 
diff --git a/maggma/core/builder.py b/maggma/core/builder.py
index 3bf472e04..a0b937d2c 100644
--- a/maggma/core/builder.py
+++ b/maggma/core/builder.py
@@ -6,7 +6,7 @@
 
 import logging
 from abc import ABCMeta, abstractmethod
-from typing import Union, Optional, Dict, List, Iterator, Any
+from typing import Union, Optional, Dict, List, Iterator, Iterable, Any
 
 from monty.json import MSONable, MontyDecoder
 from maggma.utils import grouper
@@ -45,7 +45,7 @@ def __init__(
         self.targets = targets if isinstance(targets, list) else [targets]
         self.chunk_size = chunk_size
         self.query = query
-        self.total = None
+        self.total = None  # type: Optional[int]
         self.logger = logging.getLogger(type(self).__name__)
         self.logger.addHandler(logging.NullHandler())
 
@@ -56,7 +56,7 @@ def connect(self):
         for s in self.sources + self.targets:
             s.connect()
 
-    def prechunk(self, number_splits: int) -> Iterator[Dict]:
+    def prechunk(self, number_splits: int) -> Iterable[Dict]:
         """
         Part of a domain-decomposition paradigm to allow the builder to operate on
         multiple nodes by divinding up the IO as well as the compute
@@ -66,7 +66,10 @@ def prechunk(self, number_splits: int) -> Iterator[Dict]:
         Args:
             number_splits: The number of groups to split the documents to work on
         """
-        yield self.query
+        if self.query:
+            return [self.query]
+        else:
+            return []
 
     @abstractmethod
     def get_items(self) -> Iterator:
diff --git a/maggma/core/store.py b/maggma/core/store.py
index 72a842bd9..36b42b66d 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -10,7 +10,7 @@
 
 from datetime import datetime
 from enum import Enum
-from typing import Union, Optional, Dict, List, Iterator, Tuple
+from typing import Union, Optional, Dict, List, Iterator, Tuple, Callable
 
 from pydash import identity, get, has
 
@@ -40,7 +40,7 @@ def __init__(
         self,
         key: str = "task_id",
         last_updated_field: str = "last_updated",
-        last_updated_type: DateTimeFormat = "datetime",
+        last_updated_type: DateTimeFormat = DateTimeFormat("datetime"),
         validator: Optional[Validator] = None,
     ):
         """
@@ -58,12 +58,12 @@ def __init__(
             LU_KEY_ISOFORMAT
             if last_updated_type == DateTimeFormat.IsoFormat
             else (identity, identity)
-        )
+        )  # type: Tuple[Callable, Callable]
         self.validator = validator
         self.logger = logging.getLogger(type(self).__name__)
         self.logger.addHandler(logging.NullHandler())
 
-    @abstractproperty
+    @abstractproperty  # type: ignore
     @deprecated(message="This will be removed in the future")
     def collection(self):
         """
@@ -129,7 +129,7 @@ def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = No
         pass
 
     @abstractmethod
-    def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
+    def ensure_index(self, key: str, unique: bool = False) -> bool:
         """
         Tries to create an index and return true if it suceeded
         Args:
@@ -201,7 +201,7 @@ def distinct(
         field: Union[List[str], str],
         criteria: Optional[Dict] = None,
         all_exist: bool = False,
-    ) -> Union[List[Dict], List]:
+    ) -> List:
         """
         Get all distinct values for a field(s)
         For a single field, this returns a list of values
diff --git a/maggma/stores/aws.py b/maggma/stores/aws.py
index 755a9191b..5934c16c5 100644
--- a/maggma/stores/aws.py
+++ b/maggma/stores/aws.py
@@ -6,7 +6,7 @@
 import json
 import zlib
 
-from typing import Union, Optional, Dict, List, Iterator, Tuple
+from typing import Union, Optional, Dict, List, Iterator, Tuple, Any
 
 from monty.json import jsanitize
 from monty.dev import deprecated
@@ -44,8 +44,8 @@ def __init__(self, index: Store, bucket: str, compress: bool = False, **kwargs):
         self.index = index
         self.bucket = bucket
         self.compress = compress
-        self.s3 = None
-        self.s3_bucket = None
+        self.s3 = None  # type: Any
+        self.s3_bucket = None  # type: Any
         # Force the key to be the same as the index
         kwargs["key"] = index.key
         super(AmazonS3Store, self).__init__(**kwargs)
@@ -77,7 +77,7 @@ def close(self):
         self.s3 = None
         self.s3_bucket = None
 
-    @property
+    @property  # type: ignore
     @deprecated(message="This will be removed in the future")
     def collection(self):
         """
@@ -170,7 +170,7 @@ def groupby(
         Returns:
             generator returning tuples of (dict, list of docs)
         """
-        self.index.groupby(
+        return self.index.groupby(
             keys=keys,
             criteria=criteria,
             properties=properties,
@@ -179,7 +179,7 @@ def groupby(
             limit=limit,
         )
 
-    def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
+    def ensure_index(self, key: str, unique: bool = False) -> bool:
         """
         Tries to create an index and return true if it suceeded
         Args:
@@ -189,7 +189,7 @@ def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
         Returns:
             bool indicating if the index exists/was created
         """
-        return self.index.ensure_index(key, unique=unique, background=True)
+        return self.index.ensure_index(key, unique=unique)
 
     def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
         """
@@ -248,7 +248,7 @@ def remove_docs(self, criteria: Dict, remove_s3_object: bool = False):
             self.index.remove_docs(criteria=criteria)
 
             # Can remove up to 1000 items at a time via boto
-            to_remove_chunks = list(grouper(to_remove, N=1000))
+            to_remove_chunks = list(grouper(to_remove, n=1000))
             for chunk_to_remove in to_remove_chunks:
                 self.s3_bucket.delete_objects()
 
@@ -257,11 +257,7 @@ def last_updated(self):
         return self.index.last_updated
 
     def newer_in(
-        self,
-        target: Store,
-        key: Union[str, None] = None,
-        criteria: Optional[Dict] = None,
-        exhaustive: bool = False,
+        self, target: Store, criteria: Optional[Dict] = None, exhaustive: bool = False
     ) -> List[str]:
         """
         Returns the keys of documents that are newer in the target
@@ -274,8 +270,8 @@ def newer_in(
                         the last_updated of the target Store and using
                         that to filter out new items in
         """
-        self.index.newer_in(
-            target=target, key=key, criteria=criteria, exhaustive=exhaustive
+        return self.index.newer_in(
+            target=target, criteria=criteria, exhaustive=exhaustive
         )
 
     def __hash__(self):
diff --git a/maggma/stores/compound_stores.py b/maggma/stores/compound_stores.py
index 91df35f6d..e995018e6 100644
--- a/maggma/stores/compound_stores.py
+++ b/maggma/stores/compound_stores.py
@@ -1,4 +1,4 @@
-from typing import List, Iterator, Tuple, Optional, Union, Dict
+from typing import List, Iterator, Tuple, Optional, Union, Dict, Any
 from datetime import datetime
 from itertools import groupby
 from pydash import set_
@@ -29,7 +29,7 @@ def __init__(
         self.port = port
         self.username = username
         self.password = password
-        self._collection = None
+        self._collection = None  # type: Any
         self.master = master or collection_names[0]
         self.merge_at_root = merge_at_root
         self.kwargs = kwargs
@@ -54,7 +54,7 @@ def connect(self, force_reset: bool = False):
     def close(self):
         self._collection.database.client.close()
 
-    @property
+    @property  # type: ignore
     @deprecated("This will be removed in the future")
     def collection(self):
         return self._collection
@@ -191,7 +191,7 @@ def groupby(
         )
         if not isinstance(keys, list):
             keys = [keys]
-        group_id = {}
+        group_id = {}  # type: Dict[str,Any]
         for key in keys:
             set_(group_id, key, "${}".format(key))
         pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
@@ -265,7 +265,7 @@ def close(self):
         for store in self.stores:
             store.close()
 
-    @property
+    @property  # type: ignore
     @deprecated
     def collection(self):
         raise NotImplementedError("No collection property for ConcatStore")
@@ -325,7 +325,7 @@ def distinct(
         else:
             return [dict(s) for s in set(frozenset(d.items()) for d in distincts)]
 
-    def ensure_index(self, key: str, unique: Optional[bool] = False) -> bool:
+    def ensure_index(self, key: str, unique: bool = False) -> bool:
         """
         Ensure an index is properly set. Returns whether all stores support this index or not
         Args:
@@ -391,7 +391,7 @@ def groupby(
         for store in self.stores:
             temp_docs = list(
                 store.groupby(
-                    keys,
+                    keys=keys,
                     criteria=criteria,
                     properties=properties,
                     sort=sort,
@@ -399,16 +399,18 @@ def groupby(
                     limit=limit,
                 )
             )
-            for group in temp_docs:
-                docs.extend(group[1])
+            for key, group in temp_docs:
+                docs.extend(group)
 
-        def key_set(d):
+        def key_set(d: Dict) -> Tuple:
             "index function based on passed in keys"
             test_d = tuple(d.get(k, None) for k in keys)
             return test_d
 
-        for k, group in groupby(sorted(docs, key=key_set), key=key_set):
-            yield k, list(group)
+        sorted_docs = sorted(docs, key=key_set)
+        for vals, group_iter in groupby(sorted_docs, key=key_set):
+            id_dict = {key: val for key, val in zip(keys, vals)}
+            yield id_dict, list(group_iter)
 
     def remove_docs(self, criteria: Dict):
         """
diff --git a/maggma/stores/gridfs.py b/maggma/stores/gridfs.py
index acf49daed..c1b1ffe45 100644
--- a/maggma/stores/gridfs.py
+++ b/maggma/stores/gridfs.py
@@ -6,7 +6,7 @@
 """
 from __future__ import annotations
 
-from typing import Union, Optional, Dict, List, Iterator, Tuple
+from typing import Union, Optional, Dict, List, Iterator, Tuple, Set, Any
 
 import copy
 from datetime import datetime
@@ -72,10 +72,10 @@ def __init__(
         self.port = port
         self.username = username
         self.password = password
-        self._collection = None
+        self._collection = None  # type: Any
         self.compression = compression
         self.kwargs = kwargs
-        self.meta_keys = set()
+        self.meta_keys = set()  # type: Set[str]
 
         if "key" not in kwargs:
             kwargs["key"] = "_id"
@@ -103,7 +103,7 @@ def connect(self, force_reset: bool = False):
             self._files_store.last_updated_field = f"metadata.{self.last_updated_field}"
             self._chunks_collection = db["{}.chunks".format(self.collection_name)]
 
-    @property
+    @property  # type: ignore
     @deprecated(message="This will be removed in the future")
     def collection(self):
         return self._collection
diff --git a/maggma/stores/mongolike.py b/maggma/stores/mongolike.py
index 3bd925286..2770fbb4b 100644
--- a/maggma/stores/mongolike.py
+++ b/maggma/stores/mongolike.py
@@ -8,7 +8,7 @@
 
 import json
 
-from typing import Union, Optional, Dict, List, Iterator, Tuple
+from typing import Union, Optional, Dict, List, Iterator, Tuple, Any
 
 import mongomock
 
@@ -57,7 +57,7 @@ def __init__(
         self.port = port
         self.username = username
         self.password = password
-        self._collection = None
+        self._collection = None  # type: Any
         self.kwargs = kwargs
         super().__init__(**kwargs)
 
@@ -79,7 +79,7 @@ def connect(self, force_reset: bool = False):
                 db.authenticate(self.username, self.password)
             self._collection = db[self.collection_name]
 
-    def __hash__(self):
+    def __hash__(self) -> int:
         return hash((self.database, self.collection_name, self.last_updated_field))
 
     @classmethod
@@ -123,17 +123,22 @@ def groupby(
         if isinstance(keys, str):
             keys = [keys]
 
+        if properties is None:
+            properties = []
+        if isinstance(properties, dict):
+            properties = list(properties.keys())
+
         if criteria is not None:
             pipeline.append({"$match": criteria})
 
-        if properties is not None:
+        if len(properties) > 0:
             pipeline.append({"$project": {p: 1 for p in properties + keys}})
 
         alpha = "abcdefghijklmnopqrstuvwxyz"
         group_id = {letter: f"${key}" for letter, key in zip(alpha, keys)}
         pipeline.append({"$group": {"_id": group_id, "docs": {"$push": "$$ROOT"}}})
         for d in self._collection.aggregate(pipeline, allowDiskUse=True):
-            id_doc = {}
+            id_doc = {}  # type: Dict[str,Any]
             for letter, key in group_id.items():
                 if has(d["_id"], letter):
                     set_(id_doc, key[1:], d["_id"][letter])
@@ -154,7 +159,7 @@ def from_collection(cls, collection):
         store._collection = collection
         return store
 
-    @property
+    @property  # type: ignore
     @deprecated(message="This will be removed in the future")
     def collection(self):
         if self._collection is None:
@@ -182,9 +187,13 @@ def query(
         if isinstance(properties, list):
             properties = {p: 1 for p in properties}
 
-        sort = [(k, v.value) for k, v in sort.items()] if sort else None
+        sort_list = [(k, v.value) for k, v in sort.items()] if sort else None
         for d in self._collection.find(
-            filter=criteria, projection=properties, skip=skip, limit=limit, sort=sort
+            filter=criteria,
+            projection=properties,
+            skip=skip,
+            limit=limit,
+            sort=sort_list,
         ):
             yield d
 
@@ -325,7 +334,7 @@ def grouper(doc):
             return tuple(get(doc, k) for k in keys)
 
         for vals, group in groupby(sorted(data, key=grouper), grouper):
-            doc = {}
+            doc = {}  # type: Dict[Any,Any]
             for k, v in zip(keys, vals):
                 set_(doc, k, v)
             yield doc, list(group)
diff --git a/maggma/utils.py b/maggma/utils.py
index 71828835a..ad32fc306 100644
--- a/maggma/utils.py
+++ b/maggma/utils.py
@@ -18,7 +18,7 @@
 # import tqdm Jupyter widget if running inside Jupyter
 try:
     # noinspection PyUnresolvedReferences
-    if get_ipython().__class__.__name__ == "ZMQInteractiveShell":
+    if get_ipython().__class__.__name__ == "ZMQInteractiveShell": # type: ignore
         from tqdm import tqdm_notebook as tqdm
     else:  # likely 'TerminalInteractiveShell'
         from tqdm import tqdm

From 310e7ec3c6607445dcd7af77db2c647a0ef0567a Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 25 Dec 2019 21:11:13 -0800
Subject: [PATCH 91/99] remove redundant pass

---
 maggma/core/store.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/maggma/core/store.py b/maggma/core/store.py
index 36b42b66d..8b0ce17be 100644
--- a/maggma/core/store.py
+++ b/maggma/core/store.py
@@ -70,28 +70,24 @@ def collection(self):
         Returns a handle to the pymongo collection object
         Not guaranteed to exist in the future
         """
-        pass
 
     @abstractproperty
     def name(self) -> str:
         """
         Return a string representing this data source
         """
-        pass
 
     @abstractmethod
     def connect(self, force_reset: bool = False):
         """
         Connect to the source data
         """
-        pass
 
     @abstractmethod
     def close(self):
         """
         Closes any connections
         """
-        pass
 
     @abstractmethod
     def query(
@@ -112,7 +108,6 @@ def query(
             skip: number documents to skip
             limit: limit on total number of documents returned
         """
-        pass
 
     @abstractmethod
     def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = None):
@@ -126,7 +121,6 @@ def update(self, docs: Union[List[Dict], Dict], key: Union[List, str, None] = No
                  a single field, or None if the Store's key
                  field is to be used
         """
-        pass
 
     @abstractmethod
     def ensure_index(self, key: str, unique: bool = False) -> bool:
@@ -139,7 +133,6 @@ def ensure_index(self, key: str, unique: bool = False) -> bool:
         Returns:
             bool indicating if the index exists/was created
         """
-        pass
 
     @abstractmethod
     def groupby(
@@ -166,7 +159,6 @@ def groupby(
         Returns:
             generator returning tuples of (dict, list of docs)
         """
-        pass
 
     @abstractmethod
     def remove_docs(self, criteria: Dict):
@@ -176,7 +168,6 @@ def remove_docs(self, criteria: Dict):
         Args:
             criteria: query dictionary to match
         """
-        pass
 
     def query_one(
         self,
@@ -354,5 +345,3 @@ def __setstate__(self, d):
 
 class StoreError(Exception):
     """General Store-related error."""
-
-    pass

From 9f142809257e244be62a5d410e087713b7744105 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 25 Dec 2019 21:11:22 -0800
Subject: [PATCH 92/99] more validator tests

---
 maggma/tests/test_validator.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/maggma/tests/test_validator.py b/maggma/tests/test_validator.py
index 414eb51c9..7c79f2122 100644
--- a/maggma/tests/test_validator.py
+++ b/maggma/tests/test_validator.py
@@ -3,7 +3,7 @@
 Tests the validators
 """
 import pytest
-from maggma.validators import JSONSchemaValidator, msonable_schema
+from maggma.validators import JSONSchemaValidator, msonable_schema, ValidationError
 from monty.json import MSONable
 
 
@@ -35,6 +35,7 @@ def test_jsonschemevalidator(test_schema):
     """
 
     validator = JSONSchemaValidator(schema=test_schema)
+    strict_validator = JSONSchemaValidator(schema=test_schema,strict=True)
 
     lattice = LatticeMock(5)
 
@@ -59,6 +60,10 @@ def test_jsonschemevalidator(test_schema):
     assert not validator.is_valid(invalid_doc_missing_key)
     assert not validator.is_valid(invalid_doc_wrong_type)
 
+    with pytest.raises(ValidationError):
+        strict_validator.is_valid(invalid_doc_msonable)
+
+    assert validator.validation_errors(valid_doc) == []
     assert validator.validation_errors(invalid_doc_msonable) == [
         "lattice: ['I am not a lattice!'] is not of type 'object'"
     ]

From d973820e7a4ff03c38370e132df2682160cdf539 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 25 Dec 2019 21:11:31 -0800
Subject: [PATCH 93/99] main cli tests

---
 maggma/cli/tests/test_init.py | 52 +++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 maggma/cli/tests/test_init.py

diff --git a/maggma/cli/tests/test_init.py b/maggma/cli/tests/test_init.py
new file mode 100644
index 000000000..a44e30bda
--- /dev/null
+++ b/maggma/cli/tests/test_init.py
@@ -0,0 +1,52 @@
+import pytest
+from click.testing import CliRunner
+from maggma.cli import run
+from maggma.stores import MongoStore, MemoryStore
+from maggma.builders import CopyBuilder
+from monty.serialization import dumpfn
+from datetime import datetime
+
+
+@pytest.fixture
+def mongostore():
+    store = MongoStore("maggma_test", "test")
+    store.connect()
+    yield store
+    store._collection.drop()
+
+
+def test_basic_run():
+
+    runner = CliRunner()
+    result = runner.invoke(run, ["--help"])
+    assert result.exit_code == 0
+
+    result = runner.invoke(run)
+    assert result.exit_code == 0
+
+
+def test_run_builder(mongostore):
+
+    memorystore = MemoryStore("temp")
+    builder = CopyBuilder(mongostore, memorystore)
+
+    mongostore.update(
+        [
+            {mongostore.key: i, mongostore.last_updated_field: datetime.utcnow()}
+            for i in range(10)
+        ]
+    )
+
+    runner = CliRunner()
+    with runner.isolated_filesystem():
+        dumpfn(builder, "test_builder.json")
+        result = runner.invoke(run, ["-v", "test_builder.json"])
+        assert result.exit_code == 0
+        assert "CopyBuilder" in result.output
+        assert "SerialProcessor" in result.output
+
+        result = runner.invoke(run, ["-v", "-n", "2", "test_builder.json"])
+        print(result)
+        assert result.exit_code == 0
+        assert "CopyBuilder" in result.output
+        assert "MultiProcessor" in result.output

From c00b9675020e58db2aa0fbd7016f1e533eab06f6 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 25 Dec 2019 21:11:45 -0800
Subject: [PATCH 94/99] misc bugs in cli

---
 maggma/cli/__init__.py        | 23 -----------------------
 maggma/cli/multiprocessing.py |  6 +++++-
 maggma/cli/utils.py           | 13 -------------
 3 files changed, 5 insertions(+), 37 deletions(-)
 delete mode 100644 maggma/cli/utils.py

diff --git a/maggma/cli/__init__.py b/maggma/cli/__init__.py
index dad9e84bc..3ecd269be 100644
--- a/maggma/cli/__init__.py
+++ b/maggma/cli/__init__.py
@@ -12,29 +12,6 @@
 from maggma.cli.multiprocessing import multi
 
 
-""""
-mrun script1
-mrun script1 script2 script3
-mrun -n  32 script1 script2
-
-
-
-
-
-mrun master -N 4 sciprt1 script2 <-- have to deploy workers
-mrun worker -n 32 127.0.0.1:70001
-mrun worker -n 32 127.0.0.1:70001
-mrun worker -n 32 127.0.0.1:70001
-
-
-mrun master -N 4 script1 script 2
-mpirun -N 4 mrun worker -n 32 script1 script 2
-
-
-
-"""
-
-
 @click.command()
 @click.argument("builders", nargs=-1, type=click.Path(exists=True))
 @click.option(
diff --git a/maggma/cli/multiprocessing.py b/maggma/cli/multiprocessing.py
index a9982fad2..b37f319ef 100644
--- a/maggma/cli/multiprocessing.py
+++ b/maggma/cli/multiprocessing.py
@@ -2,6 +2,7 @@
 # coding utf-8
 
 import asyncio
+import logging
 from asyncio import BoundedSemaphore
 from aioitertools import zip_longest
 from concurrent.futures import ProcessPoolExecutor
@@ -55,17 +56,20 @@ async def grouper(iterable, n, fillvalue=None):
 
 
 async def multi(builder, num_workers):
+    logger = logging.getLogger("MultiProcessor")
+
     builder.connect()
     cursor = builder.get_items()
     executor = ProcessPoolExecutor(num_workers)
     mapper = AsyncBackPressuredMap(
         iterator=tqdm(cursor, desc="Get"),
-        func=builder.process_items,
+        func=builder.process_item,
         max_run=builder.chunk_size,
         executor=executor,
     )
 
     async for chunk in grouper(mapper, builder.chunk_size, fillvalue=None):
+        logger.info("Processing batch of {} items".format(builder.chunk_size))
         chunk = await asyncio.gather(*chunk)
         processed_items = [c.result() for c in chunk if chunk is not None]
         builder.update_targets(processed_items)
diff --git a/maggma/cli/utils.py b/maggma/cli/utils.py
deleted file mode 100644
index 8d4fcab2c..000000000
--- a/maggma/cli/utils.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from typing import List
-from maggma.core import Builder
-
-
-def get_build_order(builders: List[Builder]) -> List[Builder]:
-    """
-    Returns a list of builders in the order they should run to satisfy
-    dependencies
-
-    TODO: For now just do dumb in order since builders should be 
-    written to just run over and over again
-    """
-    return builders

From 82692e9f8f8a2ce2edb40e3bfca78446f6ec9bd1 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 25 Dec 2019 21:11:54 -0800
Subject: [PATCH 95/99] update docstring

---
 maggma/core/builder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/maggma/core/builder.py b/maggma/core/builder.py
index a0b937d2c..78b415bb1 100644
--- a/maggma/core/builder.py
+++ b/maggma/core/builder.py
@@ -121,6 +121,7 @@ def finalize(self):
     def run(self):
         """
         Run the builder serially
+        This is only intended for diagnostic purposes
 
         Args:
             builder_id (int): the index of the builder in the builders list

From 449bf4038d0ee5b89718ab3759a080e44ba2802e Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Wed, 25 Dec 2019 21:16:31 -0800
Subject: [PATCH 96/99] update optional requirements

---
 requirements-optional.txt | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/requirements-optional.txt b/requirements-optional.txt
index 661749e60..35e564b3b 100644
--- a/requirements-optional.txt
+++ b/requirements-optional.txt
@@ -1,8 +1,7 @@
 invoke==1.0.0
-pytest==5.2.2
-pytest-cov==2.8.1
+pytest==5.3.1
 pytest-asyncio==0.10.0
-mpi4py==3.0.0
+pytest-cov==2.8.1
 numpy==1.15.3
 python-coveralls==2.9.1
 sphinx==1.7.5

From baaf407d11713982a88a5d0532b87985b0fd53aa Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 26 Dec 2019 12:33:55 -0800
Subject: [PATCH 97/99] flake8 fixes

---
 maggma/cli/serial.py                     | 1 -
 maggma/cli/tests/test_multiprocessing.py | 3 +--
 maggma/cli/tests/test_serial.py          | 1 -
 maggma/stores/__init__.py                | 9 +++++++--
 maggma/stores/tests/test_gridfs.py       | 2 --
 maggma/stores/tests/test_mongolike.py    | 2 +-
 maggma/tests/test_validator.py           | 2 +-
 maggma/utils.py                          | 5 +----
 8 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/maggma/cli/serial.py b/maggma/cli/serial.py
index ca696f85d..90ef9e43c 100644
--- a/maggma/cli/serial.py
+++ b/maggma/cli/serial.py
@@ -27,4 +27,3 @@ def serial(builder: Builder):
         builder.update_targets(processed_items)
 
     builder.finalize()
-
diff --git a/maggma/cli/tests/test_multiprocessing.py b/maggma/cli/tests/test_multiprocessing.py
index 4b1c6367b..ee93b2df5 100644
--- a/maggma/cli/tests/test_multiprocessing.py
+++ b/maggma/cli/tests/test_multiprocessing.py
@@ -1,8 +1,7 @@
 import pytest
 import time
 import asyncio
-from maggma.core import Builder
-from maggma.cli.multiprocessing import AsyncBackPressuredMap, grouper, multi
+from maggma.cli.multiprocessing import AsyncBackPressuredMap, grouper
 from concurrent.futures import ThreadPoolExecutor
 
 
diff --git a/maggma/cli/tests/test_serial.py b/maggma/cli/tests/test_serial.py
index 6fd1f1d7f..e71394c80 100644
--- a/maggma/cli/tests/test_serial.py
+++ b/maggma/cli/tests/test_serial.py
@@ -1,4 +1,3 @@
-import pytest
 from maggma.core import Builder
 from maggma.cli.serial import serial
 
diff --git a/maggma/stores/__init__.py b/maggma/stores/__init__.py
index 2144d20ee..56ffb6aa8 100644
--- a/maggma/stores/__init__.py
+++ b/maggma/stores/__init__.py
@@ -1,5 +1,10 @@
 from maggma.stores.mongolike import MongoStore, JSONStore, MemoryStore
 from maggma.stores.gridfs import GridFSStore
-from maggma.stores.advanced_stores import MongograntStore, VaultStore, AliasingStore, SandboxStore
+from maggma.stores.advanced_stores import (
+    MongograntStore,
+    VaultStore,
+    AliasingStore,
+    SandboxStore,
+)
 from maggma.stores.aws import AmazonS3Store
-from maggma.stores.compound_stores import JointStore, ConcatStore
\ No newline at end of file
+from maggma.stores.compound_stores import JointStore, ConcatStore
diff --git a/maggma/stores/tests/test_gridfs.py b/maggma/stores/tests/test_gridfs.py
index 7d6b15173..177a4b6da 100644
--- a/maggma/stores/tests/test_gridfs.py
+++ b/maggma/stores/tests/test_gridfs.py
@@ -3,7 +3,6 @@
 import numpy.testing.utils as nptu
 from datetime import datetime
 from maggma.stores import GridFSStore
-from maggma.core import Sort
 
 
 @pytest.fixture
@@ -161,4 +160,3 @@ def test_distinct(gridfsstore):
         )
 
     assert set(gridfsstore.distinct("a")) == {1, 2}
-
diff --git a/maggma/stores/tests/test_mongolike.py b/maggma/stores/tests/test_mongolike.py
index 7af80cb7f..d3bb6dddc 100644
--- a/maggma/stores/tests/test_mongolike.py
+++ b/maggma/stores/tests/test_mongolike.py
@@ -87,7 +87,6 @@ def test_mongostore_update(mongostore):
     mongostore.update({"e": "abc", "d": 3}, key="e")
 
 
-
 def test_mongostore_groupby(mongostore):
     mongostore.update(
         [
@@ -135,6 +134,7 @@ def test_mongostore_from_collection(mongostore, db_json):
 def test_mongostore_name(mongostore):
     assert mongostore.name == "test"
 
+
 def test_ensure_index(mongostore):
     assert mongostore.ensure_index("test_key")
     # TODO: How to check for exception?
diff --git a/maggma/tests/test_validator.py b/maggma/tests/test_validator.py
index 7c79f2122..16b0abe5f 100644
--- a/maggma/tests/test_validator.py
+++ b/maggma/tests/test_validator.py
@@ -35,7 +35,7 @@ def test_jsonschemevalidator(test_schema):
     """
 
     validator = JSONSchemaValidator(schema=test_schema)
-    strict_validator = JSONSchemaValidator(schema=test_schema,strict=True)
+    strict_validator = JSONSchemaValidator(schema=test_schema, strict=True)
 
     lattice = LatticeMock(5)
 
diff --git a/maggma/utils.py b/maggma/utils.py
index ad32fc306..3a1f125a4 100644
--- a/maggma/utils.py
+++ b/maggma/utils.py
@@ -6,10 +6,7 @@
 import signal
 import logging
 
-
-from collections import deque
 from datetime import datetime, timedelta
-from sys import getsizeof, stderr
 
 from pydash.utilities import to_path
 from pydash.objects import set_, get, has
@@ -18,7 +15,7 @@
 # import tqdm Jupyter widget if running inside Jupyter
 try:
     # noinspection PyUnresolvedReferences
-    if get_ipython().__class__.__name__ == "ZMQInteractiveShell": # type: ignore
+    if get_ipython().__class__.__name__ == "ZMQInteractiveShell":  # type: ignore
         from tqdm import tqdm_notebook as tqdm
     else:  # likely 'TerminalInteractiveShell'
         from tqdm import tqdm

From c36bcd484ceceabaf33f67f42d50d2de2a34c01c Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 26 Dec 2019 12:34:10 -0800
Subject: [PATCH 98/99] add progress bars for process and update again

---
 maggma/cli/multiprocessing.py | 51 ++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/maggma/cli/multiprocessing.py b/maggma/cli/multiprocessing.py
index b37f319ef..db6b22f35 100644
--- a/maggma/cli/multiprocessing.py
+++ b/maggma/cli/multiprocessing.py
@@ -2,11 +2,27 @@
 # coding utf-8
 
 import asyncio
+import types
 import logging
 from asyncio import BoundedSemaphore
 from aioitertools import zip_longest
 from concurrent.futures import ProcessPoolExecutor
-from maggma.utils import tqdm
+from maggma.utils import tqdm, primed
+
+
+class ProcessItemsSemaphore(BoundedSemaphore):
+    """
+    Modified BoundedSemaphore to update a TQDM bar
+    for process_items
+    """
+
+    def __init__(self, total=None, *args, **kwargs):
+        self.tqdm = tqdm(total=total, desc="Process Items")
+        super().__init__(*args, **kwargs)
+
+    def release(self):
+        self.tqdm.update(1)
+        super().release()
 
 
 class AsyncBackPressuredMap:
@@ -15,11 +31,11 @@ class AsyncBackPressuredMap:
     async access with backpressure
     """
 
-    def __init__(self, iterator, func, max_run, executor):
+    def __init__(self, iterator, func, max_run, executor, total=None):
         self.iterator = iter(iterator)
         self.func = func
         self.executor = executor
-        self.back_pressure = BoundedSemaphore(max_run)
+        self.back_pressure = ProcessItemsSemaphore(value=max_run, total=total)
 
     def __aiter__(self):
         return self
@@ -42,6 +58,28 @@ async def process_and_release():
         return process_and_release()
 
 
+def get_total(cursor, builder):
+    """
+    Gets the total item count from the builder
+    """
+    total = None
+
+    if isinstance(cursor, types.GeneratorType):
+        try:
+            cursor = primed(cursor)
+            if hasattr(builder, "total"):
+                total = builder.total
+        except StopIteration:
+            pass
+
+    elif hasattr(cursor, "__len__"):
+        total = len(cursor)
+    elif hasattr(cursor, "count"):
+        total = cursor.count()
+
+    return total
+
+
 async def grouper(iterable, n, fillvalue=None):
     """
     Collect data into fixed-length chunks or blocks.
@@ -61,15 +99,20 @@ async def multi(builder, num_workers):
     builder.connect()
     cursor = builder.get_items()
     executor = ProcessPoolExecutor(num_workers)
+    total = get_total(cursor, builder)
+
     mapper = AsyncBackPressuredMap(
-        iterator=tqdm(cursor, desc="Get"),
+        iterator=tqdm(cursor, desc="Get", total=total),
         func=builder.process_item,
         max_run=builder.chunk_size,
         executor=executor,
+        total=total,
     )
+    update_items = tqdm(total=total, desc="Update Targets")
 
     async for chunk in grouper(mapper, builder.chunk_size, fillvalue=None):
         logger.info("Processing batch of {} items".format(builder.chunk_size))
         chunk = await asyncio.gather(*chunk)
         processed_items = [c.result() for c in chunk if chunk is not None]
         builder.update_targets(processed_items)
+        update_items.update(len(processed_items))

From cb3e18142ec06664d1ffb98b4b9b17dfc38fa2b8 Mon Sep 17 00:00:00 2001
From: Shyam Dwaraknath <shyamd@lbl.gov>
Date: Thu, 26 Dec 2019 12:34:48 -0800
Subject: [PATCH 99/99] useless __init__ file

---
 maggma/tests/__init__.py | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 maggma/tests/__init__.py

diff --git a/maggma/tests/__init__.py b/maggma/tests/__init__.py
deleted file mode 100644
index 576f56f87..000000000
--- a/maggma/tests/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# coding: utf-8
\ No newline at end of file