diff --git a/fiftyone/core/collections.py b/fiftyone/core/collections.py index d75f47e45e7..093695a9e03 100644 --- a/fiftyone/core/collections.py +++ b/fiftyone/core/collections.py @@ -1403,6 +1403,8 @@ def get_field_schema( ftype=None, embedded_doc_type=None, read_only=None, + info_keys=None, + created_after=None, include_private=False, flat=False, mode=None, @@ -1418,10 +1420,16 @@ def get_field_schema( iterable of types to which to restrict the returned schema. Must be subclass(es) of :class:`fiftyone.core.odm.BaseEmbeddedDocument` - include_private (False): whether to include fields that start with - ``_`` in the returned schema read_only (None): whether to restrict to (True) or exclude (False) read-only fields. By default, all fields are included + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. + include_private (False): whether to include fields that start with + ``_`` in the returned schema flat (False): whether to return a flattened schema where all embedded document fields are included as top-level keys mode (None): whether to apply the above constraints before and/or @@ -1440,6 +1448,8 @@ def get_frame_field_schema( ftype=None, embedded_doc_type=None, read_only=None, + info_keys=None, + created_after=None, include_private=False, flat=False, mode=None, @@ -1458,6 +1468,12 @@ def get_frame_field_schema( :class:`fiftyone.core.odm.BaseEmbeddedDocument` read_only (None): whether to restrict to (True) or exclude (False) read-only fields. By default, all fields are included + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. include_private (False): whether to include fields that start with ``_`` in the returned schema flat (False): whether to return a flattened schema where all diff --git a/fiftyone/core/dataset.py b/fiftyone/core/dataset.py index 67c9a6671eb..0f12384edb1 100644 --- a/fiftyone/core/dataset.py +++ b/fiftyone/core/dataset.py @@ -1330,6 +1330,8 @@ def get_field_schema( ftype=None, embedded_doc_type=None, read_only=None, + info_keys=None, + created_after=None, include_private=False, flat=False, mode=None, @@ -1347,6 +1349,12 @@ def get_field_schema( :class:`fiftyone.core.odm.BaseEmbeddedDocument` read_only (None): whether to restrict to (True) or exclude (False) read-only fields. By default, all fields are included + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. include_private (False): whether to include fields that start with ``_`` in the returned schema flat (False): whether to return a flattened schema where all @@ -1364,6 +1372,8 @@ def get_field_schema( ftype=ftype, embedded_doc_type=embedded_doc_type, read_only=read_only, + info_keys=info_keys, + created_after=created_after, include_private=include_private, flat=flat, mode=mode, @@ -1374,6 +1384,8 @@ def get_frame_field_schema( ftype=None, embedded_doc_type=None, read_only=None, + info_keys=None, + created_after=None, include_private=False, flat=False, mode=None, @@ -1393,6 +1405,12 @@ def get_frame_field_schema( :class:`fiftyone.core.odm.BaseEmbeddedDocument` read_only (None): whether to restrict to (True) or exclude (False) read-only fields. By default, all fields are included + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. include_private (False): whether to include fields that start with ``_`` in the returned schema flat (False): whether to return a flattened schema where all @@ -1413,6 +1431,8 @@ def get_frame_field_schema( ftype=ftype, embedded_doc_type=embedded_doc_type, read_only=read_only, + info_keys=info_keys, + created_after=created_after, include_private=include_private, flat=flat, mode=mode, @@ -1628,6 +1648,267 @@ def add_frame_field( if expanded: self._reload() + def add_frame_rollup_field( + self, + frame_path, + rollup_path=None, + include_counts=False, + overwrite=True, + create_index=True, + ): + """Populates a sample-level field on each sample in the given video + dataset that records the unique values that appear in the frame-level + field across all frames of that sample. + + This method is useful for generating a sample-level rollup that can be + efficiently queried to retrieve samples that contain specific values of + interest in at least one frame. + + Examples:: + + import fiftyone as fo + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset("quickstart-video") + + # Populate string lists of observed values + dataset.add_frame_rollup_field( + "frames.detections.detections.label", + "rollup_labels" + ) + dataset.add_frame_rollup_field("frames.detections.detections.confidence") + + # Populate `Classifications` that record the observed values and their counts + dataset.add_frame_rollup_field( + "frames.detections.detections.label", + "rollup_label_counts", + mode=1 + ) + dataset.add_frame_rollup_field( + "frames.detections.detections.confidence", + mode=1 + ) + + print(dataset.list_frame_rollup_fields()) + + + Args: + frame_path: the frame field to rollup + rollup_path (None): the sample-level field in which to store the + rollup results. If ``None``, a default field name will be + chosen in this form: rollup_{type}_{embedded_frame_field_name} + include_counts (False): determines the format to populate + ``rollup_path`` with: + - ``False``: store a list of unique observed values on each + sample + - ``True``: store the results in a + :class:`fiftyone.core.labels.Classifications` field whose + ``label`` and ``count`` attributes encode the observed + values and their counts, respectively. Only applicable when + ``frame_path`` contains string values + overwrite (True): whether to overwrite any existing rollup field + with the same name + create_index (True): whether to create a database index for the + rollup field so it can be quickly filtered on + + Raises: + ValueError: If ``frame_path`` is not a valid frame field + """ + frame_rollup_key = "_frame_rollup" + has_frames = self._has_frame_fields() + if not has_frames: + raise ValueError("Dataset does not contain videos") + + inpath, is_frame_field, list_fields, _, _ = self._parse_field_name( + frame_path + ) + + if not is_frame_field or inpath not in self.get_frame_field_schema( + flat=True + ): + raise ValueError("frame_path must be a valid frame field") + + # Format: rollup_[list|counts]_{embedded_frame_field} + if rollup_path is None: + rollup_path = "rollup_" + rollup_path += "counts_" if include_counts else "list_" + rollup_path += inpath.replace(".", "_") + + # Delete old field if overwriting + _field = self.get_field(rollup_path) + if _field is not None: + if overwrite and "_frame_rollup" in _field.info: + self.delete_frame_rollup_field(rollup_path) + else: + raise ValueError(f"Field '{rollup_path}' already exists") + + # Create sample field up front + info = {frame_rollup_key: inpath} + if include_counts: + self.add_sample_field( + rollup_path, + fo.EmbeddedDocumentField, + embedded_doc_type=fo.Classifications, + info=info, + ) + else: + self.add_sample_field(rollup_path, fo.ListField, info=info) + + # Optionally create mongodb index on rollup field + if create_index: + self.create_index(rollup_path) + + # Create pipeline depending on mode we've chosen + pipeline = [ + {"$unwind": "$frames"}, + {"$replaceRoot": {"newRoot": "$frames"}}, + ] + + if list_fields: + pipeline.append({"$unwind": "$" + list_fields[0]}) + + if include_counts: + # Values plus counts: + # 1. Group by (_sample_id, field value) and aggregate the count + # 2. Remove nulls + # 3. Group by _sample_id and push all counts to a list + pipeline.extend( + [ + { + "$group": { + "_id": { + "sample": "$_sample_id", + "value": "$" + inpath, + }, + "count": {"$sum": 1}, + }, + }, + {"$match": {"$expr": {"$gt": ["$_id.value", None]}}}, + { + "$group": { + "_id": "$_id.sample", + "result": { + "$push": {"k": "$_id.value", "v": "$count"} + }, + }, + }, + {"$set": {"result": {"$arrayToObject": "$result"}}}, + ] + ) + else: + # Values only: + # 1. Group by _sample_id, add all values seen to a set + # 2. Remove nulls + pipeline.extend( + [ + { + "$group": { + "_id": "$_sample_id", + "values": {"$addToSet": "$" + inpath}, + }, + }, + { + "$project": { + "values": { + "$filter": { + "input": "$values", + "cond": {"$gt": ["$$this", None]}, + } + } + } + }, + ] + ) + + results = self._aggregate(pipeline=pipeline, attach_frames=True) + + # Parse and set values to field + if include_counts: + labels = { + str(r["_id"]): fo.Classifications( + classifications=[ + fo.Classification(label=k, count=v) + for k, v in r["result"].items() + ] + ) + for r in results + } + else: + labels = {str(r["_id"]): list(r["values"]) for r in results} + + self.set_values(rollup_path, labels, key_field="id") + + # Now lock this field as readonly. + rollup_field = self.get_field(rollup_path) + _set_field_read_only(rollup_field, True) + rollup_field.save() + + def delete_frame_rollup_field(self, rollup_path): + """Deletes frame rollup field + + Examples:: + + import fiftyone as fo + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset("quickstart-video") + + dataset.add_frame_rollup_field( + "frames.detections.detections.label", + "rollup_labels" + ) + + dataset.delete_frame_rollup_field("rollup_labels") + + Args: + rollup_path: path to rollup + + Raises: + ValueError: If rollup_path is not a frame rollup field + """ + if rollup_path not in self.list_frame_rollup_fields(): + raise ValueError("Path is not a valid frame rollup field") + + field = self.get_field(rollup_path) + _set_field_read_only(field, False) + field.save() + self.delete_sample_field(rollup_path) + + def check_frame_rollup_fields(self): + """Returns a list of frame rollup fields that could be out of sync + with their target frame field. + + Inclusion in this list is a heuristic, not a guarantee. However, there + are no false negatives. + + Returns: + list of frame rollup fields + """ + frames_schema = self.get_frame_field_schema(flat=True) + frame_rollup_schema = self.get_field_schema( + info_keys=["_frame_rollup"] + ) + + _, last_frame_mod = self.bounds("frames.last_modified_at") + + return [ + rollup_field.name + for rollup_field in frame_rollup_schema.values() + if ( + rollup_field.info["_frame_rollup"] not in frames_schema + or rollup_field.created_at > last_frame_mod + ) + ] + + def list_frame_rollup_fields(self): + """Lists frame rollup fields created via + :meth:`Dataset.add_frame_rollup_field` + + Use :meth:`Dataset.delete_frame_rollup_field` to delete these fields, + or :meth:`Dataset.get_field` to get information about these fields. + """ + return sorted(list(self.get_field_schema(info_keys="_frame_rollup"))) + def _add_implied_frame_field( self, field_name, value, dynamic=False, validate=True ): @@ -9614,8 +9895,9 @@ def _handle_nested_fields(schema): def _set_field_read_only(field_doc, read_only): field_doc.read_only = read_only - for _field_doc in field_doc.fields: - _set_field_read_only(_field_doc, read_only) + if hasattr(field_doc, "fields"): + for _field_doc in field_doc.fields: + _set_field_read_only(_field_doc, read_only) def _extract_archive_if_necessary(archive_path, cleanup): diff --git a/fiftyone/core/fields.py b/fiftyone/core/fields.py index 2aa181c74e2..a0c2f047cb5 100644 --- a/fiftyone/core/fields.py +++ b/fiftyone/core/fields.py @@ -23,7 +23,13 @@ import fiftyone.core.utils as fou -def validate_constraints(ftype=None, embedded_doc_type=None, read_only=None): +def validate_constraints( + ftype=None, + embedded_doc_type=None, + read_only=None, + info_keys=None, + created_after=None, +): """Validates the given field constraints. Args: @@ -34,6 +40,12 @@ def validate_constraints(ftype=None, embedded_doc_type=None, read_only=None): :class:`fiftyone.core.odm.BaseEmbeddedDocument` read_only (None): whether to optionally enforce that the field is read-only (True) or not read-only (False) + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. Returns: True/False whether any constraints were provided @@ -86,11 +98,26 @@ def validate_constraints(ftype=None, embedded_doc_type=None, read_only=None): if not isinstance(read_only, bool): raise ValueError("read_only must be a boolean") + if info_keys is not None: + has_contraints = True + if not isinstance(info_keys, (list, str)): + raise ValueError("info_keys must be a single or a list of str's") + + if created_after is not None: + has_contraints = True + if not isinstance(created_after, datetime): + raise ValueError("created_after must be a datetime") + return has_contraints def matches_constraints( - field, ftype=None, embedded_doc_type=None, read_only=None + field, + ftype=None, + embedded_doc_type=None, + read_only=None, + info_keys=None, + created_after=None, ): """Determines whether the field matches the given constraints. @@ -103,6 +130,12 @@ def matches_constraints( :class:`fiftyone.core.odm.BaseEmbeddedDocument` read_only (None): whether to optionally enforce that the field is read-only (True) or not read-only (False) + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. Returns: True/False @@ -123,8 +156,20 @@ def matches_constraints( ): return False - if read_only is not None: - return read_only == field.read_only + if read_only is not None and read_only != field.read_only: + return False + + if info_keys is not None: + key_set = {info_keys} if isinstance(info_keys, str) else set(info_keys) + if field.info is None or not set(field.info.keys()).issubset(key_set): + return False + + if ( + created_after is not None + and field.created_at is not None + and field.created_at > created_after + ): + return False return True @@ -226,6 +271,8 @@ def filter_schema( ftype=None, embedded_doc_type=None, read_only=None, + info_keys=None, + created_after=None, include_private=False, flat=False, mode=None, @@ -243,6 +290,12 @@ def filter_schema( :class:`fiftyone.core.odm.BaseEmbeddedDocument` read_only (None): whether to restrict to (True) or exclude (False) read-only fields. By default, all fields are included + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. include_private (False): whether to include fields that start with ``_`` in the returned schema flat (False): whether to return a flattened schema where all @@ -259,6 +312,8 @@ def filter_schema( ftype=ftype, embedded_doc_type=embedded_doc_type, read_only=read_only, + info_keys=info_keys, + created_after=created_after, ) if has_contraints: @@ -266,6 +321,8 @@ def filter_schema( ftype=ftype, embedded_doc_type=embedded_doc_type, read_only=read_only, + info_keys=info_keys, + created_after=created_after, ) else: kwargs = {} @@ -314,6 +371,8 @@ def flatten_schema( ftype=None, embedded_doc_type=None, read_only=None, + info_keys=None, + created_after=None, include_private=False, ): """Returns a flat version of the given schema where all embedded document @@ -329,6 +388,12 @@ def flatten_schema( subclass(es) of :class:`fiftyone.core.odm.BaseEmbeddedDocument` read_only (None): whether to restrict to (True) or exclude (False) read-only fields. By default, all fields are included + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. include_private (False): whether to include fields that start with ``_`` in the returned schema @@ -339,6 +404,8 @@ def flatten_schema( ftype=ftype, embedded_doc_type=embedded_doc_type, read_only=read_only, + info_keys=info_keys, + created_after=created_after, ) _schema = {} @@ -351,6 +418,8 @@ def flatten_schema( ftype, embedded_doc_type, read_only, + info_keys, + created_after, include_private, ) @@ -365,6 +434,8 @@ def _flatten( ftype, embedded_doc_type, read_only, + info_keys, + created_after, include_private, ): if not include_private and name.startswith("_"): @@ -380,6 +451,8 @@ def _flatten( ftype=ftype, embedded_doc_type=embedded_doc_type, read_only=read_only, + info_keys=info_keys, + created_after=created_after, ): schema[prefix] = field @@ -396,6 +469,8 @@ def _flatten( ftype, embedded_doc_type, read_only, + info_keys, + created_after, include_private, ) diff --git a/fiftyone/core/odm/mixins.py b/fiftyone/core/odm/mixins.py index 2723b7ca339..d9623409b96 100644 --- a/fiftyone/core/odm/mixins.py +++ b/fiftyone/core/odm/mixins.py @@ -179,6 +179,8 @@ def get_field_schema( ftype=None, embedded_doc_type=None, read_only=None, + info_keys=None, + created_after=None, include_private=False, flat=False, mode=None, @@ -198,6 +200,12 @@ def get_field_schema( :class:`fiftyone.core.odm.BaseEmbeddedDocument` read_only (None): whether to restrict to (True) or exclude (False) read-only fields. By default, all fields are included + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. include_private (False): whether to include fields that start with ``_`` in the returned schema flat (False): whether to return a flattened schema where all @@ -221,6 +229,8 @@ def get_field_schema( ftype=ftype, embedded_doc_type=embedded_doc_type, read_only=read_only, + info_keys=info_keys, + created_after=created_after, include_private=include_private, flat=flat, mode=mode, diff --git a/fiftyone/core/view.py b/fiftyone/core/view.py index a3bba994131..bc3788cd81b 100644 --- a/fiftyone/core/view.py +++ b/fiftyone/core/view.py @@ -915,6 +915,8 @@ def get_field_schema( ftype=None, embedded_doc_type=None, read_only=None, + info_keys=None, + created_after=None, include_private=False, flat=False, mode=None, @@ -932,6 +934,12 @@ def get_field_schema( :class:`fiftyone.core.odm.BaseEmbeddedDocument` read_only (None): whether to restrict to (True) or exclude (False) read-only fields. By default, all fields are included + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. include_private (False): whether to include fields that start with ``_`` in the returned schema flat (False): whether to return a flattened schema where all @@ -956,6 +964,8 @@ def get_field_schema( ftype=ftype, embedded_doc_type=embedded_doc_type, read_only=read_only, + info_keys=info_keys, + created_after=created_after, include_private=include_private, flat=flat, mode=mode, @@ -966,6 +976,8 @@ def get_frame_field_schema( ftype=None, embedded_doc_type=None, read_only=None, + info_keys=None, + created_after=None, include_private=False, flat=False, mode=None, @@ -985,6 +997,12 @@ def get_frame_field_schema( :class:`fiftyone.core.odm.BaseEmbeddedDocument` read_only (None): whether to restrict to (True) or exclude (False) read-only fields. By default, all fields are included + info_keys (None): an optional key or list of keys that must be in + a field's ``info`` dict in order for it to be included in the + returned schema. If ``None``, no filtering is performed. + created_after (None): an optional ``datetime`` to filter the + returned schema by, such that the field was `created_at` after + this time. If ``None``, no filtering is performed. include_private (False): whether to include fields that start with ``_`` in the returned schema flat (False): whether to return a flattened schema where all @@ -1012,6 +1030,8 @@ def get_frame_field_schema( ftype=ftype, embedded_doc_type=embedded_doc_type, read_only=read_only, + info_keys=info_keys, + created_after=created_after, include_private=include_private, flat=flat, mode=mode, diff --git a/tests/unittests/video_tests.py b/tests/unittests/video_tests.py index 2680a8f75ee..b6f0d222475 100644 --- a/tests/unittests/video_tests.py +++ b/tests/unittests/video_tests.py @@ -3938,6 +3938,9 @@ def test_clips_save_context(self): self.assertEqual(dataset.count("events.detections.foo"), 3) self.assertEqual(dataset.count("frames.foo"), 5) + def test_frame_field_rollup(self): + ... + if __name__ == "__main__": fo.config.show_progress_bars = False