Clean up the way shapes are computed and specified #1760

Merged · 12 commits · Feb 15, 2023
Changes from 4 commits
31 changes: 18 additions & 13 deletions nvtabular/loader/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
#

from merlin.dtypes.shape import Shape
from merlin.schema import ColumnSchema, Tags


Expand All @@ -22,9 +23,10 @@ def _augment_schema(
cats=None,
conts=None,
labels=None,
sparse_names=None,
sparse_max=None,
sparse_as_dense=False,
padded_cols=None,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

renaming these arguments could be out-of-scope for this PR? Since it may be a breaking change for something that uses this function. It may be clearer to separate this into a different PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We couldn't really figure out what this function was supposed to do without the renames, so we went for it. The function name starts with an _ so we've appropriately signaled that external code shouldn't depend on its stability. The two places in the NVTabular that use it call it via argument order instead of specifying the names, so I understand the caution but I think we're okay.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks like it will be a safe change, and the names make things clearer :). We may consider removing this functon along with the other loader code to follow-up on the promise here in one of the upcoming releases

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And by that point we'll hopefully have updated the dataloader API to a point where this augment_schema function is no longer required either here or the copies in Transformers4Rec and in Merlin Models. The existence of this function suggests to me that there's something missing from the dataloader API as it currently exists.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the something that's missing is "transforms implemented as operators that provide schema tracking" 😺

padded_lengths=None,
pad=False,
batch_size=0
):
labels = [labels] if isinstance(labels, str) else labels
for label in labels or []:
Expand All @@ -34,21 +36,24 @@ def _augment_schema(
for label in conts or []:
schema[label] = schema[label].with_tags(Tags.CONTINUOUS)

# Set the appropriate properties for the sparse_names/sparse_max/sparse_as_dense
for col in sparse_names or []:
for col in padded_cols or []:
cs = schema[col]
properties = cs.properties
if sparse_max and col in sparse_max:
properties["value_count"] = {"max": sparse_max[col]}
if sparse_as_dense:
properties["value_count"]["min"] = properties["value_count"]["max"]
dims = Shape(((1,batch_size), None))

if not cs.shape.dims[1].is_unknown:
dims = dims.with_dim(1, cs.shape.dims[1])

if pad:
dims = dims.with_dim_min(1, padded_lengths[col])
if padded_lengths and col in padded_lengths:
dims = dims.with_dim_max(1, padded_lengths[col])

schema[col] = ColumnSchema(
name=cs.name,
tags=cs.tags,
dtype=cs.dtype,
is_list=True,
is_ragged=not sparse_as_dense,
properties=properties,
properties=cs.properties,
dims=dims
)

return schema
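As a rough sketch of what the new `dims` logic produces (a hedged illustration: it assumes the `Shape` API behaves exactly as the calls in the diff imply, and the column name and sizes are hypothetical):

```python
from merlin.dtypes.shape import Shape

batch_size = 1024
padded_lengths = {"genres": 5}  # hypothetical padded list column

# Batch dimension bounded by (1, batch_size); row length initially unknown.
dims = Shape(((1, batch_size), None))

# Bounding the row length above yields a ragged column with rows of up to 5 items.
ragged = dims.with_dim_max(1, padded_lengths["genres"])

# With pad=True the minimum is raised to match, i.e. a fixed-length (dense) column.
padded = ragged.with_dim_min(1, padded_lengths["genres"])
```

Only the calls that appear in the diff (`Shape(...)`, `with_dim`, `with_dim_min`, `with_dim_max`) are used here; nothing else about the API is assumed.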
1 change: 1 addition & 0 deletions nvtabular/loader/tensorflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ def __init__(
sparse_names,
sparse_max,
sparse_as_dense,
batch_size
)

super().__init__(
Expand Down
34 changes: 20 additions & 14 deletions nvtabular/ops/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from dask.dataframe.utils import meta_nonempty

from merlin.core.dispatch import DataFrameType, annotate
from merlin.dtypes.shape import DefaultShapes
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this going to be a new feature of dtypes in core?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shapes are implemented as a subfield of dtypes in Core, but this isn't really intended to be a feature of dtypes in particular. We might want to hide that implementation detail a bit more thoroughly by adjusting the imports.

As far as the defaults go, we just thought it was easier to read and understand than needing to remember which shapes mean what.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The DefaultShapes.LIST object seems like it could be useful, but the tests are failing to find this in core ImportError: cannot import name 'DefaultShapes' from 'merlin.dtypes.shape

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Expected since there is (or was) an outstanding Core PR that adds it

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like it's still open: NVIDIA-Merlin/core#215

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Merged now

from merlin.schema import Schema
from nvtabular.ops.operator import ColumnSelector, Operator
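For reference, a minimal sketch of what `DefaultShapes` plausibly provides: named aliases for the two most common column shapes. This is an assumption inferred from how `LIST` and `SCALAR` are used in this PR, not a copy of the Core implementation:

```python
from enum import Enum

from merlin.dtypes.shape import Shape


# Assumed definition; see NVIDIA-Merlin/core#215 for the real one.
class DefaultShapes(Enum):
    SCALAR = Shape((None,))      # one value per row, batch size unknown until runtime
    LIST = Shape((None, None))   # ragged list: unknown batch size and row length
```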

```diff
@@ -186,10 +187,7 @@ def dependencies(self):
     def _compute_dtype(self, col_schema, input_schema):
         col_schema = super()._compute_dtype(col_schema, input_schema)

-        dtype = col_schema.dtype
-        is_list = col_schema.is_list
-
-        dtypes = {
+        agg_dtypes = {
             "count": numpy.int32,
             "nunique": numpy.int32,
             "mean": numpy.float32,
@@ -199,18 +197,26 @@ def _compute_dtype(self, col_schema, input_schema):
             "sum": numpy.float32,
         }

-        is_lists = {"list": True}
+        agg = self._find_agg(col_schema, input_schema)
+        dtype = agg_dtypes.get(agg, col_schema.dtype)
+
+        return col_schema.with_dtype(dtype)
+
+    def _compute_shape(self, col_schema, input_schema):
+        agg_is_lists = {"list": True}
+
+        agg = self._find_agg(col_schema, input_schema)
+        is_list = agg_is_lists.get(agg, col_schema.is_list)

-        for col_name in input_schema.column_names:
-            combined_aggs = _aggs_for_column(col_name, self.conv_aggs)
-            combined_aggs += _aggs_for_column(col_name, self.list_aggs)
-            for agg in combined_aggs:
-                if col_schema.name.endswith(f"{self.name_sep}{agg}"):
-                    dtype = dtypes.get(agg, dtype)
-                    is_list = is_lists.get(agg, is_list)
-                    break
+        shape = DefaultShapes.LIST if is_list else DefaultShapes.SCALAR
+        return col_schema.with_shape(shape)

-        return col_schema.with_dtype(dtype, is_list=is_list, is_ragged=is_list)
+    def _find_agg(self, col_schema, input_schema):
+        input_selector = ColumnSelector(input_schema.column_names)
+        column_mapping = self.column_mapping(input_selector)
+        input_column_name = column_mapping[col_schema.name][0]
+        agg = col_schema.name.replace(input_column_name, "").lstrip(self.name_sep)
+        return agg


 def _aggs_for_column(col_name, agg_dict):
```
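To make the new `_find_agg` helper concrete, here is a standalone replay of its string logic (hedged: the column names and separator are hypothetical, and the real method derives `input_column_name` from `column_mapping` rather than taking it as a parameter):

```python
def find_agg(output_col_name: str, input_column_name: str, name_sep: str = "_") -> str:
    # Strip the input column name, then the separator, leaving only the agg suffix.
    return output_col_name.replace(input_column_name, "").lstrip(name_sep)


print(find_agg("b_sum", "b"))   # => "sum":  numpy.float32 dtype, DefaultShapes.SCALAR
print(find_agg("b_list", "b"))  # => "list": DefaultShapes.LIST
```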
**Member** (on `is_list = agg_is_lists.get(agg, col_schema.is_list)`): In the case where we fall back to the second argument of the `.get` here, we check the `col_schema.is_list` attribute. If we have a fixed list, would we want to preserve that info in the shape later on, instead of turning it into a ragged list, which is presumably the default shape corresponding to `DefaultShapes.LIST`?

**Contributor (author):** I'm not actually sure. Are there GroupBy aggregations that don't change the shape of list columns?

**Contributor (author):** From looking at the list of aggregations, I think everything changes the shape, either from list to scalar or from scalar to list.

**Contributor (author):** I wonder if the default there should actually be `False` 🤔

**Member:** In practice it seems that it may not matter whether it's `col_schema.is_list` or `False`. I tried running `Groupby` with an agg that is not `"list"` on a dataframe with list features, and we get errors in both cudf and pandas:

```python
from merlin.io import Dataset
import nvtabular as nvt
import cudf

df = cudf.DataFrame({"a": [1, 1, 2], "b": [[10], [20], [20]]})
workflow = nvt.Workflow(["a", "b"] >> nvt.ops.Groupby(groupby_cols=["a"], aggs=["sum"]))
workflow.fit_transform(Dataset(df)).compute()
# => Raises DataError: All requested aggregations are unsupported.
```

Some of these aggs, like `sum` or `mean`, could in theory work on list features if we wanted them to. Pandas, for example, handles `sum` across lists as concatenation:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [[10], [20], [20]]})
df.groupby("a").sum()
# =>
#           b
# a
# 1  [10, 20]
# 2      [20]
```

or, if the values are numpy arrays, as an element-wise sum:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 1, 2], "b": [np.array([10]), np.array([20]), np.array([20])]})
df.groupby("a").sum()
# =>
#       b
# a
# 1  [30]
# 2  [20]
```

`var`/`std`/`mean`/`median` also work and in this example return scalars. If the element type contained an array of more than one dimension, then `mean` could start returning a list type too.

Since cudf doesn't appear to support aggregating across list columns, this is probably something we don't need to be concerned about for now:

```python
import cudf

df = cudf.DataFrame({"a": [1, 1, 2], "b": [[10], [20], [20]]})
df.groupby("a").sum()
# => Raises DataError: All requested aggregations are unsupported.

df["b"].sum()
# => Raises TypeError: cannot perform sum with type list
```

**Member:** So I think `False` could be fine as the default here, and as far as this PR goes, it's no less clear than it was before. If we need groupby aggregations for list columns as a future feature of NVTabular, this will need to be revisited. I suppose even if cudf and pandas don't natively support this, we could implement it ourselves by extracting the cupy/numpy arrays from the series in our own agg function to handle list column aggregations.

**Member:** Currently it seems that the only agg supported for list columns is `list`, which results in adding one additional dimension to the shape of the original list:

```python
from merlin.io import Dataset
import nvtabular as nvt
import cudf

df = cudf.DataFrame({"a": [1, 1, 2], "b": [[10], [20], [20]]})
workflow = nvt.Workflow(["a", "b"] >> nvt.ops.Groupby(groupby_cols=["a"], aggs=["list"]))
workflow.fit_transform(Dataset(df)).compute()
# =>
#    a        b_list
# 0  1  [[10], [20]]
# 1  2        [[20]]
```

**Contributor (author):** That's great to know; I really appreciate your thoroughness in testing this out. This probably warrants a further update to the shapes here; I'll open a separate issue.

**Contributor (author):** Tracked here: #1763


14 changes: 7 additions & 7 deletions nvtabular/ops/join_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import nvtabular as nvt
from merlin.core.dispatch import DataFrameType, arange, concat_columns, read_parquet_dispatch
from merlin.dtypes.shape import DefaultShapes
from merlin.schema import Schema
from nvtabular.ops import categorify as nvt_cat
from nvtabular.ops.operator import ColumnSelector, Operator
Expand Down Expand Up @@ -241,19 +242,18 @@ def column_mapping(self, col_selector):

return column_mapping

def _compute_dtype(self, col_schema, input_schema):
new_schema = super()._compute_dtype(col_schema, input_schema)

def _compute_shape(self, col_schema, input_schema):
new_schema = super()._compute_shape(col_schema, input_schema)
shape = new_schema.shape
dtype = new_schema.dtype
is_list = new_schema.is_list


for agg in list(AGG_DTYPES.keys()):
if col_schema.name.endswith(f"{self.name_sep}{agg}"):
dtype = AGG_DTYPES.get(agg, dtype)
is_list = False
shape = DefaultShapes.SCALAR
break

return col_schema.with_dtype(dtype, is_list=is_list, is_ragged=is_list)
return col_schema.with_shape(shape)

def set_storage_path(self, new_path, copy=False):
self.categories = nvt_cat._copy_storage(self.categories, self.out_path, new_path, copy)
Expand Down
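A small illustration of the suffix check above (hedged: the `AGG_DTYPES` subset and column name are hypothetical; only the dict keys matter for the shape decision):

```python
# JoinGroupby names its outputs "<input>_<agg>"; any column ending in a known
# agg suffix collapses to a scalar shape.
AGG_DTYPES = {"count": "int32", "mean": "float32"}  # hypothetical subset
name_sep = "_"

col_name = "purchases_mean"
is_scalar = any(col_name.endswith(f"{name_sep}{agg}") for agg in AGG_DTYPES)
print(is_scalar)  # => True, so the column gets DefaultShapes.SCALAR
```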
13 changes: 12 additions & 1 deletion nvtabular/ops/list_slice.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFram

def _compute_dtype(self, col_schema, input_schema):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this method required to be overriden with this change? or could it be removed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It could be removed, but that breaks some of the tests even though the functionality works without it. (There are actually two places like this.)

col_schema = super()._compute_dtype(col_schema, input_schema)
return col_schema.with_dtype(col_schema.dtype, is_list=True, is_ragged=not self.pad)
return col_schema.with_dtype(col_schema.dtype)

def _compute_properties(self, col_schema, input_schema):
col_schema = super()._compute_properties(col_schema, input_schema)
Expand All @@ -140,6 +140,17 @@ def _compute_properties(self, col_schema, input_schema):
properties["value_count"]["min"] = self.max_elements
return col_schema.with_properties(properties)

def _compute_shape(self, col_schema, input_schema):
col_schema = super()._compute_shape(col_schema, input_schema)

min_count, max_count = (0, None)
if self.max_elements != np.iinfo(np.int64).max:
max_count = self.max_elements
if self.pad:
min_count = self.max_elements

return col_schema.with_shape((None, (min_count, max_count)))

@property
def output_tags(self):
return [Tags.LIST]
Expand Down
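A quick sketch of the shapes the new `_compute_shape` produces, mirroring the logic above as a standalone function (the operator settings are hypothetical):

```python
import numpy as np


def computed_shape(max_elements: int, pad: bool):
    # Mirrors ListSlice._compute_shape from the diff above.
    min_count, max_count = (0, None)
    if max_elements != np.iinfo(np.int64).max:
        max_count = max_elements
    if pad:
        min_count = max_elements
    return (None, (min_count, max_count))


print(computed_shape(3, pad=False))  # => (None, (0, 3)): ragged rows of up to 3 items
print(computed_shape(3, pad=True))   # => (None, (3, 3)): every row padded to exactly 3
```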
14 changes: 13 additions & 1 deletion nvtabular/ops/value_counts.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,20 @@ def transform(self, col_selector: ColumnSelector, df: DataFrameType) -> DataFram

def _compute_properties(self, col_schema, input_schema):
new_schema = super()._compute_properties(col_schema, input_schema)
stat_properties = self.stats.get(col_schema.name, {})
stat_properties = self.stats.get(col_schema.name, {"value_count": {"min": 0, "max":None}})
return col_schema.with_properties({**new_schema.properties, **stat_properties})

def _compute_shape(self, col_schema, input_schema):
new_schema = super()._compute_shape(col_schema, input_schema)

value_counts = self.stats.get(col_schema.name, {}).get("value_count", {})

min_count, max_count = (0, None)
if value_counts:
min_count = value_counts.get("min", 0)
max_count = value_counts.get("max", None)

return new_schema.with_shape((None, (min_count, max_count)))

def clear(self):
self.stats = {}
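A similar sketch for `ValueCounts`: the shape comes from the fitted `value_count` statistics, falling back to fully unknown row lengths when no stats exist (the stats dict below is hypothetical):

```python
stats = {"genres": {"value_count": {"min": 1, "max": 4}}}  # hypothetical fitted stats

value_counts = stats.get("genres", {}).get("value_count", {})
min_count, max_count = (0, None)
if value_counts:
    min_count = value_counts.get("min", 0)
    max_count = value_counts.get("max", None)

print((None, (min_count, max_count)))  # => (None, (1, 4))
```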
25 changes: 14 additions & 11 deletions tests/unit/ops/test_ops_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,35 +56,38 @@ def test_schema_out(tags, properties, selection, op):
output_schema = op.compute_output_schema(input_schema, selector)

# should have dtype float
for col_name in selector.names:
names_group = [name for name in output_schema.column_schemas if col_name in name]
if names_group:
for name in names_group:
result_schema = output_schema.column_schemas[name]
for input_col_name in selector.names:
output_col_names = [name for name in output_schema.column_schemas if input_col_name in name]
if output_col_names:
for output_col_name in output_col_names:
result_schema = output_schema.column_schemas[output_col_name]

expected_dtype = op._compute_dtype(
ColumnSchema(col_name), Schema([input_schema.column_schemas[col_name]])
ColumnSchema(output_col_name),
Schema([input_schema.column_schemas[input_col_name]]),
).dtype

expected_tags = op._compute_tags(
ColumnSchema(col_name), Schema([input_schema.column_schemas[col_name]])
ColumnSchema(output_col_name),
Schema([input_schema.column_schemas[input_col_name]]),
).tags

expected_properties = op._compute_properties(
ColumnSchema(col_name), Schema([input_schema.column_schemas[col_name]])
ColumnSchema(output_col_name),
Schema([input_schema.column_schemas[input_col_name]]),
).properties

assert result_schema.dtype == expected_dtype
if name in selector.names:
if output_col_name in selector.names:
assert result_schema.properties == expected_properties

assert len(result_schema.tags) == len(expected_tags)
else:
assert set(expected_tags).issubset(result_schema.tags)

not_used = [col for col in all_cols if col not in selector.names]
for col_name in not_used:
assert col_name not in output_schema.column_schemas
for input_col_name in not_used:
assert input_col_name not in output_schema.column_schemas


@pytest.mark.parametrize("properties", [{"p1": "1"}])
Expand Down