Enable CategorifyTransform inference operator to run on int16 types (#1798)

* Enable CategorifyTransform cpp op to run on int16 types
* Add test for categorify inference op with different types
NVIDIA-Merlin · Apr 12, 2023 · ae580ad · ae580ad
1 parent 2e5a84d
commit ae580ad
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 0 deletions.
diff --git a/cpp/nvtabular/inference/categorify.cc b/cpp/nvtabular/inference/categorify.cc
@@ -93,6 +93,9 @@ namespace nvtabular
           case 'u':
             switch (dtype.itemsize())
             {
+            case 2:
+              insert_int_mapping<uint16_t>(values);
+              return;
             case 4:
               insert_int_mapping<uint32_t>(values);
               return;
@@ -104,6 +107,9 @@ namespace nvtabular
           case 'i':
             switch (dtype.itemsize())
             {
+            case 2:
+              insert_int_mapping<int16_t>(values);
+              return;
             case 4:
               insert_int_mapping<int32_t>(values);
               return;
@@ -198,6 +204,8 @@ namespace nvtabular
           case 'u':
             switch (itemsize)
             {
+            case 2:
+              return transform_int<uint16_t>(input);
             case 4:
               return transform_int<uint32_t>(input);
             case 8:
@@ -207,6 +215,8 @@ namespace nvtabular
           case 'i':
             switch (itemsize)
             {
+            case 2:
+              return transform_int<int16_t>(input);
             case 4:
               return transform_int<int32_t>(input);
             case 8:

diff --git a/tests/unit/ops/test_categorify.py b/tests/unit/ops/test_categorify.py
@@ -695,3 +695,29 @@ def test_categorify_joint_list(cpu):
 
     assert compare_a == [1, 5, 2, 3]
     assert compare_e == [2, 3, 1, 4, 1]
+
+
+def test_categorify_inference():
+    num_rows = 100
+    a_char, z_char = np.array(["a", "z"]).view("int32")
+    input_tensors = {
+        "unicode_string": np.random.randint(
+            low=a_char, high=z_char, size=num_rows * 10, dtype="int32"
+        ).view("U10"),
+        "int16_feature": np.random.randint(0, 10, dtype="int16", size=num_rows),
+        "int32_feature": np.random.randint(0, 10, dtype="int32", size=num_rows),
+        "int64_feature": np.random.randint(0, 10, dtype="int64", size=num_rows),
+        "uint16_feature": np.random.randint(0, 10, dtype="uint16", size=num_rows),
+        "uint32_feature": np.random.randint(0, 10, dtype="uint32", size=num_rows),
+        "uint64_feature": np.random.randint(0, 10, dtype="uint64", size=num_rows),
+    }
+    df = dispatch.make_df(input_tensors)
+    cat_names = df.columns
+    cats = cat_names >> nvt.ops.Categorify()
+    workflow = nvt.Workflow(cats)
+    workflow.fit(nvt.Dataset(df))
+    model_config = {}
+    inference_op = cats.op.inference_initialize(cats.input_columns, model_config)
+    output_tensors = inference_op.transform(cats.input_columns, input_tensors)
+    for key in input_tensors:
+        assert output_tensors[key].dtype == np.dtype("int64")