MobileTeleSystems · feldlime · Apr 2, 2024 · Dec 10, 2023 · Dec 11, 2023 · Feb 18, 2024
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -28,7 +28,7 @@ jobs:
         uses: actions/cache@v3
         with:
           path: .venv
-          key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
+          key: venv-${{ runner.os }}-3.8-${{ hashFiles('**/poetry.lock') }}
 
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
@@ -65,7 +65,7 @@ jobs:
         uses: actions/cache@v3
         with:
           path: .venv
-          key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
+          key: venv-${{ runner.os }}-${{ matrix.python-version }}-old-deps-${{ matrix.old-deps }}-${{ hashFiles('**/poetry.lock') }}
 
       - name: Install dependencies
         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'

diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+# MacOS
+.DS_Store
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/.pylintrc b/.pylintrc
@@ -73,6 +73,7 @@ disable=arguments-differ,
         unused-argument,
         use-implicit-booleaness-not-comparison,
         use-symbolic-message-instead,
+        abstract-method
 
 # Enable the message, report, category or checker with the given id(s).
 # You can either give multiple identifier separated by comma (,) or
@@ -446,7 +447,7 @@ max-args=15
 max-attributes=12
 
 # Maximum number of boolean expressions in an if statement (see R0916).
-max-bool-expr=2
+max-bool-expr=3
 
 # Maximum number of branch for function / method body.
 max-branches=9

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 
+## [0.6.0] - Unreleased
+
+### Added
+- Warm users/items support in `Dataset` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
+- Warm and cold users/items support in `ModelBase` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
+- Warm and cold users/items support in `cross_validate` ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
+
+### Removed
+- `return_external_ids` parameter in `recommend` and `recommend_to_items` model methods ([#77](https://github.com/MobileTeleSystems/RecTools/pull/77))
+
+
 ## [0.5.0] - 22.03.2024
 
 ### Added

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -62,6 +62,7 @@ tqdm = "^4.27.0"
 implicit = "^0.7.1"
 attrs = ">=19.1.0,<24.0.0"
 typeguard = "^2.0.1"
+typing-extensions = "4.7.1"  # TODO: remove after dropping support for python 3.7
 
 
 lightfm = {version = ">=1.16,<=1.17", optional = true}

diff --git a/rectools/dataset/dataset.py b/rectools/dataset/dataset.py
@@ -22,7 +22,7 @@
 
 from rectools import Columns
 
-from .features import AbsentIdError, DenseFeatures, Features, SparseFeatures, UnknownIdError
+from .features import AbsentIdError, DenseFeatures, Features, SparseFeatures
 from .identifiers import IdMap
 from .interactions import Interactions
 
@@ -36,8 +36,8 @@ class Dataset:
     user-item interactions, user and item features
     in special `rectools` structures for convenient future usage.
 
-    This is data class, so you can create it explicitly, but
-    it's recommended to use `construct` method.
+    WARNING: Creating a `Dataset` object directly is strongly discouraged.
+    Use the `construct` class method instead.
 
     Parameters
     ----------
@@ -59,6 +59,38 @@ class Dataset:
     user_features: tp.Optional[Features] = attr.ib(default=None)
     item_features: tp.Optional[Features] = attr.ib(default=None)
 
+    @property
+    def n_hot_users(self) -> int:
+        """
+        Return number of hot users in dataset.
+        Users with internal ids from `0` to `n_hot_users - 1` are hot (they are present in interactions).
+        Users with internal ids from `n_hot_users` to `dataset.user_id_map.size - 1` are warm
+        (they aren't present in interactions, but they have features).
+        """
+        return self.interactions.df[Columns.User].max() + 1
+
+    @property
+    def n_hot_items(self) -> int:
+        """
+        Return number of hot items in dataset.
+        Items with internal ids from `0` to `n_hot_items - 1` are hot (they are present in interactions).
+        Items with internal ids from `n_hot_items` to `dataset.item_id_map.size - 1` are warm
+        (they aren't present in interactions, but they have features).
+        """
+        return self.interactions.df[Columns.Item].max() + 1
+
+    def get_hot_user_features(self) -> tp.Optional[Features]:
+        """User features for hot users."""
+        if self.user_features is None:
+            return None
+        return self.user_features.take(range(self.n_hot_users))
+
+    def get_hot_item_features(self) -> tp.Optional[Features]:
+        """Item features for hot items."""
+        if self.item_features is None:
+            return None
+        return self.item_features.take(range(self.n_hot_items))
+
     @classmethod
     def construct(
         cls,
@@ -112,15 +144,16 @@ def construct(
         user_id_map = IdMap.from_values(interactions_df[Columns.User].values)
         item_id_map = IdMap.from_values(interactions_df[Columns.Item].values)
         interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map)
-        user_features = cls._make_features(
+
+        user_features, user_id_map = cls._make_features(
             user_features_df,
             cat_user_features,
             make_dense_user_features,
             user_id_map,
             Columns.User,
             "user",
         )
-        item_features = cls._make_features(
+        item_features, item_id_map = cls._make_features(
             item_features_df,
             cat_item_features,
             make_dense_item_features,
@@ -138,32 +171,30 @@ def _make_features(
         id_map: IdMap,
         possible_id_col: str,
         feature_type: str,
-    ) -> tp.Optional[Features]:
+    ) -> tp.Tuple[tp.Optional[Features], IdMap]:
         if df is None:
-            return None
+            return None, id_map
 
         id_col = possible_id_col if possible_id_col in df else "id"
+        id_map = id_map.add_ids(df[id_col].values, raise_if_already_present=False)
 
         if make_dense:
             try:
-                return DenseFeatures.from_dataframe(df, id_map, id_col=id_col)
-            except UnknownIdError:
-                raise ValueError(f"Some ids from {feature_type} features table not present in interactions")
+                return DenseFeatures.from_dataframe(df, id_map, id_col=id_col), id_map
             except AbsentIdError:
                 raise ValueError(
                     f"An error has occurred while constructing {feature_type} features: "
-                    "When using dense features all ids from interactions must present in features table"
+                    "When using dense features all ids from interactions must be present in features table"
                 )
             except Exception as e:  # pragma: no cover
                 raise RuntimeError(f"An error has occurred while constructing {feature_type} features: {e!r}")
+
         try:
-            return SparseFeatures.from_flatten(df, id_map, cat_features, id_col=id_col)
-        except UnknownIdError:
-            raise ValueError(f"Some ids from {feature_type} features table not present in interactions")
+            return SparseFeatures.from_flatten(df, id_map, cat_features, id_col=id_col), id_map
         except Exception as e:  # pragma: no cover
             raise RuntimeError(f"An error has occurred while constructing {feature_type} features: {e!r}")
 
-    def get_user_item_matrix(self, include_weights: bool = True) -> sparse.csr_matrix:
+    def get_user_item_matrix(self, include_weights: bool = True, include_warm: bool = False) -> sparse.csr_matrix:
         """
         Construct user-item CSR matrix based on `interactions` attribute.
 
@@ -177,14 +208,19 @@ def get_user_item_matrix(self, include_weights: bool = True) -> sparse.csr_matri
         include_weights : bool, default ``True``
              Whether include interaction weights in matrix or not.
              If False, all values in returned matrix will be equal to ``1``.
+        include_warm : bool, default ``False``
+            Whether to include warm users and items into the matrix or not.
+            Rows and columns for warm users and items will be added to the end of matrix,
+            they will contain only zeros.
 
         Returns
         -------
         csr_matrix
             Resized user-item CSR matrix
         """
         matrix = self.interactions.get_user_item_matrix(include_weights)
-        matrix.resize(self.user_id_map.internal_ids.size, self.item_id_map.internal_ids.size)
+        if include_warm:
+            matrix.resize(self.user_id_map.size, self.item_id_map.size)
         return matrix
 
     def get_raw_interactions(self, include_weight: bool = True, include_datetime: bool = True) -> pd.DataFrame:

diff --git a/rectools/dataset/features.py b/rectools/dataset/features.py
@@ -160,6 +160,10 @@ def take(self, ids: InternalIds) -> "DenseFeatures":
             names=self.names,
         )
 
+    def __len__(self) -> int:
+        """Return number of objects."""
+        return self.values.shape[0]
+
 
 SparseFeatureName = tp.Tuple[str, tp.Any]
 
@@ -442,5 +446,9 @@ def take(self, ids: InternalIds) -> "SparseFeatures":
             names=self.names,
         )
 
+    def __len__(self) -> int:
+        """Return number of objects."""
+        return self.values.shape[0]
+
 
 Features = tp.Union[DenseFeatures, SparseFeatures]
diff --git a/rectools/dataset/identifiers.py b/rectools/dataset/identifiers.py
@@ -20,6 +20,7 @@
 import attr
 import numpy as np
 import pandas as pd
+import typing_extensions as tpe
 
 from rectools import ExternalId, ExternalIds, InternalId, InternalIds
 from rectools.utils import fast_isin, get_from_series_by_index
@@ -97,6 +98,11 @@ def size(self) -> int:
         """Return number of ids in map."""
         return self.external_ids.size
 
+    @property
+    def external_dtype(self) -> tp.Type:
+        """Return dtype of external ids."""
+        return self.external_ids.dtype
+
     @property
     def to_internal(self) -> pd.Series:
         """Map internal->external."""
@@ -120,7 +126,21 @@ def get_external_sorted_by_internal(self) -> np.ndarray:
         """Return array of external ids sorted by internal ids."""
         return self.external_ids
 
-    def convert_to_internal(self, external: ExternalIds, strict: bool = True) -> np.ndarray:
+    @tp.overload
+    def convert_to_internal(  # noqa: D102
+        self, external: ExternalIds, strict: bool = ..., return_missing: tpe.Literal[False] = False
+    ) -> np.ndarray:  # pragma: no cover
+        ...
+
+    @tp.overload
+    def convert_to_internal(  # noqa: D102
+        self, external: ExternalIds, strict: bool = ..., *, return_missing: tpe.Literal[True]
+    ) -> tp.Tuple[np.ndarray, np.ndarray]:  # pragma: no cover
+        ...
+
+    def convert_to_internal(
+        self, external: ExternalIds, strict: bool = True, return_missing: bool = False
+    ) -> tp.Union[np.ndarray, tp.Tuple[np.ndarray, np.ndarray]]:
         """
         Convert any sequence of external ids to array of internal ids (map external -> internal).
 
@@ -132,21 +152,43 @@ def convert_to_internal(self, external: ExternalIds, strict: bool = True) -> np.
              Defines behaviour when some of given external ids do not exist in mapping.
                 - If ``True``, `KeyError` will be raised;
                 - If ``False``, nonexistent ids will be skipped.
+        return_missing : bool, default ``False``
+            If True, return a tuple of 2 arrays: internal ids and missing ids (that are not in map).
+            Works only if `strict` is False.
 
         Returns
         -------
         np.ndarray
             Array of internal ids.
+        np.ndarray, np.ndarray
+            Tuple of 2 arrays: internal ids and missing ids.
+            Only if `strict` is False and `return_missing` is True.
 
         Raises
         ------
         KeyError
             If some of given external ids do not exist in mapping and `strict` flag is ``True``.
+        ValueError
+            If `strict` and `return_missing` are both ``True``.
         """
-        internal = get_from_series_by_index(self.to_internal, external, strict)
-        return internal
-
-    def convert_to_external(self, internal: InternalIds, strict: bool = True) -> np.ndarray:
+        result = get_from_series_by_index(self.to_internal, external, strict, return_missing)
+        return result
+
+    @tp.overload
+    def convert_to_external(  # noqa: D102
+        self, internal: InternalIds, strict: bool = ..., return_missing: tpe.Literal[False] = False
+    ) -> np.ndarray:  # pragma: no cover
+        ...
+
+    @tp.overload
+    def convert_to_external(  # noqa: D102
+        self, internal: InternalIds, strict: bool = ..., *, return_missing: tpe.Literal[True]
+    ) -> tp.Tuple[np.ndarray, np.ndarray]:  # pragma: no cover
+        ...
+
+    def convert_to_external(
+        self, internal: InternalIds, strict: bool = True, return_missing: bool = False
+    ) -> tp.Union[np.ndarray, tp.Tuple[np.ndarray, np.ndarray]]:
         """
         Convert any sequence of internal ids to array of external ids (map internal -> external).
 
@@ -158,19 +200,27 @@ def convert_to_external(self, internal: InternalIds, strict: bool = True) -> np.
              Defines behaviour when some of given internal ids do not exist in mapping.
                 - If ``True``, `KeyError` will be raised;
                 - If ``False``, nonexistent ids will be skipped.
+        return_missing : bool, default ``False``
+            If True, return a tuple of 2 arrays: external ids and missing ids (that are not in map).
+            Works only if `strict` is False.
 
         Returns
         -------
         np.ndarray
             Array of external ids.
+        np.ndarray, np.ndarray
+            Tuple of 2 arrays: external ids and missing ids.
+            Only if `strict` is False and `return_missing` is True.
 
         Raises
         ------
         KeyError
             If some of given internal ids do not exist in mapping and `strict` flag is True.
+        ValueError
+            If `strict` and `return_missing` are both ``True``.
         """
-        external = get_from_series_by_index(self.to_external, internal, strict)
-        return external
+        result = get_from_series_by_index(self.to_external, internal, strict, return_missing)
+        return result
 
     def add_ids(self, values: ExternalIds, raise_if_already_present: bool = False) -> "IdMap":
         """

diff --git a/rectools/dataset/torch_datasets.py b/rectools/dataset/torch_datasets.py
@@ -70,15 +70,15 @@ def __init__(
     @classmethod
     def from_dataset(cls: tp.Type[DD], dataset: Dataset) -> DD:
         ui_matrix = dataset.get_user_item_matrix()
-        if dataset.item_features is not None:
-            item_features = dataset.item_features.get_sparse()
-        else:
+
+        # We take only the features of hot users/items here since this dataset is used for fit only
+        item_features = dataset.get_hot_item_features()
+        user_features = dataset.get_hot_user_features()
+        if item_features is None:
             raise AttributeError("Item features attribute of dataset could not be None")
-        if dataset.user_features is not None:
-            user_features = dataset.user_features.get_sparse()
-        else:
+        if user_features is None:
             raise AttributeError("User features attribute of dataset could not be None")
-        return cls(items=item_features, users=user_features, interactions=ui_matrix)
+        return cls(items=item_features.get_sparse(), users=user_features.get_sparse(), interactions=ui_matrix)
 
     def __len__(self) -> int:
         return self.interactions.shape[0]
@@ -114,6 +114,7 @@ def __init__(self, items: sparse.csr_matrix):
 
     @classmethod
     def from_dataset(cls: tp.Type[ID], dataset: Dataset) -> ID:
+        # We take all features here since this dataset is used for recommend only, not for fit
         if dataset.item_features is not None:
             return cls(dataset.item_features.get_sparse())
         raise AttributeError("Item features attribute of dataset could not be None")
@@ -155,6 +156,7 @@ def from_dataset(
         dataset: Dataset,
         keep_users: tp.Optional[tp.Sequence[int]] = None,
     ) -> UD:
+        # We take all features here since this dataset is used for recommend only, not for fit
         if dataset.user_features is not None:
             return cls(
                 dataset.user_features.get_sparse(),