Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Experimental] Introduce ImplicitBPRWrapper model #232

Merged
merged 8 commits into from
Jan 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .readthedocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ build:
pre_build:
- cp -r examples docs/source/
post_install:
- pip install --no-cache-dir poetry
- pip install --no-cache-dir poetry==1.8.5
- poetry export -f requirements.txt -o requirements.txt -E all --without-hashes
- pip install --no-cache-dir -r requirements.txt

Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

### Added
- `ImplicitBPRWrapperModel` model ([#232](https://github.com/MobileTeleSystems/RecTools/pull/232))

## [0.9.0] - 11.12.2024

Expand Down
3 changes: 2 additions & 1 deletion rectools/metrics/intersection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, Hashable, Optional, Union
from collections.abc import Hashable
blondered marked this conversation as resolved.
Show resolved Hide resolved
from typing import Dict, Optional, Union

import attr
import numpy as np
Expand Down
2 changes: 2 additions & 0 deletions rectools/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

from .ease import EASEModel
from .implicit_als import ImplicitALSWrapperModel
from .implicit_bpr import ImplicitBPRWrapperModel
from .implicit_knn import ImplicitItemKNNWrapperModel
from .popular import PopularModel
from .popular_in_category import PopularInCategoryModel
Expand All @@ -59,6 +60,7 @@
__all__ = (
"EASEModel",
"ImplicitALSWrapperModel",
"ImplicitBPRWrapperModel",
"ImplicitItemKNNWrapperModel",
"LightFMWrapperModel",
"PopularModel",
Expand Down
8 changes: 2 additions & 6 deletions rectools/models/implicit_als.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,15 @@
from implicit.cpu.als import AlternatingLeastSquares as CPUAlternatingLeastSquares
from implicit.gpu.als import AlternatingLeastSquares as GPUAlternatingLeastSquares
from implicit.utils import check_random_state
from pydantic import BeforeValidator, ConfigDict, PlainSerializer, SerializationInfo, WrapSerializer
from pydantic import BeforeValidator, ConfigDict, SerializationInfo, WrapSerializer
from scipy import sparse
from tqdm.auto import tqdm

from rectools.dataset import Dataset, Features
from rectools.exceptions import NotFittedError
from rectools.models.base import ModelConfig
from rectools.utils.misc import get_class_or_function_full_path, import_object
from rectools.utils.serialization import RandomState
from rectools.utils.serialization import DType, RandomState

from .rank import Distance
from .vector import Factors, VectorModel
Expand Down Expand Up @@ -68,10 +68,6 @@ def _serialize_alternating_least_squares_class(
),
]

DType = tpe.Annotated[
np.dtype, BeforeValidator(func=np.dtype), PlainSerializer(func=lambda dtp: dtp.name, when_used="json")
]


class AlternatingLeastSquaresConfig(tpe.TypedDict):
"""Config for implicit `AlternatingLeastSquares` model."""
Expand Down
224 changes: 224 additions & 0 deletions rectools/models/implicit_bpr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
import typing as tp
from copy import deepcopy

import numpy as np
import typing_extensions as tpe
from implicit.bpr import BayesianPersonalizedRanking

# pylint: disable=no-name-in-module
from implicit.cpu.bpr import BayesianPersonalizedRanking as CPUBayesianPersonalizedRanking
from implicit.gpu.bpr import BayesianPersonalizedRanking as GPUBayesianPersonalizedRanking

# pylint: enable=no-name-in-module
from pydantic import BeforeValidator, ConfigDict, SerializationInfo, WrapSerializer

from rectools.dataset.dataset import Dataset
from rectools.exceptions import NotFittedError
from rectools.models.base import ModelConfig
from rectools.models.rank import Distance
from rectools.models.vector import Factors, VectorModel
from rectools.utils.misc import get_class_or_function_full_path, import_object
from rectools.utils.serialization import DType, RandomState

# String alias under which the stock `implicit` BPR model is referred to in configs.
BPR_STRING = "BayesianPersonalizedRanking"

# Either the CPU or the GPU implementation of `implicit` BPR.
AnyBayesianPersonalizedRanking = tp.Union[CPUBayesianPersonalizedRanking, GPUBayesianPersonalizedRanking]
# Values accepted for the `cls` config key: a BPR model class or the literal string alias above.
BayesianPersonalizedRankingType = tp.Union[
tp.Type[AnyBayesianPersonalizedRanking], tp.Literal["BayesianPersonalizedRanking"]
]


def _get_bpr_class(spec: tp.Any) -> tp.Any:
    """
    Normalize the `cls` value of a BPR config before validation.

    Parameters
    ----------
    spec : Any
        Raw value of the `cls` key: a class, the string alias ``"BayesianPersonalizedRanking"``,
        or any other string that `import_object` can resolve.

    Returns
    -------
    Any
        ``"BayesianPersonalizedRanking"`` when `spec` equals the alias or the value that
        `get_class_or_function_full_path` gives for `implicit.bpr.BayesianPersonalizedRanking`;
        the object returned by `import_object` for any other string; `spec` itself otherwise.
    """
    default_aliases = (BPR_STRING, get_class_or_function_full_path(BayesianPersonalizedRanking))
    if spec in default_aliases:
        return "BayesianPersonalizedRanking"
    # Any other string is resolved by `import_object`; non-strings pass through untouched.
    return import_object(spec) if isinstance(spec, str) else spec


def _serialize_bpr_class(
    cls: BayesianPersonalizedRankingType, handler: tp.Callable, info: SerializationInfo
) -> tp.Union[None, str, AnyBayesianPersonalizedRanking]:
    """
    Serialize the `cls` value of a BPR config.

    Parameters
    ----------
    cls : BayesianPersonalizedRankingType
        Model class or the ``"BayesianPersonalizedRanking"`` string alias.
    handler : Callable
        Default serializer supplied by pydantic to wrap serializers. Not used here.
    info : SerializationInfo
        Serialization context; only `info.mode` is read.

    Returns
    -------
    str or class
        `BPR_STRING` for the stock CPU/GPU classes and for the alias itself.
        For any other class: the result of `get_class_or_function_full_path` in ``"json"`` mode,
        the class object unchanged in every other mode.
    """
    stock_classes = (CPUBayesianPersonalizedRanking, GPUBayesianPersonalizedRanking)
    is_stock = cls == "BayesianPersonalizedRanking" or cls in stock_classes
    if is_stock:
        return BPR_STRING
    # Custom classes are reduced to a string only in JSON mode.
    return get_class_or_function_full_path(cls) if info.mode == "json" else cls


# Annotated type of the `cls` config key:
# input is passed through `_get_bpr_class` before validation,
# output is produced by `_serialize_bpr_class` in every serialization mode (`when_used="always"`).
BayesianPersonalizedRankingClass = tpe.Annotated[
BayesianPersonalizedRankingType,
BeforeValidator(_get_bpr_class),
WrapSerializer(
func=_serialize_bpr_class,
when_used="always",
),
]


class BayesianPersonalizedRankingConfig(tpe.TypedDict):
    """
    Config for implicit `BayesianPersonalizedRanking` model.

    Every key is optional (`NotRequired`). `ImplicitBPRWrapperModel._from_config` pops `cls`
    and passes all remaining keys as keyword arguments to the selected model class.
    """

    # Model class or the "BayesianPersonalizedRanking" alias (see `BayesianPersonalizedRankingClass`).
    cls: tpe.NotRequired[BayesianPersonalizedRankingClass]
    factors: tpe.NotRequired[int]
    learning_rate: tpe.NotRequired[float]
    regularization: tpe.NotRequired[float]
    # Coerced with `np.dtype`, serialized as the dtype name in JSON mode (see `DType`).
    dtype: tpe.NotRequired[DType]
    # Filled by `_make_config` only for non-GPU models.
    num_threads: tpe.NotRequired[int]
    iterations: tpe.NotRequired[int]
    verify_negative_samples: tpe.NotRequired[bool]
    random_state: tpe.NotRequired[RandomState]
    use_gpu: tpe.NotRequired[bool]


class ImplicitBPRWrapperModelConfig(ModelConfig):
    """Config for `ImplicitBPRWrapperModel`."""

    # pydantic setting: accept field types pydantic has no built-in schema for
    # (presumably needed for the `np.dtype` / model-class values inside `model` - TODO confirm).
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Parameters of the wrapped `implicit` BPR model.
    model: BayesianPersonalizedRankingConfig


class ImplicitBPRWrapperModel(VectorModel[ImplicitBPRWrapperModelConfig]):
    """
    Wrapper for `implicit.bpr.BayesianPersonalizedRanking` model.

    The model passed to the constructor is kept untouched as a template:
    each fit trains a deep copy of it, so the wrapper can be refitted from scratch.

    See https://benfred.github.io/implicit/api/models/cpu/bpr.html for details of the base model.

    Parameters
    ----------
    model : BayesianPersonalizedRanking
        Base model to wrap. Both CPU and GPU variants are accepted.
    verbose : int, default ``0``
        Degree of verbose output. If ``0``, no output will be provided.
    """

    recommends_for_warm = False
    recommends_for_cold = False

    u2i_dist = Distance.DOT
    i2i_dist = Distance.COSINE

    config_class = ImplicitBPRWrapperModelConfig

    def __init__(self, model: AnyBayesianPersonalizedRanking, verbose: int = 0):
        # Snapshot the configuration at construction time, from the model as it was passed in.
        self._config = self._make_config(model, verbose)
        super().__init__(verbose=verbose)

        self._model = model  # for refit
        self.model: AnyBayesianPersonalizedRanking  # assigned in `_fit`

        self.use_gpu = isinstance(model, GPUBayesianPersonalizedRanking)
        if not self.use_gpu:
            # `num_threads` is read only from non-GPU models.
            self.n_threads = model.num_threads

    @classmethod
    def _make_config(cls, model: AnyBayesianPersonalizedRanking, verbose: int) -> ImplicitBPRWrapperModelConfig:
        """Build the wrapper config from the attributes of the given base model."""
        stock_classes = (CPUBayesianPersonalizedRanking, GPUBayesianPersonalizedRanking)
        # Stock implementations are stored under the string alias, custom classes as the class itself.
        model_cls = "BayesianPersonalizedRanking" if model.__class__ in stock_classes else model.__class__

        params: tp.Dict[str, tp.Any] = dict(
            cls=model_cls,
            factors=model.factors,
            learning_rate=model.learning_rate,
            dtype=None,
            regularization=model.regularization,
            iterations=model.iterations,
            verify_negative_samples=model.verify_negative_samples,
            random_state=model.random_state,
        )
        if isinstance(model, GPUBayesianPersonalizedRanking):  # pragma: no cover
            params["use_gpu"] = True
        else:
            # `dtype` and `num_threads` are taken from the model only in the non-GPU case.
            params["use_gpu"] = False
            params["dtype"] = model.dtype
            params["num_threads"] = model.num_threads

        return ImplicitBPRWrapperModelConfig(
            cls=cls,
            model=tp.cast(BayesianPersonalizedRankingConfig, params),
            verbose=verbose,
        )

    def _get_config(self) -> ImplicitBPRWrapperModelConfig:
        """Return the config captured in the constructor."""
        return self._config

    @classmethod
    def _from_config(cls, config: ImplicitBPRWrapperModelConfig) -> tpe.Self:
        """Create the base model from `config.model` and wrap it."""
        # Work on a copy so that popping `cls` does not mutate the config.
        params = deepcopy(config.model)
        factory = tp.cast(tp.Callable, params.pop("cls", BayesianPersonalizedRanking))
        if factory == BPR_STRING:
            factory = BayesianPersonalizedRanking
        return cls(model=factory(**params), verbose=config.verbose)

    def _fit(self, dataset: Dataset) -> None:
        """Train a fresh deep copy of the template model on the user-item matrix of `dataset`."""
        fresh_model = deepcopy(self._model)
        self.model = fresh_model

        interactions = dataset.get_user_item_matrix(include_weights=True).astype(np.float32)
        fresh_model.fit(interactions, show_progress=self.verbose > 0)

    def _get_users_factors(self, dataset: Dataset) -> Factors:
        user_vectors = get_users_vectors(self.model)
        return Factors(user_vectors)

    def _get_items_factors(self, dataset: Dataset) -> Factors:
        item_vectors = get_items_vectors(self.model)
        return Factors(item_vectors)

    def get_vectors(self) -> tp.Tuple[np.ndarray, np.ndarray]:
        """
        Return user and item vector representation from fitted model.

        Returns
        -------
        (np.ndarray, np.ndarray)
            User and item vectors.
            Shapes are (n_users, n_factors) and (n_items, n_factors).

        Raises
        ------
        NotFittedError
            If the model has not been fitted yet.
        """
        if self.is_fitted:
            return get_users_vectors(self.model), get_items_vectors(self.model)
        raise NotFittedError(self.__class__.__name__)


def get_users_vectors(model: AnyBayesianPersonalizedRanking) -> np.ndarray:
    """
    Extract the user factors of a BPR model as a numpy array.

    Parameters
    ----------
    model : BayesianPersonalizedRanking
        Fitted BPR model, either the CPU or the GPU variant.

    Returns
    -------
    np.ndarray
        User vectors. For a GPU model they are converted with ``to_numpy()``;
        otherwise `model.user_factors` is returned as is.
    """
    user_factors = model.user_factors
    if isinstance(model, GPUBayesianPersonalizedRanking):  # pragma: no cover
        user_factors = user_factors.to_numpy()
    return user_factors


def get_items_vectors(model: AnyBayesianPersonalizedRanking) -> np.ndarray:
    """
    Extract the item factors of a BPR model as a numpy array.

    Parameters
    ----------
    model : BayesianPersonalizedRanking
        Fitted BPR model, either the CPU or the GPU variant.

    Returns
    -------
    np.ndarray
        Item vectors. For a GPU model they are converted with ``to_numpy()``;
        otherwise `model.item_factors` is returned as is.
    """
    item_factors = model.item_factors
    if isinstance(model, GPUBayesianPersonalizedRanking):  # pragma: no cover
        item_factors = item_factors.to_numpy()
    return item_factors
6 changes: 5 additions & 1 deletion rectools/utils/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import numpy as np
import typing_extensions as tpe
from pydantic import PlainSerializer
from pydantic import BeforeValidator, PlainSerializer

FileLike = tp.Union[str, Path, tp.IO[bytes]]

Expand All @@ -37,6 +37,10 @@ def _serialize_random_state(rs: tp.Optional[tp.Union[None, int, np.random.Random
PlainSerializer(func=_serialize_random_state, when_used="json"),
]

# Annotated numpy dtype for config fields:
# the input is passed through `np.dtype` before validation,
# and in JSON mode the value is serialized as the dtype's `name`.
DType = tpe.Annotated[
np.dtype, BeforeValidator(func=np.dtype), PlainSerializer(func=lambda dtp: dtp.name, when_used="json")
]


def read_bytes(f: FileLike) -> bytes:
"""Read bytes from a file."""
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ show_column_numbers = True
disable_error_code = type-arg

[isort]
profile = black
line_length = 120
wrap_length = 120
multi_line_output = 3
Expand Down
Loading