diff --git a/CHANGELOG.md b/CHANGELOG.md index 312525ae..15386cbe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,18 +4,29 @@ All notable changes to this project will be documented in this file ## [v0.3.0] - 2020-06-30 ### Changed +- early stopping strategies +- offline recommendation files evaluation (ProxyRecommender, RecommendationFolder) - negative sampling evaluation +- improved Microsoft Windows compatibility - binarization of explicit dataset -incremental splitting +- automatic loading of implicit datasets +- multiple prefiltering - managing side information with modular loaders -adding and fixing visual recsys method: +- alignment of side information with training data +- improved Documentation: Model creation, Side Information loading, Early Stopping, Negative Sampling +- added nDCG as formulated in Rendle's 2020 KDD paper +- visual loader with TensorFlow pipeline +- added and fixed visual recsys methods: - DVBPR - VBPR - DeepStyle - ACF - VNPR -- early stopping strategies -adding new recommender method +- added new recommender methods - MF (Rendle's 2020 RecSys reproducibility paper) - EASER - RP3beta - iALS ## [v0.2.1] - 2020-03-27 ### Changed diff --git a/external/models/DistMult/DistMult.py b/external/models/DistMult/DistMult.py deleted file mode 100644 index 02cd757a..00000000 --- a/external/models/DistMult/DistMult.py +++ /dev/null @@ -1,108 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -from tqdm import tqdm - -from .triple_sampler import TripleSampler as TS -from elliot.recommender import BaseRecommenderModel -from elliot.recommender.base_recommender_model import init_charger -from .DistMult_model import DistMultModel -from elliot.recommender.recommender_utils_mixin import RecMixin - - -class DistMult(RecMixin, BaseRecommenderModel): - r""" - - """ - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - """ - - """ - ###################################### - - self._params_list = [ - ("_learning_rate", "lr", "lr", 0.1, None, None), - ("_factors", "factors", "f", 100, int, None), - ("_F2", "F2", "F2", 0, None, None), - ("_N3", "N3", "N3", 0, None, None), - ("_corruption", "corruption", "c", "so", None, None), - ("_input_type", "input_type", "intype", "standard", None, None), - ("_blackbox_lambda", "blackbox_lambda", "bl", 0, None, None), - ("_mask", "mask", "mask", False, None, None), - ("_loader", "loader", "load", "KGCompletion", None, None), - ] - self.autoset_params() - - self._ratings = self._data.train_dict - - self._side = getattr(self._data.side_information, self._loader, None) - - self._sampler = TS(self._side, self._seed) - - if self._batch_size < 1: - self._batch_size = self._num_users - - self._transactions_per_epoch = self._side.Xs.shape[0] - - self._model = DistMultModel(self._side, - self._learning_rate, - self._factors, - self._F2, - self._N3, - self._corruption, - self._input_type, - self._blackbox_lambda, - self._mask, - self._seed) - - @property - def name(self): - return "DistMult" \ - + f"_{self.get_base_params_shortcut()}" \ - + f"_{self.get_params_shortcut()}" - - def train(self): - if self._restore: - return self.restore_weights() - - for it in self.iterate(self._epochs): - loss = 0 - steps = 0 - with tqdm(total=int(self._transactions_per_epoch // self._batch_size), disable=not self._verbose) as t: - for batch in self._sampler.step(self._batch_size): - steps += 1 - loss += 
self._model.train_step(batch) - t.set_postfix({'loss': f'{loss.numpy() / steps:.5f}'}) - t.update() - - # self.evaluate(it, loss.numpy()/(it + 1)) - - # for it in self.iterate(self._epochs): - # loss = 0 - # steps = 0 - # with tqdm(total=int(self._transactions_per_epoch // self._batch_size), disable=not self._verbose) as t: - # for batch in self._sampler.step(self._transactions_per_epoch, self._batch_size): - # steps += 1 - # loss += self._model.train_step(batch) - # t.set_postfix({'loss': f'{loss.numpy() / steps:.5f}'}) - # t.update() - # - # self.evaluate(it, loss.numpy()/(it + 1)) - - def get_recommendations(self, k: int = 100): - predictions_top_k_test = {} - predictions_top_k_val = {} - for index, offset in enumerate(range(0, self._num_users, self._batch_size)): - offset_stop = min(offset + self._batch_size, self._num_users) - predictions = self._model.predict_batch(offset, offset_stop) - recs_val, recs_test = self.process_protocol(k, predictions, offset, offset_stop) - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - return predictions_top_k_val, predictions_top_k_test diff --git a/external/models/DistMult/DistMult_model.py b/external/models/DistMult/DistMult_model.py deleted file mode 100644 index 5675c84c..00000000 --- a/external/models/DistMult/DistMult_model.py +++ /dev/null @@ -1,174 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo, Daniele Malitesta' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it, daniele.malitesta@poliba.it' - -import os - -import numpy as np -import tensorflow as tf -from tensorflow import keras -import typing as t - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' - - -class DistMultModel(keras.Model): - - def __init__(self, - side, - learning_rate, - factors, - F2, - N3, - corruption, - input_type, - blackbox_lambda, - mask, - random_seed=42, - name="NNBPRMF", - **kwargs): - super().__init__(name=name, **kwargs) - tf.random.set_seed(random_seed) - - self.side = side - self.learning_rate = learning_rate - self.factors = factors - self.F2 = F2 - self.N3 = N3 - self.corruption = corruption - self.input_type = input_type - self.blackbox_lambda = blackbox_lambda - self.mask = mask - init_size = 1e-3 - - self.initializer = tf.initializers.GlorotUniform() - - self.entity_embeddings = keras.layers.Embedding(input_dim=self.side.nb_entities, output_dim=self.factors, - embeddings_initializer=self.initializer, - # embeddings_regularizer=keras.regularizers.l2(self.l_w), - trainable=True, dtype=tf.float32) - self.predicate_embeddings = keras.layers.Embedding(input_dim=self.side.nb_predicates, output_dim=self.factors, - embeddings_initializer=self.initializer, - # embeddings_regularizer=keras.regularizers.l2(self.l_w), - trainable=True, dtype=tf.float32) - - self.entity_embeddings(0) - self.predicate_embeddings(0) - - # TODO: Scale operation multiplying the embeddings with init_size - - self.optimizer = tf.optimizers.Adam(self.learning_rate) - if self.blackbox_lambda is None: - self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy() - else: - #TODO: NegativeMRR(lambda=blackbox_lambda) - self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy() - - #TODO: masks - - @tf.function - def score(self, - rel: tf.Tensor, - arg1: tf.Tensor, - arg2: tf.Tensor, - *args, **kwargs) -> tf.Tensor: - # [B] - #TODO: check the axis and the dimensions - res = tf.reduce_sum(rel * arg1 * arg2, 1) - return res - - # @tf.function - def call(self, - rel: 
t.Optional[tf.Tensor] = None, - arg1: t.Optional[tf.Tensor] = None, - arg2: t.Optional[tf.Tensor] = None, - entity_embeddings: t.Optional[tf.Tensor] = None, - predicate_embeddings: t.Optional[tf.Tensor] = None, - *args, **kwargs) -> tf.Tensor: - # [N, E] - ent_emb = self.entity_embeddings if self.entity_embeddings is not None else entity_embeddings - pred_emb = self.predicate_embeddings if self.predicate_embeddings is not None else predicate_embeddings - - assert ((1 if rel is None else 0) + (1 if arg1 is None else 0) + (1 if arg2 is None else 0)) == 1 - - # [B] Tensor - scores = None - - # [B, N] = [B, E] @ [E, N] - if rel is None: - scores = (arg1 * arg2) @ tf.transpose(ent_emb.weights[0]) - elif arg1 is None: - scores = (rel * arg2) @ tf.transpose(pred_emb.weights[0]) - elif arg2 is None: - scores = (rel * arg1) @ tf.transpose(ent_emb.weights[0]) - - assert scores is not None - - return scores - - # @tf.function - def train_step(self, batch): - with tf.GradientTape() as tape: - xp_batch, xs_batch, xo_batch, xi_batch = batch - - xp_batch_emb = self.predicate_embeddings(xp_batch) - xs_batch_emb = self.entity_embeddings(xs_batch) - xo_batch_emb = self.entity_embeddings(xo_batch) - - loss = 0.0 - - if 's' in self.corruption: - po_scores = self.call(xp_batch_emb, None, xo_batch_emb) - # if self.mask is True: - # po_scores = po_scores + mask_po[xi_batch, :] - - loss += self.loss_function(xs_batch, po_scores) - - if 'o' in self.corruption: - sp_scores = self.call(xp_batch_emb, xs_batch_emb, None) - # if self.mask is True: - # sp_scores = sp_scores + mask_sp[xi_batch, :] - - loss += self.loss_function(xo_batch, sp_scores) - - if 'p' in self.corruption: - so_scores = self.call(None, xs_batch_emb, xo_batch_emb) - # if elf.mask is True: - # so_scores = so_scores + mask_so[xi_batch, :] - - loss += self.loss_function(xp_batch, so_scores) - - factors = [e for e in [xp_batch_emb, xs_batch_emb, xo_batch_emb]] - - # TODO: F2 and N3 regularization - # if self.F2 is not None: - # loss += self.F2 * F2_reg(factors) - # - # if self.N3 is not None: - # loss += self.N3 * N3_reg(factors) - - grads = tape.gradient(loss, self.trainable_weights) - self.optimizer.apply_gradients(zip(grads, self.trainable_weights)) - - return loss - - # @tf.function - # def predict_batch(self, start, stop): - # return tf.transpose(self.item_bias_embedding.weights[0]) + tf.matmul(self.user_embedding.weights[0][start:stop], self.item_embedding.weights[0], transpose_b=True) - - @tf.function - def predict(self, inputs, training=False, **kwargs): - logits, _ = self.call(inputs=inputs, training=True) - return logits - - @tf.function - def get_top_k(self, preds, train_mask, k=100): - return tf.nn.top_k(tf.where(train_mask, preds, -np.inf), k=k, sorted=True) - - def get_config(self): - raise NotImplementedError diff --git a/external/models/DistMult/__init__.py b/external/models/DistMult/__init__.py deleted file mode 100644 index 8c446dc2..00000000 --- a/external/models/DistMult/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .DistMult import DistMult \ No newline at end of file diff --git a/external/models/DistMult/triple_sampler.py b/external/models/DistMult/triple_sampler.py deleted file mode 100644 index b599b740..00000000 --- a/external/models/DistMult/triple_sampler.py +++ /dev/null @@ -1,32 +0,0 @@ - -import numpy as np -import random -from typing import Tuple -from types import SimpleNamespace - - -class TripleSampler: - def __init__(self, - side: SimpleNamespace, - random_seed: int = 42) -> None: - self.random_state = 
random.Random(random_seed) - self.Xs = side.Xs - self.Xp = side.Xp - self.Xo = side.Xo - - self.Xi = np.arange(start=0, stop=self.Xs.shape[0], dtype=np.int32) - - assert np.allclose(self.Xs.shape, self.Xp.shape) - assert np.allclose(self.Xs.shape, self.Xo.shape) - - self.nb_examples = self.Xs.shape[0] - - def step(self, batch_size: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: - curriculum_order = self.random_state.sample(range(self.nb_examples), self.nb_examples) - - for start_idx in range(0, self.nb_examples, batch_size): - end_idx = min(start_idx + batch_size, self.nb_examples) - yield self.Xp[curriculum_order[start_idx: end_idx]], \ - self.Xs[curriculum_order[start_idx: end_idx]], \ - self.Xo[curriculum_order[start_idx: end_idx]], \ - self.Xi[curriculum_order[start_idx: end_idx]], diff --git a/external/models/EASE_R/__init__.py b/external/models/EASE_R/__init__.py deleted file mode 100644 index 82668ac9..00000000 --- a/external/models/EASE_R/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .ease_r import EASER \ No newline at end of file diff --git a/external/models/EASE_R/ease_r.py b/external/models/EASE_R/ease_r.py deleted file mode 100644 index af0b0aa4..00000000 --- a/external/models/EASE_R/ease_r.py +++ /dev/null @@ -1,135 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import time - -import numpy as np -from sklearn.utils.extmath import safe_sparse_dot - -from elliot.recommender.base_recommender_model import BaseRecommenderModel -from elliot.recommender.base_recommender_model import init_charger -from elliot.recommender.recommender_utils_mixin import RecMixin - -np.random.seed(42) - - -class EASER(RecMixin, BaseRecommenderModel): - - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - self._random = np.random - - self._params_list = [ - ("_neighborhood", "neighborhood", "neighborhood", -1, int, None), - ("_l2_norm", "l2_norm", "l2_norm", 1e3, float, None) - ] - - self.autoset_params() - if self._neighborhood == -1: - self._neighborhood = self._data.num_items - - @property - def name(self): - return f"EASER_{self.get_params_shortcut()}" - - def get_recommendations(self, k: int = 10): - predictions_top_k_val = {} - predictions_top_k_test = {} - - recs_val, recs_test = self.process_protocol(k) - - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - - return predictions_top_k_val, predictions_top_k_test - - def get_single_recommendation(self, mask, k): - return {u: self.get_user_predictions(u, mask, k) for u in self._data.train_dict.keys()} - - def get_user_predictions(self, user_id, mask, top_k=10): - user_id = self._data.public_users.get(user_id) - b = self._preds[user_id] - a = mask[user_id] - b[~a] = -np.inf - indices, values = zip(*[(self._data.private_items.get(u_list[0]), u_list[1]) - for u_list in enumerate(b.data)]) - - indices = np.array(indices) - values = np.array(values) - local_k = min(top_k, len(values)) - partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - real_values = values[partially_ordered_preds_indices] - real_indices = indices[partially_ordered_preds_indices] - local_top_k = real_values.argsort()[::-1] - return [(real_indices[item], real_values[item]) for item in local_top_k] - - def train(self): - if self._restore: - return self.restore_weights() - - - start = time.time() - - # self._train = 
normalize(self._data.sp_i_train_ratings, norm='l2', axis=1) - # self._train = normalize(self._train, norm='l2', axis=0) - - self._train = self._data.sp_i_train_ratings - - self._similarity_matrix = safe_sparse_dot(self._train.T, self._train, dense_output=True) - - diagonal_indices = np.diag_indices(self._similarity_matrix.shape[0]) - item_popularity = np.ediff1d(self._train.tocsc().indptr) - self._similarity_matrix[diagonal_indices] = item_popularity + self._l2_norm - - P = np.linalg.inv(self._similarity_matrix) - - self._similarity_matrix = P / (-np.diag(P)) - - self._similarity_matrix[diagonal_indices] = 0.0 - - end = time.time() - print(f"The similarity computation has taken: {end - start}") - - # data, rows_indices, cols_indptr = [], [], [] - # - # column_row_index = np.arange(len(self._data.items), dtype=np.int32) - # - # for item_idx in range(len(self._data.items)): - # cols_indptr.append(len(data)) - # column_data = self._similarity_matrix[:, item_idx] - # - # non_zero_data = column_data != 0 - # - # idx_sorted = np.argsort(column_data[non_zero_data]) # sort by column - # top_k_idx = idx_sorted[-self._neighborhood:] - # - # data.extend(column_data[non_zero_data][top_k_idx]) - # rows_indices.extend(column_row_index[non_zero_data][top_k_idx]) - # - # cols_indptr.append(len(data)) - # - # W_sparse = sparse.csc_matrix((data, rows_indices, cols_indptr), - # shape=(len(self._data.items), len(self._data.items)), dtype=np.float32).tocsr() - - self._preds = self._train.dot(self._similarity_matrix) - - # recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - # result_dict = self.evaluator.eval(recs) - # self._results.append(result_dict) - # print(f'Finished') - # - # if self._results[-1][self._validation_k]["val_results"][self._validation_metric] > best_metric_value: - # print("******************************************") - # if self._save_weights: - # with open(self._saving_filepath, "wb") as f: - # pickle.dump(self._model.get_model_state(), f) - # if self._save_recs: - # store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}.tsv") - - self.evaluate() diff --git a/external/models/NeuMF/__init__.py b/external/models/NeuMF/__init__.py deleted file mode 100644 index cebb299f..00000000 --- a/external/models/NeuMF/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .neural_matrix_factorization import NeuMF diff --git a/external/models/NeuMF/custom_sampler.py b/external/models/NeuMF/custom_sampler.py deleted file mode 100644 index 8f75dd37..00000000 --- a/external/models/NeuMF/custom_sampler.py +++ /dev/null @@ -1,57 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import numpy as np -np.random.seed(42) -import random -random.seed(42) - - -class Sampler: - def __init__(self, indexed_ratings, m): - self._indexed_ratings = indexed_ratings - self._users = list(self._indexed_ratings.keys()) - self._nusers = len(self._users) - self._items = list({k for a in self._indexed_ratings.values() for k in a.keys()}) - self._nitems = len(self._items) - self._ui_dict = {u: list(set(indexed_ratings[u])) for u in indexed_ratings} - self._lui_dict = {u: len(v) for u, v in self._ui_dict.items()} - self._m = m - - def step(self, batch_size: int): - r_int = np.random.randint - n_items = self._nitems - ui_dict = self._ui_dict - lui_dict = self._lui_dict - - # def sample_pos(u): - # ui = ui_dict[u] - # lui = lui_dict[u] - # if lui == n_items: - # 
return None - # return ui[r_int(lui)] - pos = {(u, i, 1) for u, items in ui_dict.items() for i in items} - - neg = set() - for u, i, _ in pos: - ui = ui_dict[u] - for _ in range(self._m): - j = r_int(n_items) - while j in ui: - j = r_int(n_items) - neg.add((u, j, 0)) - - samples = list(pos) - samples.extend(list(neg)) - samples = random.sample(samples, len(samples)) - # samples_zip = list(zip(samples)) - - for start in range(0, len(samples), batch_size): - u, i, b = map(np.array, zip(*samples[start:min(start + batch_size, len(samples))])) - yield u, i, b diff --git a/external/models/NeuMF/neural_matrix_factorization.py b/external/models/NeuMF/neural_matrix_factorization.py deleted file mode 100644 index c4dc9334..00000000 --- a/external/models/NeuMF/neural_matrix_factorization.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -from ast import literal_eval as make_tuple - -import numpy as np -from tqdm import tqdm - -from . import custom_sampler as ps -from elliot.recommender.base_recommender_model import BaseRecommenderModel -from elliot.recommender.base_recommender_model import init_charger -from .neural_matrix_factorization_model import NeuralMatrixFactorizationModel -from elliot.recommender.recommender_utils_mixin import RecMixin - - -class NeuMF(RecMixin, BaseRecommenderModel): - r""" - Neural Collaborative Filtering - - For further details, please refer to the `paper `_ - - Args: - mf_factors: Number of MF latent factors - mlp_factors: Number of MLP latent factors - mlp_hidden_size: List of units for each layer - lr: Learning rate - dropout: Dropout rate - is_mf_train: Whether to train the MF embeddings - is_mlp_train: Whether to train the MLP layers - - To include the recommendation model, add it to the config file adopting the following pattern: - - .. 
code:: yaml - - models: - NeuMF: - meta: - save_recs: True - epochs: 10 - batch_size: 512 - mf_factors: 10 - mlp_factors: 10 - mlp_hidden_size: (64,32) - lr: 0.001 - dropout: 0.0 - is_mf_train: True - is_mlp_train: True - """ - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - - - self._params_list = [ - ("_learning_rate", "lr", "lr", 0.001, None, None), - ("_mf_factors", "mf_factors", "mffactors", 10, int, None), - #("_mlp_factors", "mlp_factors", "mlpfactors", 10, int, None), - #("_mlp_hidden_size", "mlp_hidden_size", "mlpunits", "(64,32)", lambda x: list(make_tuple(str(x))), lambda x: self._batch_remove(str(x), " []").replace(",", "-")), - ("_dropout", "dropout", "drop", 0, None, None), - ("_is_mf_train", "is_mf_train", "mftrain", True, None, None), - ("_is_mlp_train", "is_mlp_train", "mlptrain", True, None, None), - ("_m", "m", "m", 0, int, None) - ] - self.autoset_params() - - self._sampler = ps.Sampler(self._data.i_train_dict, self._m) - self._mlp_hidden_size = (self._mf_factors*4, self._mf_factors*2, self._mf_factors) - self._mlp_factors = self._mf_factors - - if self._batch_size < 1: - self._batch_size = self._data.transactions - - self._ratings = self._data.train_dict - self._sp_i_train = self._data.sp_i_train - self._i_items_set = list(range(self._num_items)) - - self._model = NeuralMatrixFactorizationModel(self._num_users, self._num_items, self._mf_factors, - self._mlp_factors, self._mlp_hidden_size, - self._dropout, self._is_mf_train, self._is_mlp_train, - self._learning_rate) - - @property - def name(self): - return "NeuMF"\ - + "_e:" + str(self._epochs) \ - + "_bs:" + str(self._batch_size) \ - + f"_{self.get_params_shortcut()}" - - def train(self): - if self._restore: - return self.restore_weights() - - best_metric_value = 0 - - for it in range(self._epochs): - loss = 0 - steps = 0 - with tqdm(total=int(self._data.transactions * (self._m + 1) // self._batch_size), disable=not self._verbose) as t: - for batch in self._sampler.step(self._batch_size): - steps += 1 - loss += self._model.train_step(batch).numpy() - t.set_postfix({'loss': f'{loss / steps:.5f}'}) - t.update() - - self.evaluate(it, loss) - - # if not (it + 1) % self._validation_rate: - # recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - # result_dict = self.evaluator.eval(recs) - # self._results.append(result_dict) - # - # print(f'Epoch {(it + 1)}/{self._epochs} loss {loss/steps:.5f}') - # - # if self._results[-1][self._validation_k]["val_results"][self._validation_metric] > best_metric_value: - # print("******************************************") - # best_metric_value = self._results[-1][self._validation_k]["val_results"][self._validation_metric] - # if self._save_weights: - # self._model.save_weights(self._saving_filepath) - # if self._save_recs: - # store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}-it:{it + 1}.tsv") - - def get_recommendations(self, k: int = 100): - predictions_top_k_test = {} - predictions_top_k_val = {} - for index, offset in enumerate(range(0, self._num_users, self._batch_size)): - offset_stop = min(offset + self._batch_size, self._num_users) - predictions = self._model.get_recs( - ( - np.repeat(np.array(list(range(offset, offset_stop)))[:, None], repeats=self._num_items, axis=1), - np.array([self._i_items_set for _ in range(offset, offset_stop)]) - ) - ) - # v, i = self._model.get_top_k(predictions, self.get_train_mask(offset, offset_stop), k=k) - - # items_ratings_pair = 
[list(zip(map(self._data.private_items.get, u_list[0]), u_list[1])) - # for u_list in list(zip(i.numpy(), v.numpy()))] - # predictions_top_k.update(dict(zip(map(self._data.private_users.get, - # range(offset, offset_stop)), items_ratings_pair))) - recs_val, recs_test = self.process_protocol(k, predictions, offset, offset_stop) - - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - return predictions_top_k_val, predictions_top_k_test - - def get_single_recommendation(self, mask, k, predictions, offset, offset_stop): - v, i = self._model.get_top_k(predictions, mask[offset: offset_stop], k=k) - items_ratings_pair = [list(zip(map(self._data.private_items.get, u_list[0]), u_list[1])) - for u_list in list(zip(i.numpy(), v.numpy()))] - return dict(zip(map(self._data.private_users.get, range(offset, offset_stop)), items_ratings_pair)) \ No newline at end of file diff --git a/external/models/NeuMF/neural_matrix_factorization_model.py b/external/models/NeuMF/neural_matrix_factorization_model.py deleted file mode 100644 index 45d50bbf..00000000 --- a/external/models/NeuMF/neural_matrix_factorization_model.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo, Daniele Malitesta' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import os -import numpy as np -import tensorflow as tf -from tensorflow import keras - -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' -tf.random.set_seed(0) - - -class NeuralMatrixFactorizationModel(keras.Model): - def __init__(self, - num_users, - num_items, - embed_mf_size, embed_mlp_size, mlp_hidden_size, dropout, is_mf_train, - is_mlp_train, learning_rate=0.01, - name="NeuralMatrixFactorizationModel", - **kwargs): - super().__init__(name=name, **kwargs) - tf.random.set_seed(42) - self.num_users = num_users - self.num_items = num_items - self.embed_mf_size = embed_mf_size - self.embed_mlp_size = embed_mlp_size - self.mlp_hidden_size = mlp_hidden_size - self.dropout = dropout - self.is_mf_train = is_mf_train - self.is_mlp_train = is_mlp_train - - self.initializer = tf.initializers.GlorotUniform() - - self.user_mf_embedding = keras.layers.Embedding(input_dim=self.num_users, output_dim=self.embed_mf_size, - embeddings_initializer=self.initializer, name='U_MF', - dtype=tf.float32) - self.item_mf_embedding = keras.layers.Embedding(input_dim=self.num_items, output_dim=self.embed_mf_size, - embeddings_initializer=self.initializer, name='I_MF', - dtype=tf.float32) - self.user_mlp_embedding = keras.layers.Embedding(input_dim=self.num_users, output_dim=self.embed_mlp_size, - embeddings_initializer=self.initializer, name='U_MLP', - dtype=tf.float32) - self.item_mlp_embedding = keras.layers.Embedding(input_dim=self.num_items, output_dim=self.embed_mlp_size, - embeddings_initializer=self.initializer, name='I_MLP', - dtype=tf.float32) - self.user_mf_embedding(0) - self.user_mlp_embedding(0) - self.item_mf_embedding(0) - self.item_mlp_embedding(0) - - self.mlp_layers = keras.Sequential() - - for units in mlp_hidden_size: - self.mlp_layers.add(keras.layers.Dropout(dropout)) - self.mlp_layers.add(keras.layers.Dense(units, activation='relu')) - - if self.is_mf_train and self.is_mlp_train: - self.predict_layer = keras.layers.Dense(1, input_dim=self.embed_mf_size + self.mlp_hidden_size[-1]) - elif self.is_mf_train: - self.predict_layer = keras.layers.Dense(1, input_dim=self.embed_mf_size) - elif self.is_mlp_train: - self.predict_layer = keras.layers.Dense(1, 
input_dim=self.mlp_hidden_size[-1]) - self.sigmoid = keras.activations.sigmoid - self.loss = keras.losses.BinaryCrossentropy() - - self.optimizer = tf.optimizers.Adam(learning_rate) - - @tf.function - def call(self, inputs, training=None, mask=None): - user, item = inputs - user_mf_e = self.user_mf_embedding(user) - item_mf_e = self.item_mf_embedding(item) - user_mlp_e = self.user_mlp_embedding(user) - item_mlp_e = self.item_mlp_embedding(item) - if self.is_mf_train: - mf_output = user_mf_e * item_mf_e # [batch_size, embedding_size] - if self.is_mlp_train: - mlp_output = self.mlp_layers(tf.concat([user_mlp_e, item_mlp_e], -1)) # [batch_size, layers[-1]] - if self.is_mf_train and self.is_mlp_train: - output = self.sigmoid(self.predict_layer(tf.concat([mf_output, mlp_output], -1))) - elif self.is_mf_train: - output = self.sigmoid(self.predict_layer(mf_output)) - elif self.is_mlp_train: - output = self.sigmoid(self.predict_layer(mlp_output)) - else: - raise RuntimeError('mf_train and mlp_train can not be False at the same time') - return output - - @tf.function - def train_step(self, batch): - user, pos, label = batch - with tf.GradientTape() as tape: - # Clean Inference - output = self(inputs=(user, pos), training=True) - loss = self.loss(label, output) - - grads = tape.gradient(loss, self.trainable_weights) - self.optimizer.apply_gradients(zip(grads, self.trainable_weights)) - - return loss - - @tf.function - def predict(self, inputs, training=False, **kwargs): - """ - Get full predictions on the whole users/items matrix. - - Returns: - The matrix of predicted values. - """ - output = self.call(inputs=inputs, training=training) - return output - - @tf.function - def get_recs(self, inputs, training=False, **kwargs): - """ - Get full predictions on the whole users/items matrix. - - Returns: - The matrix of predicted values. - """ - user, item = inputs - user_mf_e = self.user_mf_embedding(user) - item_mf_e = self.item_mf_embedding(item) - user_mlp_e = self.user_mlp_embedding(user) - item_mlp_e = self.item_mlp_embedding(item) - if self.is_mf_train: - mf_output = user_mf_e * item_mf_e # [batch_size, embedding_size] - if self.is_mlp_train: - mlp_output = self.mlp_layers(tf.concat([user_mlp_e, item_mlp_e], -1)) # [batch_size, layers[-1]] - if self.is_mf_train and self.is_mlp_train: - output = self.sigmoid(self.predict_layer(tf.concat([mf_output, mlp_output], -1))) - elif self.is_mf_train: - output = self.sigmoid(self.predict_layer(mf_output)) - elif self.is_mlp_train: - output = self.sigmoid(self.predict_layer(mlp_output)) - else: - raise RuntimeError('mf_train and mlp_train can not be False at the same time') - return tf.squeeze(output) - - @tf.function - def get_top_k(self, preds, train_mask, k=100): - return tf.nn.top_k(tf.where(train_mask, preds, -np.inf), k=k, sorted=True) diff --git a/external/models/Proxy/Proxy.py b/external/models/Proxy/Proxy.py deleted file mode 100644 index f8f16fb1..00000000 --- a/external/models/Proxy/Proxy.py +++ /dev/null @@ -1,78 +0,0 @@ -import ntpath -import numpy as np -import pandas as pd - -from elliot.recommender.base_recommender_model import BaseRecommenderModel -from elliot.recommender.recommender_utils_mixin import RecMixin -from elliot.recommender.base_recommender_model import init_charger - - -class ProxyRecommender(RecMixin, BaseRecommenderModel): - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - """ - Create a Proxy recommender to evaluate already generated recommendations. 
- :param name: data loader object - :param path: path to the directory rec. results - :param args: parameters - """ - self._random = np.random - - self._params_list = [ - ("_name", "name", "name", "", None, None), - ("_path", "path", "path", "", None, None) - ] - self.autoset_params() - if not self._name: - self._name = ntpath.basename(self._path).split(".", 1)[0].split("_", 1)[0] - - @property - def name(self): - return self._name - - def train(self): - print("Reading recommendations") - self._recommendations = self.read_recommendations(self._path) - - print("Evaluating recommendations") - self.evaluate() - - def get_recommendations(self, top_k): - predictions_top_k_val = {} - predictions_top_k_test = {} - - recs_val, recs_test = self.process_protocol(top_k) - - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - - return predictions_top_k_val, predictions_top_k_test - - def get_single_recommendation(self, mask, k): - - nonzero = mask.nonzero() - candidate_items = {} - [candidate_items.setdefault(self._data.private_users[user], set()).add(self._data.private_items[item]) for user, item in zip(*nonzero)] - recs = {} - for u, user_recs in self._recommendations.items(): - user_cleaned_recs = [] - user_candidate_items = candidate_items[u] - for p, (item, prediction) in enumerate(user_recs): - if p >= k: - break - if item in user_candidate_items: - user_cleaned_recs.append((item, prediction)) - recs[u] = user_cleaned_recs - return recs - - def read_recommendations(self, path): - recs = {} - column_names = ["userId", "itemId", "prediction", "timestamp"] - data = pd.read_csv(path, sep="\t", header=None, names=column_names) - user_groups = data.groupby(['userId']) - for name, group in user_groups: - recs[name] = sorted(data.loc[group.index][['itemId', 'prediction']].apply(tuple, axis=1).to_list(), key=lambda x: x[1], reverse=True) - return recs - - - diff --git a/external/models/Proxy/__init__.py b/external/models/Proxy/__init__.py deleted file mode 100644 index f32529d5..00000000 --- a/external/models/Proxy/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .Proxy import ProxyRecommender \ No newline at end of file diff --git a/external/models/RP3beta/__init__.py b/external/models/RP3beta/__init__.py deleted file mode 100644 index 377cdeef..00000000 --- a/external/models/RP3beta/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .rp3beta import RP3beta \ No newline at end of file diff --git a/external/models/RP3beta/rp3beta.py b/external/models/RP3beta/rp3beta.py deleted file mode 100644 index 9f82373a..00000000 --- a/external/models/RP3beta/rp3beta.py +++ /dev/null @@ -1,186 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import pickle -import time - -import numpy as np -import scipy.sparse as sparse -from sklearn.preprocessing import normalize - -from elliot.recommender.base_recommender_model import BaseRecommenderModel -from elliot.recommender.base_recommender_model import init_charger -from elliot.recommender.recommender_utils_mixin import RecMixin -from elliot.utils.write import store_recommendation - -np.random.seed(42) - - -class RP3beta(RecMixin, BaseRecommenderModel): - - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - self._random = np.random - - self._params_list = [ - ("_neighborhood", "neighborhood", "neighborhood", 10, int, None), - ("_alpha", "alpha", "alpha", 1., float, None), - ("_beta", "beta", 
"beta", 0.6, float, None), - ("_normalize_similarity", "normalize_similarity", "normalize_similarity", False, bool, None) - ] - - self.autoset_params() - if self._neighborhood == -1: - self._neighborhood = self._data.num_items - - - @property - def name(self): - return f"RP3beta_{self.get_params_shortcut()}" - - def get_recommendations(self, k: int = 10): - predictions_top_k_val = {} - predictions_top_k_test = {} - - recs_val, recs_test = self.process_protocol(k) - - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - - return predictions_top_k_val, predictions_top_k_test - - def get_single_recommendation(self, mask, k, *args): - return {u: self.get_user_predictions(u, mask, k) for u in self._data.train_dict.keys()} - - def get_user_predictions(self, user_id, mask, top_k=10): - user_id = self._data.public_users.get(user_id) - user_recs = self._preds[user_id].toarray()[0] - user_recs_mask = mask[user_id] - user_recs[~user_recs_mask] = -np.inf - indices, values = zip(*[(self._data.private_items.get(u_list[0]), u_list[1]) - for u_list in enumerate(user_recs)]) - - indices = np.array(indices) - values = np.array(values) - local_k = min(top_k, len(values)) - partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - real_values = values[partially_ordered_preds_indices] - real_indices = indices[partially_ordered_preds_indices] - local_top_k = real_values.argsort()[::-1] - return [(real_indices[item], real_values[item]) for item in local_top_k] - - def train(self): - if self._restore: - return self.restore_weights() - - self._train = self._data.sp_i_train_ratings.copy() - self.Pui = normalize(self._train, norm='l1', axis=1) - - X_bool = self._train.transpose(copy=True) - X_bool.data = np.ones(X_bool.data.size, np.float32) - - X_bool_sum = np.array(X_bool.sum(axis=1)).ravel() - - self.degree = np.zeros(self._train.shape[1]) - - nonZeroMask = X_bool_sum != 0.0 - - self.degree[nonZeroMask] = np.power(X_bool_sum[nonZeroMask], -self._beta) - - self.Piu = normalize(X_bool, norm='l1', axis=1) - del (X_bool) - - if self._alpha != 1.: - self.Pui = self.Pui.power(self._alpha) - self.Piu = self.Piu.power(self._alpha) - - block_dim = 200 - d_t = self.Piu - - dataBlock = 10000000 - - rows = np.zeros(dataBlock, dtype=np.int32) - cols = np.zeros(dataBlock, dtype=np.int32) - values = np.zeros(dataBlock, dtype=np.float32) - - numCells = 0 - - start = time.time() - - for current_block_start_row in range(0, self.Pui.shape[1], block_dim): - - if current_block_start_row + block_dim > self.Pui.shape[1]: - block_dim = self.Pui.shape[1] - current_block_start_row - - similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * self.Pui - similarity_block = similarity_block.toarray() - - for row_in_block in range(block_dim): - row_data = np.multiply(similarity_block[row_in_block, :], self.degree) - row_data[current_block_start_row + row_in_block] = 0 - - best = row_data.argsort()[::-1][:self._neighborhood] - - notZerosMask = row_data[best] != 0.0 - - values_to_add = row_data[best][notZerosMask] - cols_to_add = best[notZerosMask] - - for index in range(len(values_to_add)): - - if numCells == len(rows): - rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32))) - cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32))) - values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32))) - - rows[numCells] = current_block_start_row + row_in_block - cols[numCells] = cols_to_add[index] - values[numCells] = 
values_to_add[index] - - numCells += 1 - - self._similarity_matrix = sparse.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])), - shape=(self.Pui.shape[1], self.Pui.shape[1])) - - if self._normalize_similarity: - self._similarity_matrix = normalize(self._similarity_matrix, norm='l1', axis=1) - - self._similarity_matrix = self._similarity_matrix.tocsc() - - data, rows_indices, cols_indptr = [], [], [] - - for item_idx in range(len(self._data.items)): - cols_indptr.append(len(data)) - - start_position = self._similarity_matrix.indptr[item_idx] - end_position = self._similarity_matrix.indptr[item_idx + 1] - - column_data = self._similarity_matrix.data[start_position:end_position] - column_row_index = self._similarity_matrix.indices[start_position:end_position] - - - non_zero_data = column_data != 0 - - idx_sorted = np.argsort(column_data[non_zero_data]) # sort by column - top_k_idx = idx_sorted[-self._neighborhood:] - - data.extend(column_data[non_zero_data][top_k_idx]) - rows_indices.extend(column_row_index[non_zero_data][top_k_idx]) - - cols_indptr.append(len(data)) - - W_sparse = sparse.csc_matrix((data, rows_indices, cols_indptr), - shape=(len(self._data.items), len(self._data.items)), dtype=np.float32).tocsr() - - self._preds = self._train.dot(W_sparse) - - end = time.time() - print(f"The similarity computation has taken: {end - start}") - - self.evaluate() \ No newline at end of file diff --git a/external/models/RendleMF_NeuMFvsMF/MF.py b/external/models/RendleMF_NeuMFvsMF/MF.py deleted file mode 100644 index 82efdea9..00000000 --- a/external/models/RendleMF_NeuMFvsMF/MF.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import pickle -import time -from tqdm import tqdm - -import numpy as np - -from . import custom_sampler_rendle as ps -from elliot.recommender.base_recommender_model import BaseRecommenderModel -from elliot.recommender.base_recommender_model import init_charger -from elliot.recommender.recommender_utils_mixin import RecMixin -from elliot.utils.write import store_recommendation - -from .MF_model import MFModel - - - -class MF(RecMixin, BaseRecommenderModel): - r""" - Matrix Factorization (implementation from "Neural Collaborative Filtering vs. Matrix Factorization Revisited") - - For further details, please refer to the `paper `_ - - Args: - factors: Number of latent factors - lr: Learning rate - bias_regularization: Regularization coefficient for the bias - user_regularization: Regularization coefficient for user latent factors - positive_item_regularization: Regularization coefficient for positive item latent factors - negative_item_regularization: Regularization coefficient for negative item latent factors - update_negative_item_factors: - update_users: - update_items: - update_bias: - - To include the recommendation model, add it to the config file adopting the following pattern: - - .. 
code:: yaml - - models: - MF: - meta: - save_recs: True - epochs: 10 - factors: 10 - lr: 0.001 - reg: 0.0025 - """ - - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - self._random = np.random - - self._params_list = [ - ("_factors", "factors", "f", 10, int, None), - ("_learning_rate", "lr", "lr", 0.05, None, None), - ("_regularization", "reg", "reg", 0, None, None), - ("_m", "m", "m", 0, int, None), - ("_seed", "random_seed", "seed", 42, None, None) - ] - self.autoset_params() - - np.random.seed(self._seed) - - self._ratings = self._data.train_dict - self._sampler = ps.Sampler(self._data.i_train_dict, self._m, self._data.sp_i_train, self._seed) - - # self._batch_size = self._data.transactions * (self._m + 1) - self._batch_size = 100000 - - self._model = MFModel(self._factors, - self._data, - self._learning_rate, - self._regularization, - self._seed) - - def get_recommendations(self, k: int = 10): - self._model.prepare_predictions() - - predictions_top_k_val = {} - predictions_top_k_test = {} - - recs_val, recs_test = self.process_protocol(k) - - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - - return predictions_top_k_val, predictions_top_k_test - - def get_single_recommendation(self, mask, k, *args): - return {u: self._model.get_user_predictions(u, mask, k) for u in self._data.train_dict.keys()} - - def predict(self, u: int, i: int): - """ - Get prediction on the user item pair. - - Returns: - A single float vaue. - """ - return self._model.predict(u, i) - - @property - def name(self): - return "MF" \ - + "_e:" + str(self._epochs) \ - + f"_{self.get_params_shortcut()}" - - def train(self): - if self._restore: - return self.restore_weights() - - print(f"Transactions: {self._data.transactions}") - - for it in range(self._epochs): - print(f"\n********** Iteration: {it + 1}") - loss = 0 - steps = 0 - - with tqdm(total=int(self._data.transactions * (self._m + 1) // self._batch_size), disable=not self._verbose) as t: - for batch in self._sampler.step(self._batch_size): - steps += 1 - loss += self._model.train_step(batch)/len(batch) - t.set_postfix({'loss': f'{loss/steps:.5f}'}) - t.update() - - self.evaluate(it, loss) - - def restore_weights(self): - try: - with open(self._saving_filepath, "rb") as f: - self._model.set_model_state(pickle.load(f)) - print(f"Model correctly Restored") - - recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - result_dict = self.evaluator.eval(recs) - self._results.append(result_dict) - - print("******************************************") - if self._save_recs: - store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}.tsv") - return True - - except Exception as ex: - print(f"Error in model restoring operation! 
{ex}") - - return False diff --git a/external/models/RendleMF_NeuMFvsMF/MF_model.py b/external/models/RendleMF_NeuMFvsMF/MF_model.py deleted file mode 100644 index c0b45dc6..00000000 --- a/external/models/RendleMF_NeuMFvsMF/MF_model.py +++ /dev/null @@ -1,166 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import numpy as np - -np.random.seed(42) - - -class MFModel(object): - def __init__(self, F, - data, - lr, - reg, - random, - *args): - np.random.seed(random) - self._factors = F - self._users = data.users - self._items = data.items - self._private_users = data.private_users - self._public_users = data.public_users - self._private_items = data.private_items - self._public_items = data.public_items - self._lr = lr - self._reg = reg - self.initialize(*args) - - def initialize(self, loc: float = 0, scale: float = 0.1): - """ - This function initialize the data model - :param loc: - :param scale: - :return: - """ - - self._global_bias = 0 - - "same parameters as np.randn" - self._user_bias = np.zeros(len(self._users)) - self._item_bias = np.zeros(len(self._items)) - self._user_factors = \ - np.random.normal(loc=loc, scale=scale, size=(len(self._users), self._factors)) - self._item_factors = \ - np.random.normal(loc=loc, scale=scale, size=(len(self._items), self._factors)) - - @property - def name(self): - return "MF" - - def indexed_predict(self, user, item): - return self._global_bias + self._user_bias[user] + self._item_bias[item] \ - + self._user_factors[user] @ self._item_factors[item] - - def get_user_predictions(self, user_id, mask, top_k=10): - user_id = self._public_users.get(user_id) - # b = self._train[user_id].dot(W_sparse) - # b = self._global_bias + self._user_bias[user_id] + self._item_bias \ - # + self._user_factors[user_id] @ self._item_factors.T - b = self._preds[user_id] - a = mask[user_id] - b[~a] = -np.inf - indices, values = zip(*[(self._private_items.get(u_list[0]), u_list[1]) - for u_list in enumerate(b.data)]) - - indices = np.array(indices) - values = np.array(values) - local_k = min(top_k, len(values)) - partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - real_values = values[partially_ordered_preds_indices] - real_indices = indices[partially_ordered_preds_indices] - local_top_k = real_values.argsort()[::-1] - return [(real_indices[item], real_values[item]) for item in local_top_k] - - def train_step(self, batch, **kwargs): - sum_of_loss = 0 - lr = self._lr - reg = self._reg - for user, item, rating in batch: - gb_ = self._global_bias - uf_ = self._user_factors[user] - if_ = self._item_factors[item] - ub_ = self._user_bias[user] - ib_ = self._item_bias[item] - - prediction = gb_ + ub_ + ib_ + np.dot(uf_, if_) - # prediction = gb_ + ub_ + ib_ + uf_ @ if_ - - if prediction > 0: - one_plus_exp_minus_pred = 1.0 + np.exp(-prediction) - sigmoid = 1.0 / one_plus_exp_minus_pred - this_loss = (np.log(one_plus_exp_minus_pred) + - (1.0 - rating) * prediction) - else: - exp_pred = np.exp(prediction) - sigmoid = exp_pred / (1.0 + exp_pred) - this_loss = -rating * prediction + np.log(1.0 + exp_pred) - - grad = rating - sigmoid - - self._user_factors[user] += lr * (grad * if_ - reg * uf_) - self._item_factors[item] += lr * (grad * uf_ - reg * if_) - self._user_bias[user] += lr * (grad - reg * ub_) - self._item_bias[item] += lr * (grad - reg * ib_) - self._global_bias += lr * (grad - reg * gb_) - sum_of_loss += this_loss 
- - return sum_of_loss - - def prepare_predictions(self): - self._preds = np.expand_dims(self._user_bias, axis=1) + (self._global_bias + self._item_bias + self._user_factors @ self._item_factors.T) - - def update_factors(self, user: int, item: int, rating: float): - uf_ = self._user_factors[user] - if_ = self._item_factors[item] - ub_ = self._user_bias[user] - ib_ = self._item_bias[item] - gb_ = self._global_bias - lr = self._lr - reg = self._reg - - - prediction = gb_ + ub_ + ib_ + np.dot(uf_,if_) - # prediction = gb_ + ub_ + ib_ + uf_ @ if_ - - if prediction > 0: - one_plus_exp_minus_pred = 1.0 + np.exp(-prediction) - sigmoid = 1.0 / one_plus_exp_minus_pred - this_loss = (np.log(one_plus_exp_minus_pred) + - (1.0 - rating) * prediction) - else: - exp_pred = np.exp(prediction) - sigmoid = exp_pred / (1.0 + exp_pred) - this_loss = -rating * prediction + np.log(1.0 + exp_pred) - - grad = rating - sigmoid - - self._user_factors[user] += lr * (grad * if_ - reg * uf_) - self._item_factors[item] += lr * (grad * uf_ - reg * if_) - self._user_bias[user] += lr * (grad - reg * ub_) - self._item_bias[item] += lr * (grad - reg * ib_) - self._global_bias += lr * (grad - reg * gb_) - - return this_loss - - def get_model_state(self): - saving_dict = {} - saving_dict['_global_bias'] = self._global_bias - saving_dict['_user_bias'] = self._user_bias - saving_dict['_item_bias'] = self._item_bias - saving_dict['_user_factors'] = self._user_factors - saving_dict['_item_factors'] = self._item_factors - return saving_dict - - def set_model_state(self, saving_dict): - self._global_bias = saving_dict['_global_bias'] - self._user_bias = saving_dict['_user_bias'] - self._item_bias = saving_dict['_item_bias'] - self._user_factors = saving_dict['_user_factors'] - self._item_factors = saving_dict['_item_factors'] - diff --git a/external/models/RendleMF_NeuMFvsMF/__init__.py b/external/models/RendleMF_NeuMFvsMF/__init__.py deleted file mode 100644 index db4e17b4..00000000 --- a/external/models/RendleMF_NeuMFvsMF/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .MF import MF as RendleMF \ No newline at end of file diff --git a/external/models/RendleMF_NeuMFvsMF/custom_sampler.py b/external/models/RendleMF_NeuMFvsMF/custom_sampler.py deleted file mode 100644 index a18e4a54..00000000 --- a/external/models/RendleMF_NeuMFvsMF/custom_sampler.py +++ /dev/null @@ -1,75 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import numpy as np -import random - - -class Sampler: - def __init__(self, indexed_ratings, m, sparse_matrix, seed): - np.random.seed(seed) - random.seed(seed) - self._sparse = sparse_matrix - self._indexed_ratings = indexed_ratings - self._users = list(self._indexed_ratings.keys()) - self._nusers = len(self._users) - self._items = list({k for a in self._indexed_ratings.values() for k in a.keys()}) - self._nitems = len(self._items) - self._ui_dict = {u: list(set(indexed_ratings[u])) for u in indexed_ratings} - self._lui_dict = {u: len(v) for u, v in self._ui_dict.items()} - self._m = m - - def step(self, batch_size): - r_int = np.random.randint - n_users = self._nusers - n_items = self._nitems - ui_dict = self._ui_dict - lui_dict = self._lui_dict - - # def sample_pos(u): - # ui = ui_dict[u] - # lui = lui_dict[u] - # if lui == n_items: - # return None - # return ui[r_int(lui)] - # pos = {(u, i, 1) for u, items in ui_dict.items() for i in items} - nonzero = self._sparse.nonzero() - pos = 
list(zip(*nonzero,np.ones(len(nonzero[0]), dtype=np.int32))) - - neg = list() - for u, i, _ in pos: - neg_samples = random.sample(range(n_items), self._m) - neg += list(zip(np.ones(len(neg_samples), dtype=np.int32) * u, neg_samples, np.zeros(len(neg_samples), dtype=np.int32))) - pass - # for _ in range(self._m): - # neg.add((u, r_int(n_items), 0)) - - # samples = list(pos) - samples = pos + neg - samples = random.sample(samples, len(samples)) - - # def sample(): - # u = r_int(n_users) - # ui = ui_dict[u] - # lui = lui_dict[u] - # if lui == n_items: - # sample() - # i = ui[r_int(lui)] - # - # j = r_int(n_items) - # while j in ui: - # j = r_int(n_items) - # return u, i, j - - # for sample in zip(samples): - # yield - - for start in range(0, len(samples), batch_size): - # u, i, b = samples[start:min(start + batch_size, len(samples))] - yield samples[start:min(start + batch_size, len(samples))] diff --git a/external/models/RendleMF_NeuMFvsMF/custom_sampler_linear.py b/external/models/RendleMF_NeuMFvsMF/custom_sampler_linear.py deleted file mode 100644 index 5d4e6b18..00000000 --- a/external/models/RendleMF_NeuMFvsMF/custom_sampler_linear.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import numpy as np -np.random.seed(42) -import random -random.seed(42) -from time import time - -class Sampler: - def __init__(self, indexed_ratings, m, sparse_matrix, seed): - self._sparse = sparse_matrix - self._indexed_ratings = indexed_ratings - self._users = list(self._indexed_ratings.keys()) - self._nusers = len(self._users) - self._items = list({k for a in self._indexed_ratings.values() for k in a.keys()}) - self._nitems = len(self._items) - self._ui_dict = {u: list(set(indexed_ratings[u])) for u in indexed_ratings} - self._lui_dict = {u: len(v) for u, v in self._ui_dict.items()} - self._m = m - - def step(self, batch_size): - # t1 = time() - r_int = np.random.randint - n_users = self._nusers - n_items = self._nitems - ui_dict = self._ui_dict - lui_dict = self._lui_dict - - # pos = {(u, i, 1) for u, items in ui_dict.items() for i in items} - - nonzero = self._sparse.nonzero() - pos = list(zip(*nonzero, np.ones(len(nonzero[0]), dtype=np.int32))) - - neg = set() - for u, i, _ in pos: - for _ in range(self._m): - neg.add((u, r_int(n_items), 0)) - - # samples = list(pos) - samples = pos + list(neg) - samples = random.sample(samples, len(samples)) - # t2 = time() - # print('Epoch sampling [%.1f s]', t2 - t1) - - for start in range(0, len(samples), batch_size): - yield samples[start:min(start + batch_size, len(samples))] diff --git a/external/models/RendleMF_NeuMFvsMF/custom_sampler_rendle.py b/external/models/RendleMF_NeuMFvsMF/custom_sampler_rendle.py deleted file mode 100644 index 69a8fc85..00000000 --- a/external/models/RendleMF_NeuMFvsMF/custom_sampler_rendle.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import numpy as np -import random -import time - - -class Sampler: - def __init__(self, indexed_ratings, m, sparse_matrix, seed): - np.random.seed(seed) - random.seed(seed) - self._sparse = sparse_matrix - self._indexed_ratings = indexed_ratings - self._users = list(self._indexed_ratings.keys()) - self._nusers = len(self._users) - self._items = list({k for a in 
self._indexed_ratings.values() for k in a.keys()}) - self._nitems = len(self._items) - self._ui_dict = {u: list(set(indexed_ratings[u])) for u in indexed_ratings} - self._lui_dict = {u: len(v) for u, v in self._ui_dict.items()} - self._m = m - self._nonzero = self._sparse.nonzero() - self._num_pos_examples = len(self._nonzero[0]) - self._positive_pairs = list(zip(*self._nonzero, np.ones(len(self._nonzero[0]), dtype=np.int32))) - - def step(self, batch_size): - """Converts a list of positive pairs into a two class dataset. - Args: - positive_pairs: an array of shape [n, 2], each row representing a positive - user-item pair. - num_negatives: the number of negative items to sample for each positive. - Returns: - An array of shape [n*(1 + num_negatives), 3], where each row is a tuple - (user, item, label). The examples are obtained as follows: - To each (user, item) pair in positive_pairs correspond: - * one positive example (user, item, 1) - * num_negatives negative examples (user, item', 0) where item' is sampled - uniformly at random. - """ - time_start = time.time() - r_int = np.random.randint - num_items = self._nitems - num_negatives = self._m - num_pos_examples = self._num_pos_examples - positive_pairs = self._positive_pairs - - training_matrix = np.empty([num_pos_examples * (1 + num_negatives), 3], - dtype=np.int32) - index = 0 - - for pos_index in range(num_pos_examples): - u = positive_pairs[pos_index][0] - i = positive_pairs[pos_index][1] - - # Treat the rating as a positive training instance - training_matrix[index] = [u, i, 1] - index += 1 - - # Add N negatives by sampling random items. - # This code does not enforce that the sampled negatives are not present in - # the training data. It is possible that the sampling procedure adds a - # negative that is already in the set of positives. It is also possible - # that an item is sampled twice. Both cases should be fine. 
- for _ in range(num_negatives): - j = r_int(num_items) - training_matrix[index] = [u, j, 0] - index += 1 - # neg = set() - # for u, i, _ in pos: - # for _ in range(self._m): - # neg.add((u, r_int(n_items), 0)) - # - # samples = list(pos) - # samples += list(neg) - samples_indices = random.sample(range(training_matrix.shape[0]), training_matrix.shape[0]) - training_matrix = training_matrix[samples_indices] - print(f"Sampling has taken {round(time.time()-time_start, 2)} seconds") - for start in range(0, training_matrix.shape[0], batch_size): - yield training_matrix[start:min(start + batch_size, training_matrix.shape[0])] diff --git a/external/models/__init__.py b/external/models/__init__.py index 5a669279..7215b8c7 100644 --- a/external/models/__init__.py +++ b/external/models/__init__.py @@ -1,13 +1 @@ from .most_popular import MostPop -from .EASE_R import EASER -from .RP3beta import RP3beta -# from .KaVAE import KaVAE -from .RendleMF_NeuMFvsMF import RendleMF -from .iALS import iALS -from .NeuMF import NeuMF -from .Proxy import ProxyRecommender -from .item_knn import ItemKNN -from .attribute_item_knn import AttributeItemKNN -from .user_knn import UserKNN -from .attribute_user_knn import AttributeUserKNN -from .DistMult import DistMult \ No newline at end of file diff --git a/external/models/attribute_item_knn/__init__.py b/external/models/attribute_item_knn/__init__.py deleted file mode 100644 index 6f07ac97..00000000 --- a/external/models/attribute_item_knn/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ - -from .attribute_item_knn import AttributeItemKNN \ No newline at end of file diff --git a/external/models/attribute_item_knn/attribute_item_knn.py b/external/models/attribute_item_knn/attribute_item_knn.py deleted file mode 100644 index ce538b91..00000000 --- a/external/models/attribute_item_knn/attribute_item_knn.py +++ /dev/null @@ -1,139 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import numpy as np -import pickle -import time - -from elliot.recommender.recommender_utils_mixin import RecMixin -from elliot.utils.write import store_recommendation -import scipy.sparse as sp - -from elliot.recommender.base_recommender_model import BaseRecommenderModel -from .attribute_item_knn_similarity import Similarity -from elliot.recommender.base_recommender_model import init_charger - - -class AttributeItemKNN(RecMixin, BaseRecommenderModel): - r""" - Attribute Item-kNN proposed in MyMediaLite Recommender System Library - - For further details, please refer to the `paper `_ - - Args: - neighbors: Number of item neighbors - similarity: Similarity function - - To include the recommendation model, add it to the config file adopting the following pattern: - - .. 
code:: yaml - - models: - AttributeItemKNN: - meta: - save_recs: True - neighbors: 40 - similarity: cosine - """ - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - - self._params_list = [ - ("_num_neighbors", "neighbors", "nn", 40, int, None), - ("_similarity", "similarity", "sim", "cosine", None, None), - ("_implicit", "implicit", "bin", False, None, None), - ("_loader", "loader", "load", "ItemAttributes", None, None), - ] - self.autoset_params() - - self._ratings = self._data.train_dict - - self._side = getattr(self._data.side_information, self._loader, None) - - self._i_feature_dict = {i_item: [self._side.public_features[feature] for feature - in self._side.feature_map[item]] for item, i_item - in self._data.public_items.items()} - self._sp_i_features = self.build_feature_sparse() - - self._model = Similarity(data=self._data, attribute_matrix=self._sp_i_features, num_neighbors=self._num_neighbors, similarity=self._similarity, implicit=self._implicit) - - def get_single_recommendation(self, mask, k, *args): - return {u: self._model.get_user_recs(u, mask, k) for u in self._ratings.keys()} - - def get_recommendations(self, k: int = 10): - predictions_top_k_val = {} - predictions_top_k_test = {} - - recs_val, recs_test = self.process_protocol(k) - - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - - return predictions_top_k_val, predictions_top_k_test - - def build_feature_sparse(self): - - rows_cols = [(i, f) for i, features in self._i_feature_dict.items() for f in features] - rows = [u for u, _ in rows_cols] - cols = [i for _, i in rows_cols] - data = sp.csr_matrix((np.ones_like(rows), (rows, cols)), dtype='float32', - shape=(self._num_items, len(self._side.public_features))) - return data - - @property - def name(self): - return f"AttributeItemKNN_{self.get_params_shortcut()}" - - def train(self): - if self._restore: - return self.restore_weights() - - start = time.time() - self._model.initialize() - end = time.time() - print(f"The similarity computation has taken: {end - start}") - - print(f"Transactions: {self._data.transactions}") - - self.evaluate() - - # best_metric_value = 0 - # - # recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - # result_dict = self.evaluator.eval(recs) - # self._results.append(result_dict) - # print(f'Finished') - # - # if self._results[-1][self._validation_k]["val_results"][self._validation_metric] > best_metric_value: - # print("******************************************") - # if self._save_weights: - # with open(self._saving_filepath, "wb") as f: - # pickle.dump(self._model.get_model_state(), f) - # if self._save_recs: - # store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}.tsv") - - def restore_weights(self): - try: - with open(self._saving_filepath, "rb") as f: - self._model.set_model_state(pickle.load(f)) - print(f"Model correctly Restored") - - recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - result_dict = self.evaluator.eval(recs) - self._results.append(result_dict) - - print("******************************************") - if self._save_recs: - store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}.tsv") - return True - - except Exception as ex: - print(f"Error in model restoring operation! 
{ex}") - - return False diff --git a/external/models/attribute_item_knn/attribute_item_knn_similarity.py b/external/models/attribute_item_knn/attribute_item_knn_similarity.py deleted file mode 100644 index 4290da3a..00000000 --- a/external/models/attribute_item_knn/attribute_item_knn_similarity.py +++ /dev/null @@ -1,184 +0,0 @@ - -import numpy as np -from scipy import sparse -from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, haversine_distances, chi2_kernel, manhattan_distances -from sklearn.metrics import pairwise_distances - - - - -class Similarity(object): - """ - Simple kNN class - """ - - def __init__(self, data, attribute_matrix, num_neighbors, similarity, implicit): - self._data = data - self._ratings = data.train_dict - self._attribute_matrix = attribute_matrix - self._num_neighbors = num_neighbors - self._similarity = similarity - self._implicit = implicit - - if self._implicit: - self._URM = self._data.sp_i_train - else: - self._URM = self._data.sp_i_train_ratings - - self._users = self._data.users - self._items = self._data.items - self._private_users = self._data.private_users - self._public_users = self._data.public_users - self._private_items = self._data.private_items - self._public_items = self._data.public_items - - def initialize(self): - """ - This function initialize the data model - """ - - supported_similarities = ["cosine", "dot", ] - supported_dissimilarities = ["euclidean", "manhattan", "haversine", "chi2", 'cityblock', 'l1', 'l2', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] - print(f"\nSupported Similarities: {supported_similarities}") - print(f"Supported Distances/Dissimilarities: {supported_dissimilarities}\n") - - # self._item_ratings = {} - # for u, user_items in self._ratings.items(): - # for i, v in user_items.items(): - # self._item_ratings.setdefault(i, {}).update({u: v}) - # - # self._transactions = self._data.transactions - - self._similarity_matrix = np.empty((len(self._items), len(self._items))) - - self.process_similarity(self._similarity) - ############## - data, rows_indices, cols_indptr = [], [], [] - - column_row_index = np.arange(len(self._data.items), dtype=np.int32) - - for item_idx in range(len(self._data.items)): - cols_indptr.append(len(data)) - column_data = self._similarity_matrix[:, item_idx] - - non_zero_data = column_data != 0 - - idx_sorted = np.argsort(column_data[non_zero_data]) # sort by column - top_k_idx = idx_sorted[-self._num_neighbors:] - - data.extend(column_data[non_zero_data][top_k_idx]) - rows_indices.extend(column_row_index[non_zero_data][top_k_idx]) - - cols_indptr.append(len(data)) - - W_sparse = sparse.csc_matrix((data, rows_indices, cols_indptr), - shape=(len(self._data.items), len(self._data.items)), dtype=np.float32).tocsr() - self._preds = self._URM.dot(W_sparse).toarray() - ############## - # self.compute_neighbors() - - del self._similarity_matrix - - # def compute_neighbors(self): - # self._neighbors = {} - # for x in range(self._similarity_matrix.shape[0]): - # arr = np.concatenate((self._similarity_matrix[0:x, x], [-np.inf], self._similarity_matrix[x, x+1:])) - # top_indices = np.argpartition(arr, -self._num_neighbors)[-self._num_neighbors:] - # arr = arr[top_indices] - # self._neighbors[self._private_items[x]] = {self._private_items[i]: arr[p] for p, i in enumerate(top_indices)} - # - # def 
get_item_neighbors(self, item): - # return self._neighbors.get(item, {}) - - def process_similarity(self, similarity): - if similarity == "cosine": - self._similarity_matrix = cosine_similarity(self._attribute_matrix) - elif similarity == "dot": - self._similarity_matrix = (self._attribute_matrix @ self._attribute_matrix.T).toarray() - elif similarity == "euclidean": - self._similarity_matrix = (1 / (1 + euclidean_distances(self._attribute_matrix))) - elif similarity == "manhattan": - self._similarity_matrix = (1 / (1 + manhattan_distances(self._attribute_matrix))) - elif similarity == "haversine": - self._similarity_matrix = (1 / (1 + haversine_distances(self._attribute_matrix))) - elif similarity == "chi2": - self._similarity_matrix = (1 / (1 + chi2_kernel(self._attribute_matrix))) - elif similarity in ['cityblock', 'l1', 'l2']: - self._similarity_matrix = (1 / (1 + pairwise_distances(self._attribute_matrix, metric=similarity))) - elif similarity in ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']: - self._similarity_matrix = (1 / (1 + pairwise_distances(self._attribute_matrix.toarray(), metric=similarity))) - else: - raise Exception("Not implemented similarity") - - # def process_cosine(self): - # x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) - # self._similarity_matrix = cosine_similarity(self._attribute_matrix) - # # g = np.vectorize(self.compute_cosine) - # # g(x,y) - # # for item_row in range(self._similarity_matrix.shape[0]): - # # for item_col in range(item_row + 1, self._similarity_matrix.shape[1]): - # # self._similarity_matrix[item_row, item_col] = self.compute_cosine( - # # self._item_ratings.get(self._private_items[item_row],{}), self._item_ratings.get(self._private_items[item_col], {})) - - # def compute_cosine(self, i_index, j_index): - # i_dict = self._item_ratings.get(self._private_items[i_index],{}) - # j_dict = self._item_ratings.get(self._private_items[j_index],{}) - # union_keyset = set().union(*[i_dict, j_dict]) - # i: np.ndarray = np.array([[i_dict.get(x, 0) for x in union_keyset]]) - # j: np.ndarray = np.array([[j_dict.get(x, 0) for x in union_keyset]]) - # self._similarity_matrix[i_index, j_index] = cosine_similarity(i, j)[0, 0] - # - # def get_transactions(self): - # return self._transactions - - # def get_user_recs(self, u, mask, k): - # user_items = self._ratings[u].keys() - # user_mask = mask[self._data.public_users[u]] - # predictions = {i: self.score_item(self.get_item_neighbors(i), user_items) for i in self._data.items if - # user_mask[self._data.public_items[i]]} - # indices, values = zip(*predictions.items()) - # indices = np.array(indices) - # values = np.array(values) - # local_k = min(k, len(values)) - # partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - # real_values = values[partially_ordered_preds_indices] - # real_indices = indices[partially_ordered_preds_indices] - # local_top_k = real_values.argsort()[::-1] - # return [(real_indices[item], real_values[item]) for item in local_top_k] - - def get_user_recs(self, u, mask, k): - user_id = self._data.public_users.get(u) - user_recs = self._preds[user_id] - # user_items = self._ratings[u].keys() - user_recs_mask = mask[user_id] - user_recs[~user_recs_mask] = -np.inf - indices, values = zip(*[(self._data.private_items.get(u_list[0]), u_list[1]) - for u_list in enumerate(user_recs)]) 
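Editor's note: the masking-and-ranking step performed here in `get_user_recs` (and repeated almost verbatim in the user-kNN and Aiolli variants later in this diff) boils down to one small helper. A hedged sketch with illustrative names, not the framework's API:

```python
import numpy as np

def rank_top_k(score_row, allowed_mask, k, private_items):
    """Return the k best (item_id, score) pairs for one user.

    score_row:     dense scores for every item column (e.g. URM.dot(W)[u])
    allowed_mask:  boolean row, False for items that must be excluded
    private_items: mapping from internal column index back to item id
    """
    scores = score_row.astype(np.float64, copy=True)
    scores[~allowed_mask] = -np.inf              # knock out already-seen items
    k = min(k, scores.shape[0])
    top = np.argpartition(scores, -k)[-k:]       # unordered top-k indices
    top = top[np.argsort(scores[top])[::-1]]     # order best first
    return [(private_items[idx], scores[idx]) for idx in top]
```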
- - # indices, values = zip(*predictions.items()) - indices = np.array(indices) - values = np.array(values) - local_k = min(k, len(values)) - partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - real_values = values[partially_ordered_preds_indices] - real_indices = indices[partially_ordered_preds_indices] - local_top_k = real_values.argsort()[::-1] - return [(real_indices[item], real_values[item]) for item in local_top_k] - - # @staticmethod - # def score_item(neighs, user_items): - # num = sum([v for k, v in neighs.items() if k in user_items]) - # den = sum(np.power(list(neighs.values()), 1)) - # return num/den if den != 0 else 0 - - def get_model_state(self): - saving_dict = {} - saving_dict['_neighbors'] = self._neighbors - saving_dict['_similarity'] = self._similarity - saving_dict['_num_neighbors'] = self._num_neighbors - return saving_dict - - def set_model_state(self, saving_dict): - self._neighbors = saving_dict['_neighbors'] - self._similarity = saving_dict['_similarity'] - self._num_neighbors = saving_dict['_num_neighbors'] diff --git a/external/models/attribute_user_knn/__init__.py b/external/models/attribute_user_knn/__init__.py deleted file mode 100644 index a542e236..00000000 --- a/external/models/attribute_user_knn/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .attribute_user_knn import AttributeUserKNN \ No newline at end of file diff --git a/external/models/attribute_user_knn/attribute_user_knn.py b/external/models/attribute_user_knn/attribute_user_knn.py deleted file mode 100644 index 9509d804..00000000 --- a/external/models/attribute_user_knn/attribute_user_knn.py +++ /dev/null @@ -1,170 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import numpy as np -import pickle -import time -import typing as t -import scipy.sparse as sp - -from elliot.recommender.recommender_utils_mixin import RecMixin -from elliot.utils.write import store_recommendation - -from elliot.recommender.base_recommender_model import BaseRecommenderModel -from .attribute_user_knn_similarity import Similarity -from .tfidf_utils import TFIDF -from elliot.recommender.base_recommender_model import init_charger - - -class AttributeUserKNN(RecMixin, BaseRecommenderModel): - r""" - Attribute User-kNN proposed in MyMediaLite Recommender System Library - - For further details, please refer to the `paper `_ - - Args: - neighbors: Number of item neighbors - similarity: Similarity function - profile: Profile type ('binary', 'tfidf') - - To include the recommendation model, add it to the config file adopting the following pattern: - - .. 
code:: yaml - - models: - AttributeUserKNN: - meta: - save_recs: True - neighbors: 40 - similarity: cosine - profile: binary - """ - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - - self._params_list = [ - ("_num_neighbors", "neighbors", "nn", 40, int, None), - ("_similarity", "similarity", "sim", "cosine", None, None), - ("_profile_type", "profile", "profile", "binary", None, None), - ("_implicit", "implicit", "bin", False, None, None), - ("_loader", "loader", "load", "ItemAttributes", None, None), - ] - self.autoset_params() - - self._ratings = self._data.train_dict - - self._side = getattr(self._data.side_information, self._loader, None) - - if self._profile_type == "tfidf": - self._tfidf_obj = TFIDF(self._side.feature_map) - self._tfidf = self._tfidf_obj.tfidf() - self._user_profiles = self._tfidf_obj.get_profiles(self._ratings) - else: - self._user_profiles = {user: self.compute_binary_profile(user_items) for user, user_items in self._ratings.items()} - - self._i_feature_dict = {self._data.public_users[user]: {self._side.public_features[feature]: value - for feature, value in user_features.items()} - for user, user_features in self._user_profiles.items()} - self._sp_i_features = self.build_feature_sparse_values() - - self._model = Similarity(data=self._data, attribute_matrix=self._sp_i_features, num_neighbors=self._num_neighbors, similarity=self._similarity, implicit=self._implicit) - - def get_single_recommendation(self, mask, k, *args): - return {u: self._model.get_user_recs(u, mask, k) for u in self._ratings.keys()} - - def get_recommendations(self, k: int = 10): - predictions_top_k_val = {} - predictions_top_k_test = {} - - recs_val, recs_test = self.process_protocol(k) - - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - - return predictions_top_k_val, predictions_top_k_test - - @property - def name(self): - return f"AttributeUserKNN_{self.get_params_shortcut()}" - - def train(self): - if self._restore: - return self.restore_weights() - - start = time.time() - self._model.initialize() - end = time.time() - print(f"The similarity computation has taken: {end - start}") - - print(f"Transactions: {self._data.transactions}") - - - self.evaluate() - # best_metric_value = 0 - # - # recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - # result_dict = self.evaluator.eval(recs) - # self._results.append(result_dict) - # print(f'Finished') - # - # if self._results[-1][self._validation_k]["val_results"][self._validation_metric] > best_metric_value: - # print("******************************************") - # if self._save_weights: - # with open(self._saving_filepath, "wb") as f: - # pickle.dump(self._model.get_model_state(), f) - # if self._save_recs: - # store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}.tsv") - - def restore_weights(self): - try: - with open(self._saving_filepath, "rb") as f: - self._model.set_model_state(pickle.load(f)) - print(f"Model correctly Restored") - - recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - result_dict = self.evaluator.eval(recs) - self._results.append(result_dict) - - print("******************************************") - if self._save_recs: - store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}.tsv") - return True - - except Exception as ex: - print(f"Error in model restoring operation! 
{ex}") - - return False - - def compute_binary_profile(self, user_items_dict: t.Dict): - user_features = {} - partial = 1/len(user_items_dict) - for item in user_items_dict.keys(): - for feature in self._side.feature_map.get(item,[]): - user_features[feature] = user_features.get(feature, 0) + partial - return user_features - - def build_feature_sparse(self): - - rows_cols = [(i, f) for i, features in self._i_feature_dict.items() for f in features] - rows = [u for u, _ in rows_cols] - cols = [i for _, i in rows_cols] - data = sp.csr_matrix((np.ones_like(rows), (rows, cols)), dtype='float32', - shape=(self._num_items, len(self._side.public_features))) - return data - - def build_feature_sparse_values(self): - rows_cols_values = [(u, f, v) for u, features in self._i_feature_dict.items() for f, v in features.items()] - rows = [u for u, _, _ in rows_cols_values] - cols = [i for _, i, _ in rows_cols_values] - values = [r for _, _, r in rows_cols_values] - - data = sp.csr_matrix((values, (rows, cols)), dtype='float32', - shape=(self._num_users, len(self._side.public_features))) - - return data diff --git a/external/models/attribute_user_knn/attribute_user_knn_similarity.py b/external/models/attribute_user_knn/attribute_user_knn_similarity.py deleted file mode 100644 index e5fcedb1..00000000 --- a/external/models/attribute_user_knn/attribute_user_knn_similarity.py +++ /dev/null @@ -1,187 +0,0 @@ - -import numpy as np -from scipy import sparse -from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, haversine_distances, chi2_kernel, manhattan_distances -from sklearn.metrics import pairwise_distances - - - - -class Similarity(object): - """ - Simple kNN class - """ - - def __init__(self, data, attribute_matrix, num_neighbors, similarity, implicit): - self._data = data - self._ratings = data.train_dict - self._attribute_matrix = attribute_matrix - self._num_neighbors = num_neighbors - self._similarity = similarity - self._implicit = implicit - - if self._implicit: - self._URM = self._data.sp_i_train - else: - self._URM = self._data.sp_i_train_ratings - - self._users = self._data.users - self._items = self._data.items - self._private_users = self._data.private_users - self._public_users = self._data.public_users - self._private_items = self._data.private_items - self._public_items = self._data.public_items - - def initialize(self): - """ - This function initialize the data model - """ - - supported_similarities = ["cosine", "dot", ] - supported_dissimilarities = ["euclidean", "manhattan", "haversine", "chi2", 'cityblock', 'l1', 'l2', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] - print(f"\nSupported Similarities: {supported_similarities}") - print(f"Supported Distances/Dissimilarities: {supported_dissimilarities}\n") - - # self._user_ratings = self._ratings - # - # self._item_ratings = {} - # for u, user_items in self._ratings.items(): - # for i, v in user_items.items(): - # self._item_ratings.setdefault(i, {}).update({u: v}) - # - # self._transactions = self._data.transactions - - self._similarity_matrix = np.empty((len(self._users), len(self._users))) - - self.process_similarity(self._similarity) - - ############## - data, rows_indices, cols_indptr = [], [], [] - - column_row_index = np.arange(len(self._users), dtype=np.int32) - - for user_idx in range(len(self._users)): - 
cols_indptr.append(len(data)) - column_data = self._similarity_matrix[:, user_idx] - - non_zero_data = column_data != 0 - - idx_sorted = np.argsort(column_data[non_zero_data]) # sort by column - top_k_idx = idx_sorted[-self._num_neighbors:] - - data.extend(column_data[non_zero_data][top_k_idx]) - rows_indices.extend(column_row_index[non_zero_data][top_k_idx]) - - cols_indptr.append(len(data)) - - W_sparse = sparse.csc_matrix((data, rows_indices, cols_indptr), - shape=(len(self._users), len(self._users)), dtype=np.float32).tocsr() - self._preds = W_sparse.dot(self._URM).toarray() - ############## - # self.compute_neighbors() - - del self._similarity_matrix - - # def compute_neighbors(self): - # self._neighbors = {} - # for x in range(self._similarity_matrix.shape[0]): - # arr = np.concatenate((self._similarity_matrix[0:x, x], [-np.inf], self._similarity_matrix[x, x+1:])) - # top_indices = np.argpartition(arr, -self._num_neighbors)[-self._num_neighbors:] - # arr = arr[top_indices] - # self._neighbors[self._private_users[x]] = {self._private_users[i]: arr[p] for p, i in enumerate(top_indices)} - # - # def get_user_neighbors(self, item): - # return self._neighbors.get(item, {}) - - def process_similarity(self, similarity): - if similarity == "cosine": - self._similarity_matrix = cosine_similarity(self._attribute_matrix) - elif similarity == "dot": - self._similarity_matrix = (self._attribute_matrix @ self._attribute_matrix.T).toarray() - elif similarity == "euclidean": - self._similarity_matrix = (1 / (1 + euclidean_distances(self._attribute_matrix))) - elif similarity == "manhattan": - self._similarity_matrix = (1 / (1 + manhattan_distances(self._attribute_matrix))) - elif similarity == "haversine": - self._similarity_matrix = (1 / (1 + haversine_distances(self._attribute_matrix))) - elif similarity == "chi2": - self._similarity_matrix = (1 / (1 + chi2_kernel(self._attribute_matrix))) - elif similarity in ['cityblock', 'l1', 'l2']: - self._similarity_matrix = (1 / (1 + pairwise_distances(self._attribute_matrix, metric=similarity))) - elif similarity in ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']: - self._similarity_matrix = (1 / (1 + pairwise_distances(self._attribute_matrix.toarray(), metric=similarity))) - else: - raise Exception("Not implemented similarity") - - def get_user_recs(self, u, mask, k): - user_id = self._data.public_users.get(u) - user_recs = self._preds[user_id] - # user_items = self._ratings[u].keys() - user_recs_mask = mask[user_id] - user_recs[~user_recs_mask] = -np.inf - indices, values = zip(*[(self._data.private_items.get(u_list[0]), u_list[1]) - for u_list in enumerate(user_recs)]) - - # indices, values = zip(*predictions.items()) - indices = np.array(indices) - values = np.array(values) - local_k = min(k, len(values)) - partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - real_values = values[partially_ordered_preds_indices] - real_indices = indices[partially_ordered_preds_indices] - local_top_k = real_values.argsort()[::-1] - return [(real_indices[item], real_values[item]) for item in local_top_k] - - # def process_cosine(self): - # x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) - # self._similarity_matrix[x, y] = cosine_similarity(self._attribute_matrix)[x, y] - # # g = np.vectorize(self.compute_cosine) - # # g(x, y) - # # for item_row in 
range(self._similarity_matrix.shape[0]): - # # for item_col in range(item_row + 1, self._similarity_matrix.shape[1]): - # # self._similarity_matrix[item_row, item_col] = self.compute_cosine( - # # self._item_ratings.get(self._private_items[item_row],{}), self._item_ratings.get(self._private_items[item_col], {})) - - # def compute_cosine(self, i_index, j_index): - # u_dict = self._user_ratings.get(self._private_users[i_index],{}) - # v_dict = self._user_ratings.get(self._private_users[j_index],{}) - # union_keyset = set().union(*[u_dict, v_dict]) - # u: np.ndarray = np.array([[1 if x in u_dict.keys() else 0 for x in union_keyset]]) - # v: np.ndarray = np.array([[1 if x in v_dict.keys() else 0 for x in union_keyset]]) - # self._similarity_matrix[i_index, j_index] = cosine_similarity(u, v)[0, 0] - - # def get_transactions(self): - # return self._transactions - - # def get_user_recs(self, u, mask, k): - # user_items = self._ratings[u].keys() - # user_mask = mask[self._data.public_users[u]] - # predictions = {i: self.score_item(self.get_user_neighbors(u), user_items) for i in self._data.items if - # user_mask[self._data.public_items[i]]} - # indices, values = zip(*predictions.items()) - # indices = np.array(indices) - # values = np.array(values) - # local_k = min(k, len(values)) - # partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - # real_values = values[partially_ordered_preds_indices] - # real_indices = indices[partially_ordered_preds_indices] - # local_top_k = real_values.argsort()[::-1] - # return [(real_indices[item], real_values[item]) for item in local_top_k] - - # @staticmethod - # def score_item(neighs, user_neighs_items): - # num = sum([v for k, v in neighs.items() if k in user_neighs_items]) - # den = sum(np.power(list(neighs.values()), 1)) - # return num/den if den != 0 else 0 - - def get_model_state(self): - saving_dict = {} - saving_dict['_neighbors'] = self._neighbors - saving_dict['_similarity'] = self._similarity - saving_dict['_num_neighbors'] = self._num_neighbors - return saving_dict - - def set_model_state(self, saving_dict): - self._neighbors = saving_dict['_neighbors'] - self._similarity = saving_dict['_similarity'] - self._num_neighbors = saving_dict['_num_neighbors'] diff --git a/external/models/attribute_user_knn/tfidf_utils.py b/external/models/attribute_user_knn/tfidf_utils.py deleted file mode 100644 index ab37717b..00000000 --- a/external/models/attribute_user_knn/tfidf_utils.py +++ /dev/null @@ -1,28 +0,0 @@ -import typing as t -from collections import Counter -import math - -class TFIDF: - def __init__(self, map: t.Dict[int, t.List[int]]): - self.__map = map - self.__o = Counter(feature for feature_list in self.__map.values() for feature in feature_list ) - self.__maxi = max(self.__o.values()) - self.__total_documents = len(self.__map) - self.__idfo = {k: math.log(self.__total_documents/v) for k, v in self.__o.items()} - self.__tfidf = {} - for k, v in self.__map.items(): - normalization = math.sqrt(sum([self.__idfo[i]**2 for i in v])) - self.__tfidf[k] ={i:self.__idfo[i]/normalization for i in v} - - def tfidf(self): - return self.__tfidf - - def get_profiles(self, ratings: t.Dict[int, t.Dict[int, float]]): - profiles = {} - profiles = {u: {f: profiles.get(u, {}).get(f, 0) + v for i in items.keys() if i in self.__tfidf.keys() for f, v in self.__tfidf[i].items()} for u, items in ratings.items()} - profiles = {u: {f: v/len(ratings[u]) for f, v in f_dict.items()} for u, f_dict in profiles.items()} - # print(profiles[0]) - # for u, 
items in ratings.items(): - # profiles[u] = {} - # profiles[u] = {f: profiles[u].get(f, 0) for i in items.keys() for f in self.__tfidf[i]} - return profiles diff --git a/external/models/iALS/__init__.py b/external/models/iALS/__init__.py deleted file mode 100644 index c709306b..00000000 --- a/external/models/iALS/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .iALS import iALS \ No newline at end of file diff --git a/external/models/iALS/iALS.py b/external/models/iALS/iALS.py deleted file mode 100644 index aeeae128..00000000 --- a/external/models/iALS/iALS.py +++ /dev/null @@ -1,147 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import numpy as np -import pickle - -from elliot.recommender.recommender_utils_mixin import RecMixin -from elliot.utils.write import store_recommendation -from .iALS_model import iALSModel -from elliot.recommender.base_recommender_model import BaseRecommenderModel -from elliot.recommender.base_recommender_model import init_charger - -np.random.seed(42) - - -class iALS(RecMixin, BaseRecommenderModel): - r""" - Weighted XXX Matrix Factorization - - For further details, please refer to the `paper `_ - - Args: - factors: Number of latent factors - lr: Learning rate - alpha: - reg: Regularization coefficient - - To include the recommendation model, add it to the config file adopting the following pattern: - - .. code:: yaml - - models: - WRMF: - meta: - save_recs: True - epochs: 10 - factors: 50 - alpha: 1 - reg: 0.1 - """ - - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - self._random = np.random - - self._params_list = [ - ("_factors", "factors", "factors", 10, int, None), - ("_alpha", "alpha", "alpha", 1, float, None), - ("_epsilon", "epsilon", "epsilon", 1, float, None), - ("_reg", "reg", "reg", 0.1, float, None), - ("_scaling", "scaling", "scaling", "linear", None, None) - ] - self.autoset_params() - - self._ratings = self._data.train_dict - self._sp_i_train = self._data.sp_i_train - - self._model = iALSModel(self._factors, self._data, self._random, self._alpha, self._epsilon, self._reg, - self._scaling) - - # def get_recommendations(self, k: int = 100): - # return {u: self._model.get_user_recs(u, k) for u in self._ratings.keys()} - - def get_recommendations(self, k: int = 10): - self._model.prepare_predictions() - - predictions_top_k_val = {} - predictions_top_k_test = {} - - recs_val, recs_test = self.process_protocol(k) - - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - - return predictions_top_k_val, predictions_top_k_test - - def get_single_recommendation(self, mask, k, *args): - return {u: self._model.get_user_recs(u, mask, k) for u in self._data.train_dict.keys()} - - # def predict(self, u: int, i: int): - # """ - # Get prediction on the user item pair. - # - # Returns: - # A single float vaue. 
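Editor's note: the comprehension-heavy `TFIDF` helper removed above (`tfidf_utils.py`) is equivalent to the following untangled sketch; the function names are illustrative, while the arithmetic mirrors the deleted code:

```python
import math
from collections import Counter

def tfidf_item_vectors(feature_map):
    """Item TF-IDF as in the deleted tfidf_utils.TFIDF helper.

    feature_map: {item_id: [feature_id, ...]}. Each feature of an item gets
    weight idf(f) / ||idf vector of that item||_2, where
    idf(f) = log(#items / #items containing f).
    """
    counts = Counter(f for feats in feature_map.values() for f in feats)
    n_items = len(feature_map)
    idf = {f: math.log(n_items / c) for f, c in counts.items()}
    vectors = {}
    for item, feats in feature_map.items():
        norm = math.sqrt(sum(idf[f] ** 2 for f in feats))
        vectors[item] = {f: idf[f] / norm for f in feats}
    return vectors

def user_profiles(vectors, ratings):
    """Average the TF-IDF vectors of each user's rated items."""
    profiles = {}
    for u, items in ratings.items():
        acc = {}
        for i in items:
            for f, v in vectors.get(i, {}).items():
                acc[f] = acc.get(f, 0.0) + v
        profiles[u] = {f: v / len(items) for f, v in acc.items()}
    return profiles
```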
- # """ - # return self._model.predict(u, i) - - @property - def name(self): - return "iALS" \ - + "_e:" + str(self._epochs) \ - + f"_{self.get_params_shortcut()}" - - def train(self): - if self._restore: - return self.restore_weights() - - best_metric_value = 0 - for it in range(self._epochs): - self._model.train_step() - - print("Iteration Finished") - - self.evaluate(it) - - # if not (it + 1) % self._validation_rate: - # recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - # result_dict = self.evaluator.eval(recs) - # self._results.append(result_dict) - # - # print(f'Epoch {(it + 1)}/{self._epochs}') - # - # if self._results[-1][self._validation_k]["val_results"][self._validation_metric] > best_metric_value: - # print("******************************************") - # best_metric_value = self._results[-1][self._validation_k]["val_results"][self._validation_metric] - # if self._save_weights: - # with open(self._saving_filepath, "wb") as f: - # pickle.dump(self._model.get_model_state(), f) - # if self._save_recs: - # store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}-it:{it + 1}.tsv") - - def restore_weights(self): - try: - with open(self._saving_filepath, "rb") as f: - self._model.set_model_state(pickle.load(f)) - print(f"Model correctly Restored") - - recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - result_dict = self.evaluator.eval(recs) - self._results.append(result_dict) - - print("******************************************") - if self._save_recs: - store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}.tsv") - return True - - except Exception as ex: - print(f"Error in model restoring operation! {ex}") - - return False diff --git a/external/models/iALS/iALS_model.py b/external/models/iALS/iALS_model.py deleted file mode 100644 index 9c0ed9cc..00000000 --- a/external/models/iALS/iALS_model.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import numpy as np -from scipy import sparse as sp -from scipy.sparse.linalg import spsolve - - -class iALSModel(object): - """ - Simple Matrix Factorization class - """ - - def __init__(self, factors, data, random, alpha, epsilon, reg, scaling): - - self._data = data - self.random = random - self.C = self._data.sp_i_train - if scaling == "linear": - self.C.data = 1.0 + alpha * self.C.data - elif scaling == "log": - self.C.data = 1.0 + alpha * np.log(1.0 + self.C.data / epsilon) - self.C_csc = self.C.tocsc() - self.train_dict = self._data.train_dict - self.user_num, self.item_num = self._data.num_users, self._data.num_items - - self.X = self.random.normal(scale=0.01, size=(self.user_num, factors)) - self.Y = self.random.normal(scale=0.01, size=(self.item_num, factors)) - - warm_item_mask = np.ediff1d(self._data.sp_i_train.tocsc().indptr) > 0 - self.warm_items = np.arange(0, self.item_num, dtype=np.int32)[warm_item_mask] - - self.X_eye = sp.eye(self.user_num) - self.Y_eye = sp.eye(self.item_num) - self.lambda_eye = reg * sp.eye(factors) - - self.user_vec, self.item_vec, self.pred_mat = None, None, None - - def train_step(self): - yTy = self.Y.T.dot(self.Y) - - C = self.C - for u in range(self.user_num): - start = C.indptr[u] - end = C.indptr[u+1] - - Cu = C.data[start:end] - Pu = self.Y[C.indices[start:end], :] - - B = yTy + Pu.T.dot(((Cu - 1) * Pu.T).T) + self.lambda_eye - - self.X[u] = 
np.dot(np.linalg.inv(B), Pu.T.dot(Cu)) - - # Cu = self.C[u, :].toarray() - # Pu = Cu.copy() - # Pu[Pu != 0] = 1 - # CuI = sp.diags(Cu, [0]) - # CuI.data = CuI.data - 1 - # A = self.Y.T.dot(CuI).dot(self.Y) - # B = yTy + A + self.lambda_eye - # self.X[u] = np.dot(np.linalg.inv(B.toarray()), self.Y.T.dot(CuI)) - - # Cu = self.C[u, :].toarray() - # Pu = Cu.copy() - # Pu[Pu != 0] = 1 - # CuI = sp.diags(Cu, [0]) - # CuI.data = CuI.data - 1 - # yTCuIY = self.Y.T.dot(CuI).dot(self.Y) - # yTCuPu = self.Y.T.dot(CuI + self.Y_eye).dot(Pu.T) - # self.X[u] = spsolve(yTy + yTCuIY + self.lambda_eye, yTCuPu) - - xTx = self.X.T.dot(self.X) - C = self.C_csc - for i in self.warm_items: - start = C.indptr[i] - end = C.indptr[i + 1] - - Cu = C.data[start:end] - Pi = self.X[C.indices[start:end], :] - - B = xTx + Pi.T.dot(((Cu - 1) * Pi.T).T) + self.lambda_eye - - self.Y[i] = np.dot(np.linalg.inv(B), Pi.T.dot(Cu)) - - # Ci = self.C[:, i].T.toarray() - # Pi = Ci.copy() - # Pi[Pi != 0] = 1 - # CiI = sp.diags(Ci, [0]) - # CiI.data = CiI.data - 1 - # xTCiIX = self.X.T.dot(CiI).dot(self.X) - # xTCiPi = self.X.T.dot(CiI + self.X_eye).dot(Pi.T) - # self.Y[i] = spsolve(xTx + xTCiIX + self.lambda_eye, xTCiPi) - - def predict(self, user, item): - return self.pred_mat[self._data.public_users[user], self._data.public_items[item]] - - def get_user_recs(self, user, mask, k=100): - user_mask = mask[self._data.public_users[user]] - predictions = {i: self.predict(user, i) for i in self._data.items if user_mask[self._data.public_items[i]]} - - indices, values = zip(*predictions.items()) - indices = np.array(indices) - values = np.array(values) - local_k = min(k, len(values)) - partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - real_values = values[partially_ordered_preds_indices] - real_indices = indices[partially_ordered_preds_indices] - local_top_k = real_values.argsort()[::-1] - return [(real_indices[item], real_values[item]) for item in local_top_k] - - def get_model_state(self): - saving_dict = {} - saving_dict['pred_mat'] = self.pred_mat - saving_dict['X'] = self.X - saving_dict['Y'] = self.Y - saving_dict['C'] = self.C - return saving_dict - - def set_model_state(self, saving_dict): - self.pred_mat = saving_dict['pred_mat'] - self.X = saving_dict['X'] - self.Y = saving_dict['Y'] - self.C = saving_dict['C'] - - def prepare_predictions(self): - self.pred_mat = self.X.dot(self.Y.T) diff --git a/external/models/item_knn/__init__.py b/external/models/item_knn/__init__.py deleted file mode 100644 index f843566b..00000000 --- a/external/models/item_knn/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ - -from .item_knn import ItemKNN \ No newline at end of file diff --git a/external/models/item_knn/aiolli_ferrari.py b/external/models/item_knn/aiolli_ferrari.py deleted file mode 100644 index 9e8ec608..00000000 --- a/external/models/item_knn/aiolli_ferrari.py +++ /dev/null @@ -1,508 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on 23/10/17 -@author: Maurizio Ferrari Dacrema - -""" - -import numpy as np -import time, sys -import scipy.sparse as sp - - -def check_matrix(X, format='csc', dtype=np.float32): - """ - This function takes a matrix as input and transforms it into the specified format. - The matrix in input can be either sparse or ndarray. 
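Editor's note: the per-user solve inside `iALS_model.train_step` (deleted above) is the usual closed-form implicit-ALS update. A minimal sketch, with an illustrative helper name and `numpy.linalg.solve` in place of the explicit inverse:

```python
import numpy as np

def ials_user_update(Y, item_idx, confidence, reg):
    """Closed-form update for one user's latent factors.

    Y:          (num_items, factors) item-factor matrix
    item_idx:   indices of the items the user interacted with
    confidence: 1-D array of confidences c_ui = 1 + alpha * r_ui (linear scaling)
    Solves x_u = (Y^T Y + P_u^T diag(c_u - 1) P_u + reg*I)^{-1} P_u^T c_u,
    matching  X[u] = inv(B) @ Pu.T @ Cu  in the deleted train_step.
    """
    factors = Y.shape[1]
    Pu = Y[item_idx]                                   # (n_u, factors)
    B = Y.T @ Y + Pu.T @ ((confidence - 1)[:, None] * Pu) + reg * np.eye(factors)
    return np.linalg.solve(B, Pu.T @ confidence)       # avoids forming inv(B)
```

Using `solve` instead of `inv` is only a numerical-stability nicety; the result is the same update the deleted loop computes for each user (and, transposed, for each warm item).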
- If the matrix in input has already the desired format, it is returned as-is - the dtype parameter is always applied and the default is np.float32 - :param X: - :param format: - :param dtype: - :return: - """ - - if format == 'csc' and not isinstance(X, sp.csc_matrix): - return X.tocsc().astype(dtype) - elif format == 'csr' and not isinstance(X, sp.csr_matrix): - return X.tocsr().astype(dtype) - elif format == 'coo' and not isinstance(X, sp.coo_matrix): - return X.tocoo().astype(dtype) - elif format == 'dok' and not isinstance(X, sp.dok_matrix): - return X.todok().astype(dtype) - elif format == 'bsr' and not isinstance(X, sp.bsr_matrix): - return X.tobsr().astype(dtype) - elif format == 'dia' and not isinstance(X, sp.dia_matrix): - return X.todia().astype(dtype) - elif format == 'lil' and not isinstance(X, sp.lil_matrix): - return X.tolil().astype(dtype) - elif isinstance(X, np.ndarray): - X = sp.csr_matrix(X, dtype=dtype) - X.eliminate_zeros() - return check_matrix(X, format=format, dtype=dtype) - else: - return X.astype(dtype) - - -class AiolliSimilarity(object): - def __init__(self, data, - maxk=40, - shrink=100, - similarity='cosine', - implicit=False, - normalize=True, - asymmetric_alpha=0.5, - tversky_alpha = 1.0, - tversky_beta = 1.0, - row_weights = None): - """ - ItemKNN recommender - Parameters - ---------- - user_num : int, the number of users - item_num : int, the number of items - maxk : int, the max similar items number - shrink : float, shrink similarity value - similarity : str, way to calculate similarity - normalize : bool, whether calculate similarity with normalized value - """ - self._data = data - self._implicit = implicit - - if self._implicit: - self._train_set = data.sp_i_train - else: - self._train_set = self._data.sp_i_train_ratings - - self._private_users = self._data.private_users - self._public_users = self._data.public_users - self._private_items = self._data.private_items - self._public_items = self._data.public_items - - self.user_num = self._data.num_users - self.item_num = self._data.num_items - - self.k = maxk - self.shrink = shrink - self.normalize = normalize - self.similarity = similarity - self.asymmetric_alpha = asymmetric_alpha - self.tversky_alpha = tversky_alpha - self.tversky_beta = tversky_beta - self.row_weights = row_weights - - self.RECOMMENDER_NAME = "ItemKNNCFRecommender" - - # self.pred_mat = None - # self.yr = None - - def initialize(self): - # self.yr = defaultdict(list) - # for _, row in self._train_set.iterrows(): - # self.yr[int(row['user'])].append((int(row['item']), row['rating'])) - - # train = self._convert_df(self.user_num, self.item_num, self._train_set) - train = self._train_set.tocsc() - - cold_items_mask = np.ediff1d(train.tocsc().indptr) == 0 - - if cold_items_mask.any(): - print("{}: Detected {} ({:.2f} %) cold items.".format( - self.RECOMMENDER_NAME, cold_items_mask.sum(), cold_items_mask.sum() / len(cold_items_mask) * 100)) - - similarity = Compute_Similarity(train, - shrink=self.shrink, - topK=self.k, - normalize=self.normalize, - similarity=self.similarity, - asymmetric_alpha=self.asymmetric_alpha, - tversky_alpha=self.tversky_alpha, - tversky_beta=self.tversky_beta, - row_weights=self.row_weights - ) - - w_sparse = similarity.compute_similarity() - w_sparse = w_sparse.tocsc() - - # self.pred_mat = train.dot(w_sparse).tolil() - self.pred_mat = train.dot(w_sparse).toarray() - - def get_user_recs(self, u, mask, k): - user_id = self._data.public_users.get(u) - user_recs = self.pred_mat[user_id] - # user_items = 
self._ratings[u].keys() - user_recs_mask = mask[user_id] - user_recs[~user_recs_mask] = -np.inf - indices, values = zip(*[(self._data.private_items.get(u_list[0]), u_list[1]) - for u_list in enumerate(user_recs)]) - - # indices, values = zip(*predictions.items()) - indices = np.array(indices) - values = np.array(values) - local_k = min(k, len(values)) - partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - real_values = values[partially_ordered_preds_indices] - real_indices = indices[partially_ordered_preds_indices] - local_top_k = real_values.argsort()[::-1] - return [(real_indices[item], real_values[item]) for item in local_top_k] - - # def get_user_recs(self, user, k=100): - # user_items = self._data.train_dict[user].keys() - # predictions = {i: self.predict(user, i) for i in self._data.items if i not in user_items} - # indices, values = zip(*predictions.items()) - # indices = np.array(indices) - # values = np.array(values) - # local_k = min(k, len(values)) - # partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - # real_values = values[partially_ordered_preds_indices] - # real_indices = indices[partially_ordered_preds_indices] - # local_top_k = real_values.argsort()[::-1] - # return [(real_indices[item], real_values[item]) for item in local_top_k] - - # def predict(self, u, i): - # indexed_user = self._public_users[u] - # indexed_item = self._public_items[i] - # if indexed_user >= self.user_num or indexed_item >= self.item_num: - # raise ValueError('User and/or item is unkown.') - # - # return self.pred_mat[indexed_user, indexed_item] - - def _convert_df(self, user_num, item_num, df): - """Process Data to make ItemKNN available""" - ratings = list(df['rating']) - rows = list(df['user']) - cols = list(df['item']) - - mat = sp.csc_matrix((ratings, (rows, cols)), shape=(user_num, item_num)) - - return mat - - -class Compute_Similarity: - - def __init__(self, dataMatrix, topK=100, shrink=0, normalize=True, - asymmetric_alpha=0.5, tversky_alpha=1.0, tversky_beta=1.0, - similarity="cosine", row_weights=None): - """ - Computes the cosine similarity on the columns of dataMatrix - If it is computed on URM=|users|x|items|, pass the URM as is. - If it is computed on ICM=|items|x|features|, pass the ICM transposed. - :param dataMatrix: - :param topK: - :param shrink: - :param normalize: If True divide the dot product by the product of the norms - :param row_weights: Multiply the values in each row by a specified value. Array - :param asymmetric_alpha Coefficient alpha for the asymmetric cosine - :param similarity: "cosine" computes Cosine similarity - "adjusted" computes Adjusted Cosine, removing the average of the users - "asymmetric" computes Asymmetric Cosine - "pearson" computes Pearson Correlation, removing the average of the items - "jaccard" computes Jaccard similarity for binary interactions using Tanimoto - "dice" computes Dice similarity for binary interactions - "tversky" computes Tversky similarity for binary interactions - "tanimoto" computes Tanimoto coefficient for binary interactions - """ - """ - Asymmetric Cosine as described in: - Aiolli, F. (2013, October). Efficient top-n recommendation for very large scale binary rated datasets. In Proceedings of the 7th ACM conference on Recommender systems (pp. 273-280). ACM. 
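Editor's note: of the similarity modes listed above, asymmetric cosine (Aiolli, 2013) is the least standard. A dense reference sketch of the formula, not the blocked `Compute_Similarity` implementation; `alpha = 0.5` recovers plain cosine:

```python
import numpy as np
import scipy.sparse as sp

def asymmetric_cosine(X, alpha=0.5, shrink=0.0, eps=1e-6):
    """Column-wise asymmetric cosine on a |users| x |items| matrix X.

    sim[i, j] = <x_i, x_j> / (||x_i||^(2*(1-alpha)) * ||x_j||^(2*alpha) + shrink + eps),
    with the diagonal zeroed out, as in the deleted compute_similarity loop.
    """
    X = sp.csr_matrix(X, dtype=np.float32)
    dot = (X.T @ X).toarray()                         # item-item dot products
    norm_sq = np.asarray(X.power(2).sum(axis=0)).ravel()
    denom = (np.power(norm_sq, 1.0 - alpha)[:, None]
             * np.power(norm_sq, alpha)[None, :]) + shrink + eps
    sim = dot / denom
    np.fill_diagonal(sim, 0.0)                        # no self-similarity
    return sim
```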
- - """ - - # super(Compute_Similarity_Python, self).__init__() - - self.shrink = shrink - self.normalize = normalize - - self.n_rows, self.n_columns = dataMatrix.shape - self.TopK = min(topK, self.n_columns) - - self.asymmetric_alpha = asymmetric_alpha - self.tversky_alpha = tversky_alpha - self.tversky_beta = tversky_beta - - self.dataMatrix = dataMatrix.copy() - - self.adjusted_cosine = False - self.asymmetric_cosine = False - self.pearson_correlation = False - self.tanimoto_coefficient = False - self.dice_coefficient = False - self.tversky_coefficient = False - - if similarity == "adjusted": - self.adjusted_cosine = True - elif similarity == "asymmetric": - self.asymmetric_cosine = True - elif similarity == "pearson": - self.pearson_correlation = True - elif similarity == "jaccard" or similarity == "tanimoto": - self.tanimoto_coefficient = True - # Tanimoto has a specific kind of normalization - self.normalize = False - - elif similarity == "dice": - self.dice_coefficient = True - self.normalize = False - - elif similarity == "tversky": - self.tversky_coefficient = True - self.normalize = False - - elif similarity == "cosine": - pass - else: - raise ValueError("Compute_Similarity: value for parameter 'mode' not recognized." - "\nAllowed values are: 'cosine', 'pearson', 'adjusted', 'asymmetric', 'jaccard', 'tanimoto'," - "dice, tversky." - "\nPassed value was '{}'\nTry with implementation: standard".format(similarity)) - - self.use_row_weights = False - - if row_weights is not None: - - if dataMatrix.shape[0] != len(row_weights): - raise ValueError("Cosine_Similarity: provided row_weights and dataMatrix have different number of rows." - "Col_weights has {} columns, dataMatrix has {}.".format(len(row_weights), - dataMatrix.shape[0])) - - self.use_row_weights = True - self.row_weights = row_weights.copy() - self.row_weights_diag = sp.diags(self.row_weights) - - self.dataMatrix_weighted = self.dataMatrix.T.dot(self.row_weights_diag).T - - def applyAdjustedCosine(self): - """ - Remove from every data point the average for the corresponding row - :return: - """ - - self.dataMatrix = check_matrix(self.dataMatrix, 'csr') - - interactionsPerRow = np.diff(self.dataMatrix.indptr) - - nonzeroRows = interactionsPerRow > 0 - sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel() - - rowAverage = np.zeros_like(sumPerRow) - rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows] - - # Split in blocks to avoid duplicating the whole data structure - start_row = 0 - end_row = 0 - - blockSize = 1000 - - while end_row < self.n_rows: - end_row = min(self.n_rows, end_row + blockSize) - - self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \ - np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row]) - - start_row += blockSize - - def applyPearsonCorrelation(self): - """ - Remove from every data point the average for the corresponding column - :return: - """ - - self.dataMatrix = check_matrix(self.dataMatrix, 'csc') - - interactionsPerCol = np.diff(self.dataMatrix.indptr) - - nonzeroCols = interactionsPerCol > 0 - sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel() - - colAverage = np.zeros_like(sumPerCol) - colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols] - - # Split in blocks to avoid duplicating the whole data structure - start_col = 0 - end_col = 0 - - blockSize = 1000 - - while end_col < self.n_columns: - end_col = min(self.n_columns, end_col + blockSize) - - 
self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \ - np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col]) - - start_col += blockSize - - def useOnlyBooleanInteractions(self): - - # Split in blocks to avoid duplicating the whole data structure - start_pos = 0 - end_pos = 0 - - blockSize = 1000 - - while end_pos < len(self.dataMatrix.data): - end_pos = min(len(self.dataMatrix.data), end_pos + blockSize) - - self.dataMatrix.data[start_pos:end_pos] = np.ones(end_pos - start_pos) - - start_pos += blockSize - - def compute_similarity(self, start_col=None, end_col=None, block_size=100): - """ - Compute the similarity for the given dataset - :param self: - :param start_col: column to begin with - :param end_col: column to stop before, end_col is excluded - :return: - """ - - values = [] - rows = [] - cols = [] - - start_time = time.time() - start_time_print_batch = start_time - processedItems = 0 - - if self.adjusted_cosine: - self.applyAdjustedCosine() - - elif self.pearson_correlation: - self.applyPearsonCorrelation() - - elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient: - self.useOnlyBooleanInteractions() - - # We explore the matrix column-wise - self.dataMatrix = check_matrix(self.dataMatrix, 'csc') - - # Compute sum of squared values to be used in normalization - sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel() - - # Tanimoto does not require the square root to be applied - if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient): - sumOfSquared = np.sqrt(sumOfSquared) - - if self.asymmetric_cosine: - sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha)) - sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha) - - self.dataMatrix = check_matrix(self.dataMatrix, 'csc') - - start_col_local = 0 - end_col_local = self.n_columns - - if start_col is not None and start_col > 0 and start_col < self.n_columns: - start_col_local = start_col - - if end_col is not None and end_col > start_col_local and end_col < self.n_columns: - end_col_local = end_col - - start_col_block = start_col_local - - this_block_size = 0 - - # Compute all similarities for each item using vectorization - while start_col_block < end_col_local: - - end_col_block = min(start_col_block + block_size, end_col_local) - this_block_size = end_col_block - start_col_block - - # All data points for a given item - item_data = self.dataMatrix[:, start_col_block:end_col_block] - item_data = item_data.toarray().squeeze() - - # If only 1 feature avoid last dimension to disappear - if item_data.ndim == 1: - item_data = np.atleast_2d(item_data) - - if self.use_row_weights: - this_block_weights = self.dataMatrix_weighted.T.dot(item_data) - - else: - # Compute item similarities - this_block_weights = self.dataMatrix.T.dot(item_data) - - for col_index_in_block in range(this_block_size): - - if this_block_size == 1: - this_column_weights = this_block_weights - else: - this_column_weights = this_block_weights[:, col_index_in_block] - - columnIndex = col_index_in_block + start_col_block - this_column_weights[columnIndex] = 0.0 - - # Apply normalization and shrinkage, ensure denominator != 0 - if self.normalize: - - if self.asymmetric_cosine: - denominator = sumOfSquared_to_alpha[ - columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6 - else: - denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6 - - 
this_column_weights = np.multiply(this_column_weights, 1 / denominator) - - # Apply the specific denominator for Tanimoto - elif self.tanimoto_coefficient: - denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6 - this_column_weights = np.multiply(this_column_weights, 1 / denominator) - - elif self.dice_coefficient: - denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6 - this_column_weights = np.multiply(this_column_weights, 1 / denominator) - - elif self.tversky_coefficient: - denominator = this_column_weights + \ - (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \ - (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6 - this_column_weights = np.multiply(this_column_weights, 1 / denominator) - - # If no normalization or tanimoto is selected, apply only shrink - elif self.shrink != 0: - this_column_weights = this_column_weights / self.shrink - - # this_column_weights = this_column_weights.toarray().ravel() - - # Sort indices and select TopK - # Sorting is done in three steps. Faster then plain np.argsort for higher number of items - # - Partition the data to extract the set of relevant items - # - Sort only the relevant items - # - Get the original item index - relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK] - relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition]) - top_k_idx = relevant_items_partition[relevant_items_partition_sorting] - - # Incrementally build sparse matrix, do not add zeros - notZerosMask = this_column_weights[top_k_idx] != 0.0 - numNotZeros = np.sum(notZerosMask) - - values.extend(this_column_weights[top_k_idx][notZerosMask]) - rows.extend(top_k_idx[notZerosMask]) - cols.extend(np.ones(numNotZeros) * columnIndex) - - # Add previous block size - processedItems += this_block_size - - if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local: - columnPerSec = processedItems / (time.time() - start_time + 1e-9) - - print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format( - processedItems, processedItems / (end_col_local - start_col_local) * 100, columnPerSec, - (time.time() - start_time) / 60)) - - sys.stdout.flush() - sys.stderr.flush() - - start_time_print_batch = time.time() - - start_col_block += block_size - - # End while on columns - - W_sparse = sp.csr_matrix((values, (rows, cols)), - shape=(self.n_columns, self.n_columns), - dtype=np.float32) - - return W_sparse diff --git a/external/models/item_knn/item_knn.py b/external/models/item_knn/item_knn.py deleted file mode 100644 index 6ae22620..00000000 --- a/external/models/item_knn/item_knn.py +++ /dev/null @@ -1,145 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import numpy as np -import pickle -import time -import scipy.sparse as sp - -from elliot.recommender.recommender_utils_mixin import RecMixin -from elliot.utils.write import store_recommendation - -from elliot.recommender.base_recommender_model import BaseRecommenderModel -from .item_knn_similarity import Similarity -from .aiolli_ferrari import AiolliSimilarity -from elliot.recommender.base_recommender_model import init_charger - - -class ItemKNN(RecMixin, BaseRecommenderModel): - r""" - Amazon.com recommendations: item-to-item collaborative filtering - - For further 
details, please refer to the `paper `_ - - Args: - neighbors: Number of item neighbors - similarity: Similarity function - implementation: Implementation type ('aiolli', 'classical') - - To include the recommendation model, add it to the config file adopting the following pattern: - - .. code:: yaml - - models: - ItemKNN: - meta: - save_recs: True - neighbors: 40 - similarity: cosine - implementation: aiolli - """ - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - - self._params_list = [ - ("_num_neighbors", "neighbors", "nn", 40, int, None), - ("_similarity", "similarity", "sim", "cosine", None, None), - ("_implementation", "implementation", "imp", "standard", None, None), - ("_implicit", "implicit", "bin", False, None, None), - ("_shrink", "shrink", "shrink", 0, None, None), - ("_normalize", "normalize", "norm", True, None, None), - ("_asymmetric_alpha", "asymmetric_alpha", "asymalpha", False, None, lambda x: x if x else ""), - ("_tversky_alpha", "tversky_alpha", "tvalpha", False, None, lambda x: x if x else ""), - ("_tversky_beta", "tversky_beta", "tvbeta", False, None, lambda x: x if x else ""), - ("_row_weights", "normalize", "rweights", False, None, lambda x: x if x else "") - ] - self.autoset_params() - - self._ratings = self._data.train_dict - if self._implementation == "aiolli": - self._model = AiolliSimilarity(data=self._data, - maxk=self._num_neighbors, - shrink=self._shrink, - similarity=self._similarity, - implicit=self._implicit, - normalize=self._normalize, - asymmetric_alpha=self._asymmetric_alpha, - tversky_alpha=self._tversky_alpha, - tversky_beta=self._tversky_beta, - row_weights=self._row_weights) - else: - if (not self._normalize) or (self._asymmetric_alpha) or (self._tversky_alpha) or (self._tversky_beta) or (self._row_weights) or (self._shrink): - print("Options normalize, asymmetric_alpha, tversky_alpha, tversky_beta, row_weights are ignored with standard implementation. 
Try with implementation: aiolli") - self._model = Similarity(data=self._data, num_neighbors=self._num_neighbors, similarity=self._similarity, implicit=self._implicit) - - def get_single_recommendation(self, mask, k, *args): - return {u: self._model.get_user_recs(u, mask, k) for u in self._ratings.keys()} - - def get_recommendations(self, k: int = 10): - predictions_top_k_val = {} - predictions_top_k_test = {} - - recs_val, recs_test = self.process_protocol(k) - - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - - return predictions_top_k_val, predictions_top_k_test - - @property - def name(self): - return f"ItemKNN_{self.get_params_shortcut()}" - - def train(self): - if self._restore: - return self.restore_weights() - - start = time.time() - self._model.initialize() - end = time.time() - print(f"The similarity computation has taken: {end - start}") - - print(f"Transactions: {self._data.transactions}") - - self.evaluate() - - # best_metric_value = 0 - # - # recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - # result_dict = self.evaluator.eval(recs) - # self._results.append(result_dict) - # print(f'Finished') - # - # if self._results[-1][self._validation_k]["val_results"][self._validation_metric] > best_metric_value: - # print("******************************************") - # if self._save_weights: - # with open(self._saving_filepath, "wb") as f: - # pickle.dump(self._model.get_model_state(), f) - # if self._save_recs: - # store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}.tsv") - - def restore_weights(self): - try: - with open(self._saving_filepath, "rb") as f: - self._model.set_model_state(pickle.load(f)) - print(f"Model correctly Restored") - - recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - result_dict = self.evaluator.eval(recs) - self._results.append(result_dict) - - print("******************************************") - if self._save_recs: - store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}.tsv") - return True - - except Exception as ex: - print(f"Error in model restoring operation! 
{ex}") - - return False diff --git a/external/models/item_knn/item_knn_similarity.py b/external/models/item_knn/item_knn_similarity.py deleted file mode 100644 index aad03336..00000000 --- a/external/models/item_knn/item_knn_similarity.py +++ /dev/null @@ -1,193 +0,0 @@ - -import numpy as np -from scipy import sparse -from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, haversine_distances, chi2_kernel, manhattan_distances -from sklearn.metrics import pairwise_distances -from sklearn.preprocessing import normalize - - -class Similarity(object): - """ - Simple kNN class - """ - - def __init__(self, data, num_neighbors, similarity, implicit): - self._data = data - self._ratings = data.train_dict - self._num_neighbors = num_neighbors - self._similarity = similarity - self._implicit = implicit - - if self._implicit: - self._URM = self._data.sp_i_train - else: - self._URM = self._data.sp_i_train_ratings - - self._users = self._data.users - self._items = self._data.items - self._private_users = self._data.private_users - self._public_users = self._data.public_users - self._private_items = self._data.private_items - self._public_items = self._data.public_items - - def initialize(self): - """ - This function initialize the data model - """ - - self.supported_similarities = ["cosine", "dot", ] - self.supported_dissimilarities = ["euclidean", "manhattan", "haversine", "chi2", 'cityblock', 'l1', 'l2', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] - print(f"\nSupported Similarities: {self.supported_similarities}") - print(f"Supported Distances/Dissimilarities: {self.supported_dissimilarities}\n") - - # self._item_ratings = {} - # for u, user_items in self._ratings.items(): - # for i, v in user_items.items(): - # self._item_ratings.setdefault(i, {}).update({u: v}) - - # self._transactions = self._data.transactions - - self._similarity_matrix = np.empty((len(self._items), len(self._items))) - - self.process_similarity(self._similarity) - - # self._similarity_matrix = normalize(self._similarity_matrix, norm='l1', axis=1) - - ############## - data, rows_indices, cols_indptr = [], [], [] - - column_row_index = np.arange(len(self._data.items), dtype=np.int32) - - for item_idx in range(len(self._data.items)): - cols_indptr.append(len(data)) - column_data = self._similarity_matrix[:, item_idx] - - non_zero_data = column_data != 0 - - idx_sorted = np.argsort(column_data[non_zero_data]) # sort by column - top_k_idx = idx_sorted[-self._num_neighbors:] - - data.extend(column_data[non_zero_data][top_k_idx]) - rows_indices.extend(column_row_index[non_zero_data][top_k_idx]) - - cols_indptr.append(len(data)) - - W_sparse = sparse.csc_matrix((data, rows_indices, cols_indptr), - shape=(len(self._data.items), len(self._data.items)), dtype=np.float32).tocsr() - self._preds = self._URM.dot(W_sparse).toarray() - ############## - # self.compute_neighbors() - - del self._similarity_matrix - - # def compute_neighbors(self): - # self._neighbors = {} - # for x in range(self._similarity_matrix.shape[0]): - # arr = np.concatenate((self._similarity_matrix[0:x, x], [-np.inf], self._similarity_matrix[x, x+1:])) - # top_indices = np.argpartition(arr, -self._num_neighbors)[-self._num_neighbors:] - # arr = arr[top_indices] - # self._neighbors[self._private_items[x]] = {self._private_items[i]: arr[p] for p, i in enumerate(top_indices)} 
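For reference, a minimal standalone sketch of the column-wise top-k pruning that `initialize()` above performs before taking the sparse product with the URM; the names `topk_columns`, `sim` and `k` are illustrative and not part of the original module:

.. code:: python

    import numpy as np
    from scipy import sparse

    def topk_columns(sim: np.ndarray, k: int) -> sparse.csr_matrix:
        # Keep only the k largest non-zero entries of every column of a dense
        # item-item similarity matrix, mirroring the loop in initialize().
        n = sim.shape[1]
        data, rows, indptr = [], [], []
        row_index = np.arange(n, dtype=np.int32)
        for col in range(n):
            indptr.append(len(data))
            column = sim[:, col]
            nonzero = column != 0
            top = np.argsort(column[nonzero])[-k:]  # ascending sort, keep the k largest
            data.extend(column[nonzero][top])
            rows.extend(row_index[nonzero][top])
        indptr.append(len(data))
        return sparse.csc_matrix((data, rows, indptr), shape=(n, n), dtype=np.float32).tocsr()

    # Item-based predictions are then a single sparse product, e.g.:
    # preds = URM.dot(topk_columns(similarity_matrix, 40))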
- # - # def get_item_neighbors(self, item): - # return self._neighbors.get(item, {}) - - def process_similarity(self, similarity): - if similarity == "cosine": - # x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) - # self._similarity_matrix[x, y] = cosine_similarity(self._data.sp_i_train_ratings.T)[x, y] - self._similarity_matrix = cosine_similarity(self._URM.T) - elif similarity == "dot": - self._similarity_matrix = (self._URM.T @ self._URM).toarray() - elif similarity == "euclidean": - self._similarity_matrix = (1 / (1 + euclidean_distances(self._URM.T))) - elif similarity == "manhattan": - self._similarity_matrix = (1 / (1 + manhattan_distances(self._URM.T))) - elif similarity == "haversine": - self._similarity_matrix = (1 / (1 + haversine_distances(self._URM.T))) - elif similarity == "chi2": - self._similarity_matrix = (1 / (1 + chi2_kernel(self._URM.T))) - elif similarity in ['cityblock', 'l1', 'l2']: - self._similarity_matrix = (1 / (1 + pairwise_distances(self._URM.T, metric=similarity))) - elif similarity in ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']: - - self._similarity_matrix = (1 / (1 + pairwise_distances(self._URM.T.toarray(), metric=similarity))) - else: - raise ValueError("Compute Similarity: value for parameter 'similarity' not recognized." - f"\nAllowed values are: {self.supported_similarities}, {self.supported_dissimilarities}." - f"\nPassed value was {similarity}\nTry with implementation: aiolli") - - # def process_cosine(self): - # x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) - # self._similarity_matrix[x, y] = cosine_similarity(self._data.sp_i_train_ratings.T)[x, y] - # # g = np.vectorize(self.compute_cosine) - # # g(x,y) - # # for item_row in range(self._similarity_matrix.shape[0]): - # # for item_col in range(item_row + 1, self._similarity_matrix.shape[1]): - # # self._similarity_matrix[item_row, item_col] = self.compute_cosine( - # # self._item_ratings.get(self._private_items[item_row],{}), self._item_ratings.get(self._private_items[item_col], {})) - # - # def compute_cosine(self, i_index, j_index): - # i_dict = self._item_ratings.get(self._private_items[i_index],{}) - # j_dict = self._item_ratings.get(self._private_items[j_index],{}) - # union_keyset = set().union(*[i_dict, j_dict]) - # i: np.ndarray = np.array([[i_dict.get(x, 0) for x in union_keyset]]) - # j: np.ndarray = np.array([[j_dict.get(x, 0) for x in union_keyset]]) - # self._similarity_matrix[i_index, j_index] = cosine_similarity(i, j)[0, 0] - # - # def get_transactions(self): - # return self._transactions - - # def get_user_recs(self, u, mask, k): - # user_items = self._ratings[u].keys() - # user_mask = mask[self._data.public_users[u]] - # predictions = {i: self.score_item(self.get_item_neighbors(i), user_items) for i in self._data.items if - # user_mask[self._data.public_items[i]]} - # - # indices, values = zip(*predictions.items()) - # indices = np.array(indices) - # values = np.array(values) - # local_k = min(k, len(values)) - # partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - # real_values = values[partially_ordered_preds_indices] - # real_indices = indices[partially_ordered_preds_indices] - # local_top_k = real_values.argsort()[::-1] - # return [(real_indices[item], real_values[item]) for item in local_top_k] - - def get_user_recs(self, u, mask, k): - 
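        # The precomputed self._preds row already holds a score for every item.
        # The steps below (1) map the external user id to its internal row,
        # (2) set the items excluded by `mask` (e.g. already-seen training items)
        # to -inf, and (3) pick the top-k scores with argpartition followed by a
        # final argsort, returning (public item id, score) pairs.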
user_id = self._data.public_users.get(u) - user_recs = self._preds[user_id] - # user_items = self._ratings[u].keys() - user_recs_mask = mask[user_id] - user_recs[~user_recs_mask] = -np.inf - indices, values = zip(*[(self._data.private_items.get(u_list[0]), u_list[1]) - for u_list in enumerate(user_recs)]) - - # indices, values = zip(*predictions.items()) - indices = np.array(indices) - values = np.array(values) - local_k = min(k, len(values)) - partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - real_values = values[partially_ordered_preds_indices] - real_indices = indices[partially_ordered_preds_indices] - local_top_k = real_values.argsort()[::-1] - return [(real_indices[item], real_values[item]) for item in local_top_k] - - # @staticmethod - # def score_item(neighs, user_items): - # num = sum([v for k, v in neighs.items() if k in user_items]) - # den = sum(np.power(list(neighs.values()), 1)) - # return num/den if den != 0 else 0 - - def get_model_state(self): - saving_dict = {} - saving_dict['_neighbors'] = self._neighbors - saving_dict['_similarity'] = self._similarity - saving_dict['_num_neighbors'] = self._num_neighbors - saving_dict['_implicit'] = self._implicit - return saving_dict - - def set_model_state(self, saving_dict): - self._neighbors = saving_dict['_neighbors'] - self._similarity = saving_dict['_similarity'] - self._num_neighbors = saving_dict['_num_neighbors'] - self._implicit = saving_dict['_implicit'] diff --git a/external/models/user_knn/__init__.py b/external/models/user_knn/__init__.py deleted file mode 100644 index 31001a4f..00000000 --- a/external/models/user_knn/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .user_knn import UserKNN \ No newline at end of file diff --git a/external/models/user_knn/aiolli_ferrari.py b/external/models/user_knn/aiolli_ferrari.py deleted file mode 100644 index e3c0500c..00000000 --- a/external/models/user_knn/aiolli_ferrari.py +++ /dev/null @@ -1,507 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Created on 23/10/17 -@author: Maurizio Ferrari Dacrema - -""" - -import numpy as np -import time, sys -import scipy.sparse as sp - - -def check_matrix(X, format='csc', dtype=np.float32): - """ - This function takes a matrix as input and transforms it into the specified format. - The matrix in input can be either sparse or ndarray. 
- If the matrix in input has already the desired format, it is returned as-is - the dtype parameter is always applied and the default is np.float32 - :param X: - :param format: - :param dtype: - :return: - """ - - if format == 'csc' and not isinstance(X, sp.csc_matrix): - return X.tocsc().astype(dtype) - elif format == 'csr' and not isinstance(X, sp.csr_matrix): - return X.tocsr().astype(dtype) - elif format == 'coo' and not isinstance(X, sp.coo_matrix): - return X.tocoo().astype(dtype) - elif format == 'dok' and not isinstance(X, sp.dok_matrix): - return X.todok().astype(dtype) - elif format == 'bsr' and not isinstance(X, sp.bsr_matrix): - return X.tobsr().astype(dtype) - elif format == 'dia' and not isinstance(X, sp.dia_matrix): - return X.todia().astype(dtype) - elif format == 'lil' and not isinstance(X, sp.lil_matrix): - return X.tolil().astype(dtype) - elif isinstance(X, np.ndarray): - X = sp.csr_matrix(X, dtype=dtype) - X.eliminate_zeros() - return check_matrix(X, format=format, dtype=dtype) - else: - return X.astype(dtype) - - -class AiolliSimilarity(object): - def __init__(self, data, - maxk=40, - shrink=100, - similarity='cosine', - implicit=False, - normalize=True, - asymmetric_alpha=0.5, - tversky_alpha = 1.0, - tversky_beta = 1.0, - row_weights = None): - """ - ItemKNN recommender - Parameters - ---------- - user_num : int, the number of users - item_num : int, the number of items - maxk : int, the max similar items number - shrink : float, shrink similarity value - similarity : str, way to calculate similarity - normalize : bool, whether calculate similarity with normalized value - """ - self._data = data - self._implicit = implicit - - if self._implicit: - self._train_set = data.sp_i_train - else: - self._train_set = self._data.sp_i_train_ratings - - self._private_users = self._data.private_users - self._public_users = self._data.public_users - self._private_items = self._data.private_items - self._public_items = self._data.public_items - - self.user_num = self._data.num_users - self.item_num = self._data.num_items - - self.k = maxk - self.shrink = shrink - self.normalize = normalize - self.similarity = similarity - self.asymmetric_alpha = asymmetric_alpha - self.tversky_alpha = tversky_alpha - self.tversky_beta = tversky_beta - self.row_weights = row_weights - - self.RECOMMENDER_NAME = "UserKNNCFRecommender" - - # self.pred_mat = None - # self.yr = None - - def initialize(self): - # self.yr = defaultdict(list) - # for _, row in self._train_set.iterrows(): - # self.yr[int(row['user'])].append((int(row['item']), row['rating'])) - - # train = self._convert_df(self.user_num, self.item_num, self._train_set) - train = self._train_set.tocsc() - - cold_user_mask = np.ediff1d(train.tocsc().indptr) == 0 - - if cold_user_mask.any(): - print("{}: Detected {} ({:.2f} %) cold items.".format( - self.RECOMMENDER_NAME, cold_user_mask.sum(), cold_user_mask.sum() / len(cold_user_mask) * 100)) - - similarity = Compute_Similarity(train.T, - shrink=self.shrink, - topK=self.k, - normalize=self.normalize, - similarity=self.similarity, - asymmetric_alpha=self.asymmetric_alpha, - tversky_alpha=self.tversky_alpha, - tversky_beta=self.tversky_beta, - row_weights=self.row_weights) - - w_sparse = similarity.compute_similarity() - w_sparse = w_sparse.tocsc() - - # self.pred_mat = w_sparse.dot(train).tolil() - self.pred_mat = w_sparse.dot(train).toarray() - - def get_user_recs(self, u, mask, k): - user_id = self._data.public_users.get(u) - user_recs = self.pred_mat[user_id] - # user_items = 
self._ratings[u].keys() - user_recs_mask = mask[user_id] - user_recs[~user_recs_mask] = -np.inf - indices, values = zip(*[(self._data.private_items.get(u_list[0]), u_list[1]) - for u_list in enumerate(user_recs)]) - - # indices, values = zip(*predictions.items()) - indices = np.array(indices) - values = np.array(values) - local_k = min(k, len(values)) - partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - real_values = values[partially_ordered_preds_indices] - real_indices = indices[partially_ordered_preds_indices] - local_top_k = real_values.argsort()[::-1] - return [(real_indices[item], real_values[item]) for item in local_top_k] - - # def get_user_recs(self, user, k=100): - # user_items = self._data.train_dict[user].keys() - # predictions = {i: self.predict(user, i) for i in self._data.items if i not in user_items} - # indices, values = zip(*predictions.items()) - # indices = np.array(indices) - # values = np.array(values) - # local_k = min(k, len(values)) - # partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - # real_values = values[partially_ordered_preds_indices] - # real_indices = indices[partially_ordered_preds_indices] - # local_top_k = real_values.argsort()[::-1] - # return [(real_indices[item], real_values[item]) for item in local_top_k] - - # def predict(self, u, i): - # indexed_user = self._public_users[u] - # indexed_item = self._public_items[i] - # if indexed_user >= self.user_num or indexed_item >= self.item_num: - # raise ValueError('User and/or item is unkown.') - # - # return self.pred_mat[indexed_user, indexed_item] - - def _convert_df(self, user_num, item_num, df): - """Process Data to make ItemKNN available""" - ratings = list(df['rating']) - rows = list(df['user']) - cols = list(df['item']) - - mat = sp.csc_matrix((ratings, (rows, cols)), shape=(user_num, item_num)) - - return mat - - -class Compute_Similarity: - - def __init__(self, dataMatrix, topK=100, shrink=0, normalize=True, - asymmetric_alpha=0.5, tversky_alpha=1.0, tversky_beta=1.0, - similarity="cosine", row_weights=None): - """ - Computes the cosine similarity on the columns of dataMatrix - If it is computed on URM=|users|x|items|, pass the URM as is. - If it is computed on ICM=|items|x|features|, pass the ICM transposed. - :param dataMatrix: - :param topK: - :param shrink: - :param normalize: If True divide the dot product by the product of the norms - :param row_weights: Multiply the values in each row by a specified value. Array - :param asymmetric_alpha Coefficient alpha for the asymmetric cosine - :param similarity: "cosine" computes Cosine similarity - "adjusted" computes Adjusted Cosine, removing the average of the users - "asymmetric" computes Asymmetric Cosine - "pearson" computes Pearson Correlation, removing the average of the items - "jaccard" computes Jaccard similarity for binary interactions using Tanimoto - "dice" computes Dice similarity for binary interactions - "tversky" computes Tversky similarity for binary interactions - "tanimoto" computes Tanimoto coefficient for binary interactions - """ - """ - Asymmetric Cosine as described in: - Aiolli, F. (2013, October). Efficient top-n recommendation for very large scale binary rated datasets. In Proceedings of the 7th ACM conference on Recommender systems (pp. 273-280). ACM. 
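        With the asymmetric variant the similarity between two columns x_i and x_j is
        sim(i, j) = <x_i, x_j> / (||x_i||^(2*alpha) * ||x_j||^(2*(1-alpha)) + shrink),
        which reduces to the standard (shrunk) cosine for alpha = 0.5; see the
        denominators built in compute_similarity below.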
- - """ - - # super(Compute_Similarity_Python, self).__init__() - - self.shrink = shrink - self.normalize = normalize - - self.n_rows, self.n_columns = dataMatrix.shape - self.TopK = min(topK, self.n_columns) - - self.asymmetric_alpha = asymmetric_alpha - self.tversky_alpha = tversky_alpha - self.tversky_beta = tversky_beta - - self.dataMatrix = dataMatrix.copy() - - self.adjusted_cosine = False - self.asymmetric_cosine = False - self.pearson_correlation = False - self.tanimoto_coefficient = False - self.dice_coefficient = False - self.tversky_coefficient = False - - if similarity == "adjusted": - self.adjusted_cosine = True - elif similarity == "asymmetric": - self.asymmetric_cosine = True - elif similarity == "pearson": - self.pearson_correlation = True - elif similarity == "jaccard" or similarity == "tanimoto": - self.tanimoto_coefficient = True - # Tanimoto has a specific kind of normalization - self.normalize = False - - elif similarity == "dice": - self.dice_coefficient = True - self.normalize = False - - elif similarity == "tversky": - self.tversky_coefficient = True - self.normalize = False - - elif similarity == "cosine": - pass - else: - raise ValueError("Cosine_Similarity: value for parameter 'mode' not recognized." - " Allowed values are: 'cosine', 'pearson', 'adjusted', 'asymmetric', 'jaccard', 'tanimoto'," - "dice, tversky." - " Passed value was '{}'".format(similarity)) - - self.use_row_weights = False - - if row_weights is not None: - - if dataMatrix.shape[0] != len(row_weights): - raise ValueError("Cosine_Similarity: provided row_weights and dataMatrix have different number of rows." - "Col_weights has {} columns, dataMatrix has {}.".format(len(row_weights), - dataMatrix.shape[0])) - - self.use_row_weights = True - self.row_weights = row_weights.copy() - self.row_weights_diag = sp.diags(self.row_weights) - - self.dataMatrix_weighted = self.dataMatrix.T.dot(self.row_weights_diag).T - - def applyAdjustedCosine(self): - """ - Remove from every data point the average for the corresponding row - :return: - """ - - self.dataMatrix = check_matrix(self.dataMatrix, 'csr') - - interactionsPerRow = np.diff(self.dataMatrix.indptr) - - nonzeroRows = interactionsPerRow > 0 - sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel() - - rowAverage = np.zeros_like(sumPerRow) - rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows] - - # Split in blocks to avoid duplicating the whole data structure - start_row = 0 - end_row = 0 - - blockSize = 1000 - - while end_row < self.n_rows: - end_row = min(self.n_rows, end_row + blockSize) - - self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \ - np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row]) - - start_row += blockSize - - def applyPearsonCorrelation(self): - """ - Remove from every data point the average for the corresponding column - :return: - """ - - self.dataMatrix = check_matrix(self.dataMatrix, 'csc') - - interactionsPerCol = np.diff(self.dataMatrix.indptr) - - nonzeroCols = interactionsPerCol > 0 - sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel() - - colAverage = np.zeros_like(sumPerCol) - colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols] - - # Split in blocks to avoid duplicating the whole data structure - start_col = 0 - end_col = 0 - - blockSize = 1000 - - while end_col < self.n_columns: - end_col = min(self.n_columns, end_col + blockSize) - - 
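            # Centre this block of columns in place on the CSC data array: the slice
            # indptr[start_col]:indptr[end_col] covers the stored entries of the
            # columns in this block, and np.repeat expands each column's average
            # (computed over its stored entries) into one value per stored entry
            # before it is subtracted.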
self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \ - np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col]) - - start_col += blockSize - - def useOnlyBooleanInteractions(self): - - # Split in blocks to avoid duplicating the whole data structure - start_pos = 0 - end_pos = 0 - - blockSize = 1000 - - while end_pos < len(self.dataMatrix.data): - end_pos = min(len(self.dataMatrix.data), end_pos + blockSize) - - self.dataMatrix.data[start_pos:end_pos] = np.ones(end_pos - start_pos) - - start_pos += blockSize - - def compute_similarity(self, start_col=None, end_col=None, block_size=100): - """ - Compute the similarity for the given dataset - :param self: - :param start_col: column to begin with - :param end_col: column to stop before, end_col is excluded - :return: - """ - - values = [] - rows = [] - cols = [] - - start_time = time.time() - start_time_print_batch = start_time - processedItems = 0 - - if self.adjusted_cosine: - self.applyAdjustedCosine() - - elif self.pearson_correlation: - self.applyPearsonCorrelation() - - elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient: - self.useOnlyBooleanInteractions() - - # We explore the matrix column-wise - self.dataMatrix = check_matrix(self.dataMatrix, 'csc') - - # Compute sum of squared values to be used in normalization - sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel() - - # Tanimoto does not require the square root to be applied - if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient): - sumOfSquared = np.sqrt(sumOfSquared) - - if self.asymmetric_cosine: - sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha)) - sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha) - - self.dataMatrix = check_matrix(self.dataMatrix, 'csc') - - start_col_local = 0 - end_col_local = self.n_columns - - if start_col is not None and start_col > 0 and start_col < self.n_columns: - start_col_local = start_col - - if end_col is not None and end_col > start_col_local and end_col < self.n_columns: - end_col_local = end_col - - start_col_block = start_col_local - - this_block_size = 0 - - # Compute all similarities for each item using vectorization - while start_col_block < end_col_local: - - end_col_block = min(start_col_block + block_size, end_col_local) - this_block_size = end_col_block - start_col_block - - # All data points for a given item - item_data = self.dataMatrix[:, start_col_block:end_col_block] - item_data = item_data.toarray().squeeze() - - # If only 1 feature avoid last dimension to disappear - if item_data.ndim == 1: - item_data = np.atleast_2d(item_data) - - if self.use_row_weights: - this_block_weights = self.dataMatrix_weighted.T.dot(item_data) - - else: - # Compute item similarities - this_block_weights = self.dataMatrix.T.dot(item_data) - - for col_index_in_block in range(this_block_size): - - if this_block_size == 1: - this_column_weights = this_block_weights - else: - this_column_weights = this_block_weights[:, col_index_in_block] - - columnIndex = col_index_in_block + start_col_block - this_column_weights[columnIndex] = 0.0 - - # Apply normalization and shrinkage, ensure denominator != 0 - if self.normalize: - - if self.asymmetric_cosine: - denominator = sumOfSquared_to_alpha[ - columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6 - else: - denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6 - - 
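                        # At this point this_column_weights holds the raw dot products
                        # <x_columnIndex, x_j> for every column j; dividing by the
                        # denominator just built (||x_columnIndex|| * ||x_j|| + shrink
                        # in the plain cosine case, the asymmetric variant otherwise,
                        # with 1e-6 guarding against a zero denominator) turns them
                        # into similarity scores.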
this_column_weights = np.multiply(this_column_weights, 1 / denominator) - - # Apply the specific denominator for Tanimoto - elif self.tanimoto_coefficient: - denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6 - this_column_weights = np.multiply(this_column_weights, 1 / denominator) - - elif self.dice_coefficient: - denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6 - this_column_weights = np.multiply(this_column_weights, 1 / denominator) - - elif self.tversky_coefficient: - denominator = this_column_weights + \ - (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \ - (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6 - this_column_weights = np.multiply(this_column_weights, 1 / denominator) - - # If no normalization or tanimoto is selected, apply only shrink - elif self.shrink != 0: - this_column_weights = this_column_weights / self.shrink - - # this_column_weights = this_column_weights.toarray().ravel() - - # Sort indices and select TopK - # Sorting is done in three steps. Faster then plain np.argsort for higher number of items - # - Partition the data to extract the set of relevant items - # - Sort only the relevant items - # - Get the original item index - relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK] - relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition]) - top_k_idx = relevant_items_partition[relevant_items_partition_sorting] - - # Incrementally build sparse matrix, do not add zeros - notZerosMask = this_column_weights[top_k_idx] != 0.0 - numNotZeros = np.sum(notZerosMask) - - values.extend(this_column_weights[top_k_idx][notZerosMask]) - rows.extend(top_k_idx[notZerosMask]) - cols.extend(np.ones(numNotZeros) * columnIndex) - - # Add previous block size - processedItems += this_block_size - - if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local: - columnPerSec = processedItems / (time.time() - start_time + 1e-9) - - print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format( - processedItems, processedItems / (end_col_local - start_col_local) * 100, columnPerSec, - (time.time() - start_time) / 60)) - - sys.stdout.flush() - sys.stderr.flush() - - start_time_print_batch = time.time() - - start_col_block += block_size - - # End while on columns - - W_sparse = sp.csr_matrix((values, (rows, cols)), - shape=(self.n_columns, self.n_columns), - dtype=np.float32) - - return W_sparse diff --git a/external/models/user_knn/user_knn.py b/external/models/user_knn/user_knn.py deleted file mode 100644 index 6b24827f..00000000 --- a/external/models/user_knn/user_knn.py +++ /dev/null @@ -1,128 +0,0 @@ -""" -Module description: - -""" - -__version__ = '0.1' -__author__ = 'Vito Walter Anelli, Claudio Pomo' -__email__ = 'vitowalter.anelli@poliba.it, claudio.pomo@poliba.it' - -import pickle -import time - -from elliot.recommender.recommender_utils_mixin import RecMixin -from elliot.utils.write import store_recommendation - -from elliot.recommender.base_recommender_model import BaseRecommenderModel -from .user_knn_similarity import Similarity -from .aiolli_ferrari import AiolliSimilarity -from elliot.recommender.base_recommender_model import init_charger - - -class UserKNN(RecMixin, BaseRecommenderModel): - r""" - GroupLens: An Open Architecture for Collaborative Filtering of Netnews - - For further details, please refer to the `paper `_ - - 
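    In the classical implementation the predicted score of user u for item i is the
    similarity-weighted sum of the ratings that u's top-k most similar users gave to i
    (a sparse product of the user-user similarity matrix with the URM, mirroring the
    item-based variant above).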
Args: - neighbors: Number of item neighbors - similarity: Similarity function - implementation: Implementation type ('aiolli', 'classical') - - To include the recommendation model, add it to the config file adopting the following pattern: - - .. code:: yaml - - models: - UserKNN: - meta: - save_recs: True - neighbors: 40 - similarity: cosine - implementation: aiolli - """ - @init_charger - def __init__(self, data, config, params, *args, **kwargs): - - self._params_list = [ - ("_num_neighbors", "neighbors", "nn", 40, int, None), - ("_similarity", "similarity", "sim", "cosine", None, None), - ("_implementation", "implementation", "imp", "standard", None, None), - ("_implicit", "implicit", "bin", False, None, None), - ("_shrink", "shrink", "shrink", 0, None, None), - ("_normalize", "normalize", "norm", True, None, None), - ("_asymmetric_alpha", "asymmetric_alpha", "asymalpha", False, None, lambda x: x if x else ""), - ("_tversky_alpha", "tversky_alpha", "tvalpha", False, None, lambda x: x if x else ""), - ("_tversky_beta", "tversky_beta", "tvbeta", False, None, lambda x: x if x else ""), - ("_row_weights", "normalize", "rweights", None, None, lambda x: x if x else "") - ] - self.autoset_params() - - self._ratings = self._data.train_dict - if self._implementation == "aiolli": - self._model = AiolliSimilarity(data=self._data, - maxk=self._num_neighbors, - shrink=self._shrink, - similarity=self._similarity, - implicit=self._implicit, - normalize=self._normalize, - asymmetric_alpha=self._asymmetric_alpha, - tversky_alpha=self._tversky_alpha, - tversky_beta=self._tversky_beta, - row_weights=self._row_weights) - else: - if (not self._normalize) or (self._asymmetric_alpha) or (self._tversky_alpha) or (self._tversky_beta) or (self._row_weights) or (self._shrink): - print("Options normalize, asymmetric_alpha, tversky_alpha, tversky_beta, row_weights are ignored with standard implementation. Try with implementation: aiolli") - self._model = Similarity(data=self._data, num_neighbors=self._num_neighbors, similarity=self._similarity, implicit=self._implicit) - - def get_single_recommendation(self, mask, k, *args): - return {u: self._model.get_user_recs(u, mask, k) for u in self._ratings.keys()} - - def get_recommendations(self, k: int = 10): - predictions_top_k_val = {} - predictions_top_k_test = {} - - recs_val, recs_test = self.process_protocol(k) - - predictions_top_k_val.update(recs_val) - predictions_top_k_test.update(recs_test) - - return predictions_top_k_val, predictions_top_k_test - - @property - def name(self): - return f"UserKNN_{self.get_params_shortcut()}" - - def train(self): - if self._restore: - return self.restore_weights() - - start = time.time() - self._model.initialize() - end = time.time() - print(f"The similarity computation has taken: {end - start}") - - print(f"Transactions: {self._data.transactions}") - - self.evaluate() - - def restore_weights(self): - try: - with open(self._saving_filepath, "rb") as f: - self._model.set_model_state(pickle.load(f)) - print(f"Model correctly Restored") - - recs = self.get_recommendations(self.evaluator.get_needed_recommendations()) - result_dict = self.evaluator.eval(recs) - self._results.append(result_dict) - - print("******************************************") - if self._save_recs: - store_recommendation(recs, self._config.path_output_rec_result + f"{self.name}.tsv") - return True - - except Exception as ex: - print(f"Error in model restoring operation! 
{ex}") - - return False diff --git a/external/models/user_knn/user_knn_similarity.py b/external/models/user_knn/user_knn_similarity.py deleted file mode 100644 index 2c8d5b4c..00000000 --- a/external/models/user_knn/user_knn_similarity.py +++ /dev/null @@ -1,193 +0,0 @@ - -import numpy as np -from scipy import sparse -from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, haversine_distances, chi2_kernel, manhattan_distances -from sklearn.metrics import pairwise_distances - - - - -class Similarity(object): - """ - Simple kNN class - """ - - def __init__(self, data, num_neighbors, similarity, implicit): - self._data = data - self._ratings = data.train_dict - self._num_neighbors = num_neighbors - self._similarity = similarity - self._implicit = implicit - - if self._implicit: - self._URM = self._data.sp_i_train - else: - self._URM = self._data.sp_i_train_ratings - - self._users = self._data.users - self._items = self._data.items - self._private_users = self._data.private_users - self._public_users = self._data.public_users - self._private_items = self._data.private_items - self._public_items = self._data.public_items - - def initialize(self): - """ - This function initialize the data model - """ - - self.supported_similarities = ["cosine", "dot", ] - self.supported_dissimilarities = ["euclidean", "manhattan", "haversine", "chi2", 'cityblock', 'l1', 'l2', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] - print(f"\nSupported Similarities: {self.supported_similarities}") - print(f"Supported Distances/Dissimilarities: {self.supported_dissimilarities}\n") - - # self._user_ratings = self._ratings - # - # self._item_ratings = {} - # for u, user_items in self._ratings.items(): - # for i, v in user_items.items(): - # self._item_ratings.setdefault(i, {}).update({u: v}) - # - # self._transactions = self._data.transactions - - self._similarity_matrix = np.empty((len(self._users), len(self._users))) - - self.process_similarity(self._similarity) - - ############## - data, rows_indices, cols_indptr = [], [], [] - - column_row_index = np.arange(len(self._users), dtype=np.int32) - - for user_idx in range(len(self._users)): - cols_indptr.append(len(data)) - column_data = self._similarity_matrix[:, user_idx] - - non_zero_data = column_data != 0 - - idx_sorted = np.argsort(column_data[non_zero_data]) # sort by column - top_k_idx = idx_sorted[-self._num_neighbors:] - - data.extend(column_data[non_zero_data][top_k_idx]) - rows_indices.extend(column_row_index[non_zero_data][top_k_idx]) - - cols_indptr.append(len(data)) - - W_sparse = sparse.csc_matrix((data, rows_indices, cols_indptr), - shape=(len(self._users), len(self._users)), dtype=np.float32).tocsr() - self._preds = W_sparse.dot(self._URM).toarray() - ############## - # self.compute_neighbors() - - del self._similarity_matrix - - # def compute_neighbors(self): - # self._neighbors = {} - # for x in range(self._similarity_matrix.shape[0]): - # arr = np.concatenate((self._similarity_matrix[0:x, x], [-np.inf], self._similarity_matrix[x, x+1:])) - # top_indices = np.argpartition(arr, -self._num_neighbors)[-self._num_neighbors:] - # arr = arr[top_indices] - # self._neighbors[self._private_users[x]] = {self._private_users[i]: arr[p] for p, i in enumerate(top_indices)} - - # def get_user_neighbors(self, item): - # return self._neighbors.get(item, {}) - - def 
process_similarity(self, similarity): - if similarity == "cosine": - self._similarity_matrix = cosine_similarity(self._URM) - elif similarity == "dot": - self._similarity_matrix = (self._URM @ self._URM.T).toarray() - elif similarity == "euclidean": - self._similarity_matrix = (1 / (1 + euclidean_distances(self._URM))) - elif similarity == "manhattan": - self._similarity_matrix = (1 / (1 + manhattan_distances(self._URM))) - elif similarity == "haversine": - self._similarity_matrix = (1 / (1 + haversine_distances(self._URM))) - elif similarity == "chi2": - self._similarity_matrix = (1 / (1 + chi2_kernel(self._URM))) - elif similarity in ['cityblock', 'l1', 'l2']: - self._similarity_matrix = (1 / (1 + pairwise_distances(self._URM, metric=similarity))) - elif similarity in ['braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']: - self._similarity_matrix = (1 / (1 + pairwise_distances(self._URM.toarray(), metric=similarity))) - else: - raise ValueError("Compute Similarity: value for parameter 'similarity' not recognized." - f"\nAllowed values are: {self.supported_similarities}, {self.supported_dissimilarities}." - f"\nPassed value was {similarity}\nTry with implementation: aiolli") - - # def process_cosine(self): - # x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1) - # self._similarity_matrix[x, y] = cosine_similarity(self._data.sp_i_train_ratings)[x, y] - # # g = np.vectorize(self.compute_cosine) - # # g(x, y) - # # for item_row in range(self._similarity_matrix.shape[0]): - # # for item_col in range(item_row + 1, self._similarity_matrix.shape[1]): - # # self._similarity_matrix[item_row, item_col] = self.compute_cosine( - # # self._item_ratings.get(self._private_items[item_row],{}), self._item_ratings.get(self._private_items[item_col], {})) - - # def compute_cosine(self, i_index, j_index): - # u_dict = self._user_ratings.get(self._private_users[i_index],{}) - # v_dict = self._user_ratings.get(self._private_users[j_index],{}) - # union_keyset = set().union(*[u_dict, v_dict]) - # u: np.ndarray = np.array([[1 if x in u_dict.keys() else 0 for x in union_keyset]]) - # v: np.ndarray = np.array([[1 if x in v_dict.keys() else 0 for x in union_keyset]]) - # self._similarity_matrix[i_index, j_index] = cosine_similarity(u, v)[0, 0] - - # def get_transactions(self): - # return self._transactions - - def get_user_recs(self, u, mask, k): - user_id = self._data.public_users.get(u) - user_recs = self._preds[user_id] - # user_items = self._ratings[u].keys() - user_recs_mask = mask[user_id] - user_recs[~user_recs_mask] = -np.inf - indices, values = zip(*[(self._data.private_items.get(u_list[0]), u_list[1]) - for u_list in enumerate(user_recs)]) - - # indices, values = zip(*predictions.items()) - indices = np.array(indices) - values = np.array(values) - local_k = min(k, len(values)) - partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:] - real_values = values[partially_ordered_preds_indices] - real_indices = indices[partially_ordered_preds_indices] - local_top_k = real_values.argsort()[::-1] - return [(real_indices[item], real_values[item]) for item in local_top_k] - - # def get_user_recs(self, u, mask, k): - # user_items = self._ratings[u].keys() - # user_mask = mask[self._data.public_users[u]] - # predictions = {i: self.score_item(self.get_user_neighbors(u), user_items) for i in self._data.items if - # 
user_mask[self._data.public_items[i]]}
-    #
-    #     # user_items = self._ratings[u].keys()
-    #     # predictions = {i: self.score_item(self.get_user_neighbors(u), self._item_ratings[i].keys())
-    #     #                for i in self._data.items if i not in user_items}
-    #
-    #     indices, values = zip(*predictions.items())
-    #     indices = np.array(indices)
-    #     values = np.array(values)
-    #     local_k = min(k, len(values))
-    #     partially_ordered_preds_indices = np.argpartition(values, -local_k)[-local_k:]
-    #     real_values = values[partially_ordered_preds_indices]
-    #     real_indices = indices[partially_ordered_preds_indices]
-    #     local_top_k = real_values.argsort()[::-1]
-    #     return [(real_indices[item], real_values[item]) for item in local_top_k]
-
-    # @staticmethod
-    # def score_item(neighs, user_neighs_items):
-    #     num = sum([v for k, v in neighs.items() if k in user_neighs_items])
-    #     den = sum(np.power(list(neighs.values()), 1))
-    #     return num/den if den != 0 else 0
-
-    def get_model_state(self):
-        saving_dict = {}
-        saving_dict['_neighbors'] = self._neighbors
-        saving_dict['_similarity'] = self._similarity
-        saving_dict['_num_neighbors'] = self._num_neighbors
-        return saving_dict
-
-    def set_model_state(self, saving_dict):
-        self._neighbors = saving_dict['_neighbors']
-        self._similarity = saving_dict['_similarity']
-        self._num_neighbors = saving_dict['_num_neighbors']
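For reference, a minimal sketch of the save/restore round trip that these `get_model_state` / `set_model_state` hooks are meant to support; `model` and `path` are illustrative names (in the recommenders above the path is `self._saving_filepath`). Note that both kNN similarity classes still appear to pickle a `_neighbors` attribute that the refactored `initialize()` no longer creates, so saving would fail unless that field is dropped or repopulated.

.. code:: python

    import pickle

    def save_model_state(model, path):
        # Persist whatever the similarity class exposes through get_model_state()
        # (the commented-out saving branch in ItemKNN.train() does exactly this).
        with open(path, "wb") as f:
            pickle.dump(model.get_model_state(), f)

    def load_model_state(model, path):
        # Mirror of restore_weights(): read the pickled dict back into the model.
        with open(path, "rb") as f:
            model.set_model_state(pickle.load(f))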