Replace pandas.Series.map() in SAR #1023

Merged
62 changes: 46 additions & 16 deletions reco_utils/recommender/sar/sar_singlenode.py
@@ -25,11 +25,11 @@

class SARSingleNode:
    """Simple Algorithm for Recommendations (SAR) implementation

    SAR is a fast, scalable, adaptive algorithm for personalized recommendations based on user transaction
    history and item descriptions. The core idea behind SAR is to recommend items like those that a user has
    already demonstrated an affinity to. It does this by 1) estimating the affinity of users for items,
    2) estimating similarity across items, and then 3) combining the estimates to generate a set of
    recommendations for a given user.
    """

    def __init__(
@@ -113,7 +113,7 @@ def compute_affinity_matrix(self, df, rating_col):
        indices in a sparse matrix, and the events as the data. Here, we're treating
        the ratings as the event weights. We convert between different sparse-matrix
        formats to de-duplicate user-item pairs; otherwise they would get added up.

        Args:
            df (pd.DataFrame): Indexed df of users and items
            rating_col (str): Name of column to use for ratings
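As a hedged illustration of the de-duplication point above (a sketch, not the method's code): building the matrix through a DOK intermediate makes repeated (user, item) pairs overwrite one another, whereas converting duplicate COO entries to CSR sums them. Data and names are invented.

import numpy as np
from scipy import sparse

users = np.array([0, 0, 1])          # the (0, 1) pair appears twice
items = np.array([1, 1, 2])
ratings = np.array([3.0, 5.0, 4.0])

# COO -> CSR sums duplicates: entry (0, 1) becomes 8.0
summed = sparse.coo_matrix((ratings, (users, items)), shape=(2, 3)).tocsr()

# DOK assignment overwrites, keeping only the last value seen: 5.0
dedup = sparse.dok_matrix((2, 3), dtype=np.float64)
for u, i, r in zip(users, items, ratings):
    dedup[u, i] = r
dedup = dedup.tocsr()                # back to CSR for fast arithmetic

print(summed.toarray()[0, 1], dedup.toarray()[0, 1])  # 8.0 vs 5.0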
@@ -155,8 +155,8 @@ def compute_time_decay(self, df, decay_column):
    def compute_coocurrence_matrix(self, df):
        """Co-occurrence matrix.

        The co-occurrence matrix is defined as :math:`C = U^T * U`

        where U is the user_affinity matrix with 1's as values (instead of ratings).

        Args:
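A small, self-contained sketch of the formula above on toy data (assumed names, not the method body): after binarizing the affinity matrix, C[i, j] counts the users who interacted with both item i and item j.

import numpy as np
from scipy import sparse

user_affinity = sparse.csr_matrix(np.array([
    [5.0, 3.0, 0.0],
    [0.0, 4.0, 2.0],
    [1.0, 0.0, 2.0],
]))
U = (user_affinity > 0).astype(np.float64)  # 1's instead of ratings
C = U.T.dot(U)

# diagonal: per-item interaction counts; off-diagonal: pair co-occurrences
print(C.toarray())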
@@ -231,8 +231,12 @@ def fit(self, df):

logger.info("Creating index columns")
# add mapping of user and item ids to indices
temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].map(self.item2index)
temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].map(self.user2index)
temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].apply(
lambda item: self.item2index.get(item, np.NaN)
)
temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].apply(
lambda user: self.user2index.get(user, np.NaN)
)

        if self.normalize:
            logger.info("Calculating normalization factors")
@@ -283,13 +287,18 @@ def score(self, test, remove_seen=False, normalize=False):
            test (pd.DataFrame): users to test
            remove_seen (bool): flag to remove items seen in training from recommendation
            normalize (bool): flag to normalize scores to the same scale as the original ratings

        Returns:
            np.ndarray: Value of interest of all items for the users.
        """

        # get user / item indices from test set
-        user_ids = test[self.col_user].drop_duplicates().map(self.user2index).values
+        user_ids = list(
+            map(
+                lambda user: self.user2index.get(user, np.NaN),
+                test[self.col_user].unique()
+            )
+        )
        if any(np.isnan(user_ids)):
            raise ValueError("SAR cannot score users that are not in the training set")
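A small, self-contained illustration of the guard above (toy names, not the class): an unseen user maps to NaN, which np.isnan flags so scoring can fail fast instead of returning garbage.

import numpy as np

user2index = {"alice": 0, "bob": 1}
test_users = ["alice", "carol"]    # "carol" never appeared in training

user_ids = list(map(lambda user: user2index.get(user, np.nan), test_users))
try:
    if any(np.isnan(user_ids)):
        raise ValueError("SAR cannot score users that are not in the training set")
except ValueError as err:
    print(err)                     # surfaces the cold-user problem immediately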

@@ -367,7 +376,14 @@ def get_item_based_topk(self, items, top_k=10, sort_top_k=True):
"""

# convert item ids to indices
item_ids = items[self.col_item].map(self.item2index)
item_ids = np.asarray(
list(
map(
lambda item: self.item2index.get(item, np.NaN),
items[self.col_item].values
)
)
)

        # if no ratings were provided assume they are all 1
        if self.col_rating in items.columns:
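Putting the conversion above together with the "assume rating 1" fallback mentioned in the comment, a hedged sketch on toy data (the column names "itemID" and "rating" are stand-ins for self.col_item and self.col_rating):

import numpy as np
import pandas as pd

item2index = {"a": 0, "b": 1}
items = pd.DataFrame({"itemID": ["b", "a"]})   # no rating column supplied

item_ids = np.asarray(
    list(map(lambda item: item2index.get(item, np.nan), items["itemID"].values))
)
ratings = (
    items["rating"].values if "rating" in items.columns
    else np.ones_like(item_ids)                # default every rating to 1
)
print(item_ids, ratings)                       # [1 0] [1 1]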
@@ -450,7 +466,7 @@ def recommend_k_items(

    def predict(self, test):
        """Output SAR scores for only the user-item pairs which are in the test set

        Args:
            test (pd.DataFrame): DataFrame that contains users and items to test

        ...
        """

        test_scores = self.score(test)
-        user_ids = test[self.col_user].map(self.user2index).values
+        user_ids = np.asarray(
+            list(
+                map(
+                    lambda user: self.user2index.get(user, np.NaN),
+                    test[self.col_user].values
+                )
+            )
+        )

        # create mapping of new items to zeros
-        item_ids = test[self.col_item].map(self.item2index).values
+        item_ids = np.asarray(
+            list(
+                map(
+                    lambda item: self.item2index.get(item, np.NaN),
+                    test[self.col_item].values
+                )
+            )
+        )
        nans = np.isnan(item_ids)
        if any(nans):
            logger.warning(
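Finally, a hedged sketch of the item-side handling in predict() above (toy names; the actual warning text is truncated in the diff, so the message here is a paraphrase): unseen items map to NaN, np.isnan flags them, and per the diff's comment they are mapped to zeros.

import logging
import numpy as np

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("sar_sketch")

item2index = {"a": 0, "b": 1}
test_items = ["a", "x"]            # "x" was never seen during training

item_ids = np.asarray(
    list(map(lambda item: item2index.get(item, np.nan), test_items))
)
nans = np.isnan(item_ids)
if any(nans):
    # message paraphrased; the original text is truncated in the diff
    logger.warning("test contains items unseen in training; mapping them to 0")
    item_ids[nans] = 0             # per the diff's comment: new items map to zeros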