add model deepfm & dat #74

Open · wants to merge 1 commit into base: master
94 changes: 94 additions & 0 deletions deepmatch/models/dat.py
@@ -0,0 +1,94 @@
"""
Author:
Yang Bo, [email protected]
Reference:
Yantao Yu, Weipeng Wang, Zhoutian Feng, Daiyue Xue, et al. A Dual Augmented Two-tower Model for Online Large-scale Recommendation. DLP-KDD 2021.
"""

from deepctr.feature_column import build_input_features, create_embedding_matrix
from deepctr.layers import PredictionLayer, DNN, combined_dnn_input
from deepctr.layers.utils import Hash
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Embedding, Flatten
from tensorflow.python.keras.regularizers import l2
from tensorflow.python.keras import backend as K

from ..inputs import input_from_feature_columns
from ..layers.core import Similarity

def generate_augmented_embedding(feat, l2_reg_embedding=1e-6):
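# Build a dedicated (non-shared) embedding table for the feature's augmented
# vector; its input is fed separately under the name 'aug_inp_<feature name>'.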
inp = Input(shape=(1,), name='aug_inp_' + feat.name, dtype=feat.dtype)
if feat.use_hash:
lookup_idx = Hash(feat.vocabulary_size, mask_zero=False, vocabulary_path=feat.vocabulary_path)(inp)
else:
lookup_idx = inp
emb = Embedding(feat.vocabulary_size, feat.embedding_dim,
embeddings_initializer=feat.embeddings_initializer,
embeddings_regularizer=l2(l2_reg_embedding),
name='aug_emb_' + feat.embedding_name)
emb.trainable = feat.trainable
return inp, Flatten()(emb(lookup_idx))

def DAT(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, 32),
item_dnn_hidden_units=(64, 32),
dnn_activation='tanh', dnn_use_bn=False,
l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024, metric='cos'):
"""Instantiates the Deep Structured Semantic Model architecture.

:param user_feature_columns: An iterable containing user's features used by the model.
:param item_feature_columns: An iterable containing item's features used by the model.
:param user_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of user tower
:param item_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of item tower
:param dnn_activation: Activation function to use in deep net
:param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net
:param l2_reg_dnn: float. L2 regularizer strength applied to DNN
:param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
:param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
:param seed: integer ,to use as random seed.
:param metric: str, ``"cos"`` for cosine or ``"ip"`` for inner product
:return: A Keras model instance.

"""

embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding,
seed=seed,
seq_mask_zero=True)

user_features = build_input_features(user_feature_columns)
user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features,
user_feature_columns,
l2_reg_embedding, seed=seed,
embedding_matrix_dict=embedding_matrix_dict)
i_u, a_u = generate_augmented_embedding(user_feature_columns[0])
user_inputs_list = list(user_features.values()) + [i_u]
user_dnn_input = combined_dnn_input(user_sparse_embedding_list, [a_u])

item_features = build_input_features(item_feature_columns)
item_sparse_embedding_list, item_dense_value_list = input_from_feature_columns(item_features,
item_feature_columns,
l2_reg_embedding, seed=seed,
embedding_matrix_dict=embedding_matrix_dict)
i_v, a_v = generate_augmented_embedding(item_feature_columns[0])
item_inputs_list = list(item_features.values()) + [i_v]
item_dnn_input = combined_dnn_input(item_sparse_embedding_list, [a_v])

user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
dnn_use_bn, seed=seed)(user_dnn_input)

item_dnn_out = DNN(item_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
dnn_use_bn, seed=seed)(item_dnn_input)

score = Similarity(type=metric)([user_dnn_out, item_dnn_out])

output = PredictionLayer("binary", False)(score)

model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output)

model.__setattr__("user_input", user_inputs_list)
model.__setattr__("item_input", item_inputs_list)
model.__setattr__("user_embedding", user_dnn_out)
model.__setattr__("item_embedding", item_dnn_out)

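# Return gradient-stopped copies of the augmented vectors for the mimic loss;
# the augmented embeddings themselves still train through the tower inputs.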
a_u_l = K.stop_gradient(a_u)
a_v_l = K.stop_gradient(a_v)
return model, output, user_dnn_out, item_dnn_out, a_u_l, a_v_l
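A minimal usage sketch for the new DAT entry point (not part of the diff; the vocabulary sizes are illustrative, and the import path assumes the module location added by this PR). Note the extra `aug_inp_<feature>` inputs created by `generate_augmented_embedding`, which must be fed with the same ids as the base feature:

from deepctr.feature_column import SparseFeat
from deepmatch.models.dat import DAT

user_columns = [SparseFeat('user_id', 100, 8)]
item_columns = [SparseFeat('movie_id', 200, 8)]

# DAT returns the model plus the tensors consumed by the dual augmented loss.
model, y_pred, p_u, p_v, a_u, a_v = DAT(user_columns, item_columns)

# Feed-dict keys at fit/predict time:
# 'user_id', 'movie_id', 'aug_inp_user_id', 'aug_inp_movie_id'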
84 changes: 84 additions & 0 deletions deepmatch/models/deepfm.py
@@ -0,0 +1,84 @@
"""
Author:
Yang Bo, [email protected]
Reference:
Guo H, Tang R, Ye Y, et al. DeepFM: A factorization-machine based neural network for CTR prediction[J]. arXiv preprint arXiv:1703.04247, 2017. (https://arxiv.org/abs/1703.04247)
"""

from deepctr.feature_column import build_input_features, create_embedding_matrix
from deepctr.layers import PredictionLayer, DNN, combined_dnn_input
from tensorflow.python.keras.models import Model

from ..inputs import input_from_feature_columns
from ..layers.core import Similarity
import tensorflow as tf


def DeepFM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, 32),
item_dnn_hidden_units=(64, 32),
dnn_activation='tanh', dnn_use_bn=False,
l2_reg_dnn=0, l2_reg_embedding=1e-6, dnn_dropout=0, seed=1024, metric='cos'):
"""Instantiates the Deep Structured Semantic Model architecture.

:param user_feature_columns: An iterable containing user's features used by the model.
:param item_feature_columns: An iterable containing item's features used by the model.
:param user_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of user tower
:param item_dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of item tower
:param dnn_activation: Activation function to use in deep net
:param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in deep net
:param l2_reg_dnn: float. L2 regularizer strength applied to DNN
:param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
:param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
:param seed: integer ,to use as random seed.
:param metric: str, ``"cos"`` for cosine or ``"ip"`` for inner product
:return: A Keras model instance.

"""

embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding,
seed=seed,
seq_mask_zero=True)

user_features = build_input_features(user_feature_columns)
user_inputs_list = list(user_features.values())
user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features,
user_feature_columns,
l2_reg_embedding, seed=seed,
embedding_matrix_dict=embedding_matrix_dict)
user_dnn_input = combined_dnn_input(user_sparse_embedding_list, user_dense_value_list)

item_features = build_input_features(item_feature_columns)
item_inputs_list = list(item_features.values())
item_sparse_embedding_list, item_dense_value_list = input_from_feature_columns(item_features,
item_feature_columns,
l2_reg_embedding, seed=seed,
embedding_matrix_dict=embedding_matrix_dict)
item_dnn_input = combined_dnn_input(item_sparse_embedding_list, item_dense_value_list)

user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
dnn_use_bn, seed=seed)(user_dnn_input)
user_dnn_norm = tf.keras.layers.LayerNormalization(axis=1)(user_dnn_out)
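# Sum-pool the field embeddings: with an inner-product similarity, the dot of
# the two pooled vectors equals the sum of all cross-tower pairwise
# interactions, i.e. the FM second-order term across the two towers.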
user_fm_out = tf.reduce_sum(tf.concat(user_sparse_embedding_list, axis=1), axis=1)
user_fm_norm = tf.keras.layers.LayerNormalization(axis=1)(user_fm_out)

item_dnn_out = DNN(item_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
dnn_use_bn, seed=seed)(item_dnn_input)
item_dnn_norm = tf.keras.layers.LayerNormalization(axis=1)(item_dnn_out)
item_fm_out = tf.reduce_sum(tf.concat(item_sparse_embedding_list, axis=1), axis=1)
item_fm_norm = tf.keras.layers.LayerNormalization(axis=1)(item_fm_out)

user_emb_out = tf.concat([user_dnn_norm, user_fm_norm], axis=1)
item_emb_out = tf.concat([item_dnn_norm, item_fm_norm], axis=1)

score = Similarity(type=metric)([user_emb_out, item_emb_out])

output = PredictionLayer("binary", False)(score)

model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output)

model.__setattr__("user_input", user_inputs_list)
model.__setattr__("item_input", item_inputs_list)
model.__setattr__("user_embedding", user_emb_out)
model.__setattr__("item_embedding", item_emb_out)

return model
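Why the sum-pooled `*_fm_out` vectors above still carry FM-style interactions: with an inner-product similarity, the dot product of the two pooled vectors expands, by bilinearity, into the sum of all cross-tower pairwise field interactions. A small self-contained NumPy check of that identity (illustrative only, not part of the diff):

import numpy as np

rng = np.random.default_rng(0)
U = rng.normal(size=(3, 8))  # 3 user fields, embedding dim 8
V = rng.normal(size=(2, 8))  # 2 item fields

pooled = U.sum(axis=0) @ V.sum(axis=0)  # <sum_i u_i, sum_j v_j>
pairwise = sum(U[i] @ V[j] for i in range(3) for j in range(2))
assert np.allclose(pooled, pairwise)  # equal by bilinearity of the dot product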
123 changes: 123 additions & 0 deletions examples/run_dat.py
@@ -0,0 +1,123 @@
import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from preprocess import gen_data_set, gen_model_input
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.models import Model

from deepmatch.models import *
import tensorflow.keras.backend as K
import tensorflow as tf

from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()  # disable eager mode so the custom loss can reference model tensors

def dual_augmented_loss(p_u, p_v, a_u, a_v):
def loss(y_true, y_pred):
y_ = K.cast(y_true, tf.float32)
loss_p = K.mean(K.square(y_ - y_pred))
loss_u = K.mean(K.square(y_ * a_u + (1 - y_) * p_v - p_v))
loss_v = K.mean(K.square(y_ * a_v + (1 - y_) * p_u - p_u))
return loss_p + 0.5 * loss_u + 0.5 * loss_v
return loss
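# For y = 1 the mimic terms reduce to ||a_u - p_v||^2 and ||a_v - p_u||^2,
# pulling each tower's output toward the other tower's gradient-stopped
# augmented vector (the Adaptive-Mimic Mechanism of the DAT paper); for
# y = 0 both terms vanish, so only positive pairs contribute.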

if __name__ == "__main__":

data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]
SEQ_LEN = 50
negsample = 3

# 1. Label Encoding for sparse features, and process sequence features with `gen_data_set` and `gen_model_input`

features = ['user_id', 'movie_id', 'gender', 'age', 'occupation', 'zip']
feature_max_idx = {}
for feature in features:
lbe = LabelEncoder()
data[feature] = lbe.fit_transform(data[feature]) + 1
feature_max_idx[feature] = data[feature].max() + 1

user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')

item_profile = data[["movie_id"]].drop_duplicates('movie_id')

user_profile.set_index("user_id", inplace=True)

user_item_list = data.groupby("user_id")['movie_id'].apply(list)

train_set, test_set = gen_data_set(data, negsample)

train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)
train_model_input['aug_inp_user_id'] = train_model_input['user_id']
train_model_input['aug_inp_movie_id'] = train_model_input['movie_id']
test_model_input['aug_inp_user_id'] = test_model_input['user_id']
test_model_input['aug_inp_movie_id'] = test_model_input['movie_id']

# 2.count #unique features for each sparse field and generate feature config for sequence feature

embedding_dim = 8

user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
SparseFeat("age", feature_max_idx['age'], embedding_dim),
SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
]

item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

# 3.Define Model and train

user_dnn_hidden_units = (32, 16, 8)
item_dnn_hidden_units = (32, 16, 8)
model, y_pred, p_u, p_v, a_u, a_v = DAT(user_feature_columns, item_feature_columns,
user_dnn_hidden_units=user_dnn_hidden_units,
item_dnn_hidden_units=item_dnn_hidden_units)

model.compile(optimizer='adagrad', loss=dual_augmented_loss(p_u, p_v, a_u, a_v))

history = model.fit(train_model_input, train_label,
batch_size=256, epochs=1, verbose=1, validation_split=0.0)

# 4. Generate user features for testing and full item features for retrieval
test_user_model_input = test_model_input
all_item_model_input = {"movie_id": item_profile['movie_id'].values, "aug_inp_movie_id": item_profile['movie_id'].values}

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

print(user_embs.shape)
print(item_embs.shape)

# 5. [Optional] ANN search by faiss and evaluate the result

# test_true_label = {line[0]:[line[2]] for line in test_set}
#
# import numpy as np
# import faiss
# from tqdm import tqdm
# from deepmatch.utils import recall_N
#
# index = faiss.IndexFlatIP(user_dnn_hidden_units[-1])
# # faiss.normalize_L2(item_embs)
# index.add(item_embs)
# # faiss.normalize_L2(user_embs)
# D, I = index.search(user_embs, 50)
# s = []
# hit = 0
# for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
# try:
# pred = [item_profile['movie_id'].values[x] for x in I[i]]
# filter_item = None
# recall_score = recall_N(test_true_label[uid], pred, N=50)
# s.append(recall_score)
# if test_true_label[uid] in pred:
# hit += 1
# except:
# print(i)
# print("recall", np.mean(s))
# print("hr", hit / len(test_user_model_input['user_id']))