Orca: Add 2 NCF PyTorch examples with data_loader or XShards as inputs. #5691

Merged Nov 3, 2022 — 77 commits (showing changes from 64 commits)

Commits
84e2887
Add files via upload
zpeng1898 Sep 7, 2022
45a0b16
Merge branch 'intel-analytics:main' into zypbranch
zpeng1898 Sep 8, 2022
64e9826
Delete backend_ray.py
zpeng1898 Sep 8, 2022
617eb05
Delete backend_spark.py
zpeng1898 Sep 8, 2022
dcae8fb
Delete model.py
zpeng1898 Sep 8, 2022
bd1b747
Add files via upload
zpeng1898 Sep 8, 2022
ec2085d
Merge branch 'intel-analytics:main' into zypbranch
zpeng1898 Sep 9, 2022
7a9734c
Delete train_xshards.py
zpeng1898 Sep 9, 2022
35d4abc
Add files via upload
zpeng1898 Sep 9, 2022
c7a1302
Delete train_data_loader.py
zpeng1898 Sep 13, 2022
616ae22
Merge branch 'intel-analytics:main' into zypbranch
zpeng1898 Sep 13, 2022
995fb38
Delete train_xshards.py
zpeng1898 Sep 13, 2022
89af496
Add files via upload
zpeng1898 Sep 13, 2022
2dd3145
Add files via upload
zpeng1898 Sep 13, 2022
01a15b6
Delete train_data_loader.py
zpeng1898 Sep 14, 2022
c023cad
Delete train_xshards.py
zpeng1898 Sep 14, 2022
4fb75d3
Merge branch 'intel-analytics:main' into zypbranch
zpeng1898 Sep 15, 2022
4a12f12
Add files via upload
zpeng1898 Sep 15, 2022
c307b67
Merge branch 'intel-analytics:main' into zypbranch
zpeng1898 Sep 15, 2022
98a5704
Update train_xshards.py
zpeng1898 Sep 15, 2022
c3e38a8
Update train_xshards.py
zpeng1898 Sep 15, 2022
d161990
Merge branch 'intel-analytics:main' into zypbranch
zpeng1898 Sep 16, 2022
84eef4e
Update __init__.py
zpeng1898 Sep 16, 2022
2d9f565
Update preprocessing.py
zpeng1898 Sep 16, 2022
7eae073
Add files via upload
zpeng1898 Sep 16, 2022
6ce1214
Merge branch 'intel-analytics:main' into zypbranch
zpeng1898 Sep 19, 2022
2501d73
Update __init__.py
zpeng1898 Sep 19, 2022
36129c1
Update preprocessing.py
zpeng1898 Sep 19, 2022
cadffaa
Delete train_xshards_add_features.py
zpeng1898 Sep 19, 2022
3192856
Update train_data_loader.py
zpeng1898 Sep 19, 2022
987c631
Update train_xshards.py
zpeng1898 Sep 19, 2022
c38600f
Update model.py
zpeng1898 Sep 19, 2022
2f7acfa
Update train_xshards.py
zpeng1898 Sep 19, 2022
50cee71
Update test_spark_backend.py
zpeng1898 Sep 19, 2022
0e9d396
Update train_xshards.py
zpeng1898 Sep 19, 2022
72facb9
Update train_data_loader.py
zpeng1898 Sep 19, 2022
05f8ee6
Merge branch 'intel-analytics:main' into zypbranch
zpeng1898 Sep 19, 2022
11a7681
Delete test_spark_backend.py
zpeng1898 Sep 20, 2022
caec95e
Add files via upload
zpeng1898 Sep 20, 2022
f84dee8
Update model.py
zpeng1898 Sep 22, 2022
c60a99e
Update model.py
zpeng1898 Sep 26, 2022
447f22c
Update train_xshards.py
zpeng1898 Sep 26, 2022
88448d2
Update train_data_loader.py
zpeng1898 Sep 26, 2022
8c31003
Update train_data_loader.py
zpeng1898 Sep 26, 2022
602a6b2
Update train_xshards.py
zpeng1898 Sep 26, 2022
5bc8227
Merge branch 'intel-analytics:main' into zypbranch
zpeng1898 Sep 26, 2022
780dc59
Update model.py
zpeng1898 Sep 26, 2022
a08de8e
Update train_data_loader.py
zpeng1898 Sep 26, 2022
9248258
Update train_xshards.py
zpeng1898 Sep 26, 2022
94ee598
Update train_data_loader.py
zpeng1898 Sep 27, 2022
0b72cbf
Update train_xshards.py
zpeng1898 Sep 27, 2022
e2e0c8e
Update train_data_loader.py
zpeng1898 Sep 27, 2022
d117ca9
Update train_xshards.py
zpeng1898 Sep 27, 2022
036b41f
Create model.py
zpeng1898 Sep 30, 2022
0d99df2
Update train_data_loader.py
zpeng1898 Sep 30, 2022
d7d07ed
Update train_xshards.py
zpeng1898 Sep 30, 2022
222d984
Update model.py
zpeng1898 Oct 11, 2022
bc983d1
Update train_data_loader.py
zpeng1898 Oct 11, 2022
f028a29
Update train_xshards.py
zpeng1898 Oct 11, 2022
0654723
Update train_xshards.py
zpeng1898 Oct 11, 2022
70c794d
Update train_xshards.py
zpeng1898 Oct 11, 2022
29762f3
Update train_xshards.py
zpeng1898 Oct 13, 2022
b9f4835
Update train_data_loader.py
zpeng1898 Oct 13, 2022
0b6964a
Update model.py
zpeng1898 Oct 13, 2022
eaa0623
Update train_xshards.py
zpeng1898 Oct 21, 2022
fc86575
Update train_data_loader.py
zpeng1898 Oct 21, 2022
dc6bea6
Update model.py
zpeng1898 Oct 21, 2022
328bea3
Update train_data_loader.py
zpeng1898 Oct 21, 2022
63e7278
Update train_xshards.py
zpeng1898 Oct 21, 2022
4854b52
Update train_data_loader.py
zpeng1898 Oct 25, 2022
05a6dd5
Update train_data_loader.py
zpeng1898 Oct 28, 2022
6330bb9
Update model.py
zpeng1898 Nov 3, 2022
cd47319
Update train_data_loader.py
zpeng1898 Nov 3, 2022
85b8fe0
Update train_xshards.py
zpeng1898 Nov 3, 2022
84b5c61
Update model.py
zpeng1898 Nov 3, 2022
15bd60e
Update train_data_loader.py
zpeng1898 Nov 3, 2022
8fd7f9f
Update train_xshards.py
zpeng1898 Nov 3, 2022
129 changes: 129 additions & 0 deletions python/orca/tutorial/pytorch/NCF/model.py
@@ -0,0 +1,129 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Most of the pytorch code is adapted from guoyang9's NCF implementation for
# ml-1m dataset.
# https://github.com/guoyang9/NCF
#
hkvision marked this conversation as resolved.

import torch
import torch.nn as nn


class NCF(nn.Module):
def __init__(self, user_num, item_num, factor_num, num_layers, dropout,
model, GMF_model=None, MLP_model=None):
super(NCF, self).__init__()
"""
user_num: number of users;
item_num: number of items;
factor_num: number of predictive factors;
num_layers: the number of layers in MLP model;
dropout: dropout rate between fully connected layers;
model: 'MLP', 'GMF', 'NeuMF-end', and 'NeuMF-pre';
GMF_model: pre-trained GMF weights;
MLP_model: pre-trained MLP weights.
"""
self.dropout = dropout
self.model = model
self.GMF_model = GMF_model
self.MLP_model = MLP_model

self.embed_user_GMF = nn.Embedding(user_num, factor_num)
self.embed_item_GMF = nn.Embedding(item_num, factor_num)
self.embed_user_MLP = nn.Embedding(
user_num, factor_num * (2 ** (num_layers - 1)))
self.embed_item_MLP = nn.Embedding(
item_num, factor_num * (2 ** (num_layers - 1)))

MLP_modules = []
for i in range(num_layers):
input_size = factor_num * (2 ** (num_layers - i))
MLP_modules.append(nn.Dropout(p=self.dropout))
MLP_modules.append(nn.Linear(input_size, input_size//2))
MLP_modules.append(nn.ReLU())
self.MLP_layers = nn.Sequential(*MLP_modules)

if self.model in ['MLP', 'GMF']:
predict_size = factor_num
else:
predict_size = factor_num * 2
output_modules = []
output_modules.append(nn.Linear(predict_size, 1))
output_modules.append(nn.Sigmoid())
self.predict_layer = nn.Sequential(*output_modules)

self._init_weight_()

def _init_weight_(self):
""" We leave the weights initialization here. """
if not self.model == 'NeuMF-pre':
nn.init.normal_(self.embed_user_GMF.weight, std=0.01)
nn.init.normal_(self.embed_user_MLP.weight, std=0.01)
nn.init.normal_(self.embed_item_GMF.weight, std=0.01)
nn.init.normal_(self.embed_item_MLP.weight, std=0.01)

for m in self.MLP_layers:
if isinstance(m, nn.Linear):
nn.init.xavier_uniform_(m.weight)

for m in self.modules():
if isinstance(m, nn.Linear) and m.bias is not None:
m.bias.data.zero_()
else:
# embedding layers
self.embed_user_GMF.weight.data.copy_(self.GMF_model.embed_user_GMF.weight)
self.embed_item_GMF.weight.data.copy_(self.GMF_model.embed_item_GMF.weight)
self.embed_user_MLP.weight.data.copy_(self.MLP_model.embed_user_MLP.weight)
self.embed_item_MLP.weight.data.copy_(self.MLP_model.embed_item_MLP.weight)

# mlp layers
for (m1, m2) in zip(self.MLP_layers, self.MLP_model.MLP_layers):
if isinstance(m1, nn.Linear) and isinstance(m2, nn.Linear):
m1.weight.data.copy_(m2.weight)
m1.bias.data.copy_(m2.bias)

# predict layers (predict_layer is an nn.Sequential; index [0] is the Linear)
predict_weight = torch.cat([
self.GMF_model.predict_layer[0].weight,
self.MLP_model.predict_layer[0].weight], dim=1)
predict_bias = self.GMF_model.predict_layer[0].bias + \
self.MLP_model.predict_layer[0].bias

self.predict_layer[0].weight.data.copy_(0.5 * predict_weight)
self.predict_layer[0].bias.data.copy_(0.5 * predict_bias)

def forward(self, *args):
user, item = args[0], args[1]
Contributor comment: put user, item in the args directly?


if not self.model == 'MLP':
embed_user_GMF = self.embed_user_GMF(user)
embed_item_GMF = self.embed_item_GMF(item)
output_GMF = embed_user_GMF * embed_item_GMF
if not self.model == 'GMF':
embed_user_MLP = self.embed_user_MLP(user)
embed_item_MLP = self.embed_item_MLP(item)
interaction = torch.cat((embed_user_MLP, embed_item_MLP), -1)
output_MLP = self.MLP_layers(interaction)

if self.model == 'GMF':
concat = output_GMF
elif self.model == 'MLP':
concat = output_MLP
else:
concat = torch.cat((output_GMF, output_MLP), -1)

prediction = self.predict_layer(concat)
return prediction.view(-1)
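For reference, the layer-size arithmetic performed by `NCF.__init__` above can be sketched as a small standalone helper (the name `mlp_layer_sizes` is illustrative, not part of the PR):

```python
def mlp_layer_sizes(factor_num, num_layers):
    """Return the (in, out) sizes of each Linear in the NCF MLP tower.

    The user and item MLP embeddings each have dimension
    factor_num * 2**(num_layers - 1); their concatenation feeds the first
    Linear, and every layer halves the width down to factor_num.
    """
    sizes = []
    for i in range(num_layers):
        in_size = factor_num * (2 ** (num_layers - i))
        sizes.append((in_size, in_size // 2))
    return sizes

# With the values used in model_creator below (factor_num=32, num_layers=3):
print(mlp_layer_sizes(32, 3))  # [(256, 128), (128, 64), (64, 32)]
```

The final output width equals `factor_num`, which is why `predict_size` is `factor_num` for the pure 'MLP' and 'GMF' variants and `factor_num * 2` when both towers are concatenated.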
170 changes: 170 additions & 0 deletions python/orca/tutorial/pytorch/NCF/train_data_loader.py
@@ -0,0 +1,170 @@
#
# Copyright 2016 The BigDL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Most of the pytorch code is adapted from guoyang9's NCF implementation for
# ml-1m dataset.
# https://github.com/guoyang9/NCF
#

import numpy as np
import pandas as pd
import scipy.sparse as sp
Contributor comment: move import scipy to local?


# Step 1: Init Orca Context

from bigdl.orca import init_orca_context, stop_orca_context
init_orca_context()

# Step 2: Define Train Dataset

from sklearn.model_selection import train_test_split
import torch.utils.data as data


class NCFData(data.Dataset):
def __init__(self, features,
item_num=0, train_mat=None, num_ng=4, is_training=False):
super(NCFData, self).__init__()
""" Note that the labels are only useful when training, we thus
add them in the ng_sample() function.
"""
self.features_ps = features
self.item_num = item_num
self.train_mat = train_mat
self.num_ng = num_ng
self.is_training = is_training
self.labels = [0 for _ in range(len(features))]

def ng_sample(self):
assert self.is_training, 'no need to sample when testing'

self.features_ng = []
for x in self.features_ps:
u = x[0]
for t in range(self.num_ng):
j = np.random.randint(self.item_num)
while (u, j) in self.train_mat:
j = np.random.randint(self.item_num)
self.features_ng.append([u, j])

labels_ps = [1 for _ in range(len(self.features_ps))]
labels_ng = [0 for _ in range(len(self.features_ng))]

self.features_fill = self.features_ps + self.features_ng
self.labels_fill = labels_ps + labels_ng

def __len__(self):
if self.is_training:
return (self.num_ng + 1) * len(self.labels)
return len(self.labels)

def __getitem__(self, idx):
features = self.features_fill if self.is_training else self.features_ps
labels = self.labels_fill if self.is_training else self.labels

user = features[idx][0]
item = features[idx][1]
label = float(labels[idx])
return user, item, label


def train_loader_func(config, batch_size):
data_X = pd.read_csv(
"ml-1m/ratings.dat",
sep="::", header=None, names=['user', 'item'],
usecols=[0, 1], dtype={0: np.int64, 1: np.int64})
user_num = data_X['user'].max() + 1
item_num = data_X['item'].max() + 1

data_X = data_X.values.tolist()

# load ratings as a dok matrix
train_mat = sp.dok_matrix((user_num, item_num), dtype=np.int64)
for x in data_X:
train_mat[x[0], x[1]] = 1

train_data, _ = train_test_split(data_X, test_size=0.1, random_state=100)

train_dataset = NCFData(train_data, item_num=item_num, train_mat=train_mat, num_ng=4, is_training=True)
train_loader = data.DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=0)
Contributor comment: num_workers=4 in the original code?

Contributor comment: batch_size=batch_size, and put 256 in fit

train_loader.dataset.ng_sample() # sample negative items for training datasets
return train_loader


def test_loader_func(config, batch_size):
data_X = pd.read_csv(
"ml-1m/ratings.dat",
sep="::", header=None, names=['user', 'item'],
usecols=[0, 1], dtype={0: np.int64, 1: np.int64})

data_X = data_X.values.tolist()

_, test_data = train_test_split(data_X, test_size=0.1, random_state=100)

test_dataset = NCFData(test_data)
test_loader = data.DataLoader(test_dataset, shuffle=False, num_workers=0)
Contributor comment: missing batch_size

return test_loader

# Step 3: Define the Model

from model import NCF
import torch.nn as nn
import torch.optim as optim


def model_creator(config):
data_X = pd.read_csv(
"ml-1m/ratings.dat",
sep="::", header=None, names=['user', 'item'],
usecols=[0, 1], dtype={0: np.int64, 1: np.int64})
user_num = data_X['user'].max() + 1
item_num = data_X['item'].max() + 1

model = NCF(user_num, item_num,
factor_num=32, num_layers=3, dropout=0.0, model="NeuMF-end")
model.train()
hkvision marked this conversation as resolved.
return model


def optimizer_creator(model, config):
return optim.Adam(model.parameters(), lr=0.001)

loss_function = nn.BCELoss()  # the model already ends with nn.Sigmoid, so plain BCE avoids a double sigmoid

# Step 4: Fit with Orca Estimator

from bigdl.orca.learn.pytorch import Estimator
from bigdl.orca.learn.metrics import Accuracy

# Create the estimator
backend = "ray" # "ray" or "spark"
est = Estimator.from_torch(model=model_creator, optimizer=optimizer_creator,
loss=loss_function, metrics=[Accuracy()], backend=backend)
hkvision marked this conversation as resolved.

# Fit the estimator
est.fit(data=train_loader_func, epochs=1)
Contributor comment: the original script trains for 20 epochs?

Contributor comment: batch_size=256


# Step 5: Evaluate and Save the Model

# Evaluate the model
result = est.evaluate(data=test_loader_func)
Contributor comment: Add one more print to say it is evaluation results?

for r in result:
print(r, ":", result[r])

# Save the model
est.save("NCF_model")

# Stop orca context when program finishes
stop_orca_context()
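As an aside, the negative-sampling step that `NCFData.ng_sample()` performs above can be illustrated with a dependency-free sketch (a plain Python set stands in for the scipy dok_matrix, and `negative_sample` is a hypothetical helper, not part of the PR):

```python
import random


def negative_sample(positives, item_num, num_ng, seed=0):
    """For each positive (user, item) pair, draw num_ng random items
    the user has not interacted with, re-drawing on collisions —
    the same rejection-sampling loop ng_sample() uses."""
    rng = random.Random(seed)
    seen = set(positives)  # plays the role of the dok_matrix lookup
    negatives = []
    for user, _ in positives:
        for _ in range(num_ng):
            item = rng.randrange(item_num)
            while (user, item) in seen:
                item = rng.randrange(item_num)
            negatives.append((user, item))
    return negatives


positives = [(0, 1), (1, 2), (1, 5)]
print(negative_sample(positives, item_num=10, num_ng=4))
```

As in the dataset class, the training set then becomes the positives (label 1) concatenated with these negatives (label 0), which is why `__len__` returns `(num_ng + 1) * len(self.labels)` when training.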