SessionRecommender python api and example (#1465)

* session recommender python
intel · Aug 8, 2019 · 5e3f3ef · 5e3f3ef
1 parent fc7f219
commit 5e3f3ef
Showing 3 changed files with 252 additions and 0 deletions.
diff --git a/pyzoo/test/zoo/models/recommendation/test_sessionrecommender.py b/pyzoo/test/zoo/models/recommendation/test_sessionrecommender.py
@@ -0,0 +1,78 @@
+#
+# Copyright 2018 Analytics Zoo Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import pytest
+
+import random
+
+from zoo.pipeline.api.keras.layers import *
+from zoo.models.recommendation.session_recommender import SessionRecommender
+from test.zoo.pipeline.utils.test_utils import ZooTestCase
+
+np.random.seed(1337)  # for reproducibility
+
+
+class TestSessionRecommender(ZooTestCase):
+    def test_forward_backward_without_history(self):
+        model = SessionRecommender(30, 5, [10, 5], 2)
+        input_data = np.random.randint(1, 30, size=(10, 2))
+        self.assert_forward_backward(model, input_data)
+
+    def test_forward_backward_with_history(self):
+        model = SessionRecommender(30, 5, [10, 5], 2, True, [6, 3], 5)
+        input_data = [np.random.randint(1, 30, size=(10, 2)),
+                      np.random.randint(1, 30, size=(10, 5))]
+        self.assert_forward_backward(model, input_data)
+
+    def test_save_load(self):
+        model = SessionRecommender(30, 5, [10, 5], 2, True, [6, 3], 5)
+        input_data = [np.random.randint(1, 30, size=(10, 2)),
+                      np.random.randint(1, 30, size=(10, 5))]
+        self.assert_zoo_model_save_load(model, input_data)
+
+    def test_compile_fit(self):
+        model = SessionRecommender(30, 5, [10, 5], 2, True, [6, 3], 5)
+        input_data = [[np.random.randint(1, 30, size=(2)),
+                       np.random.randint(1, 30, size=(5)),
+                       np.random.randint(1, 30)] for i in range(100)]
+        samples = self.sc.parallelize(input_data)\
+            .map(lambda x: Sample.from_ndarray((x[0], x[1]), np.array(x[2])))
+        train, test = samples.randomSplit([0.8, 0.2], seed=1)
+        model.compile(loss='sparse_categorical_crossentropy',
+                      optimizer='rmsprop',
+                      metrics=['top5Accuracy'])
+        model.fit(train, batch_size=4, nb_epoch=1, validation_data=test)
+
+    def test_recommed_predict(self):
+        model = SessionRecommender(30, 5, [10, 5], 2, True, [6, 3], 5)
+        input_data = [[np.random.randint(1, 30, size=(2)),
+                       np.random.randint(1, 30, size=(5)),
+                       np.random.randint(1, 30)] for i in range(100)]
+        samples = [Sample.from_ndarray((input_data[i][0], input_data[i][1]),
+                                       np.array(input_data[i][2])) for i in range(100)]
+        rdd = self.sc.parallelize(samples)
+        results1 = model.predict(rdd).collect()
+        print(results1[0])
+
+        recommendations1 = model.recommend_for_session(rdd, 3, zero_based_label=False).collect()
+        print(recommendations1[0])
+
+        recommendations2 = model.recommend_for_session(samples, 3, zero_based_label=False)
+        print(recommendations2[0])
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/pyzoo/zoo/models/recommendation/session_recommender.py b/pyzoo/zoo/models/recommendation/session_recommender.py
@@ -0,0 +1,143 @@
+#
+# Copyright 2018 Analytics Zoo Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import sys
+
+from zoo.models.common import KerasZooModel
+from zoo.models.recommendation import Recommender
+from zoo.pipeline.api.keras.layers import *
+from zoo.pipeline.api.keras.models import *
+
+if sys.version >= '3':
+    long = int
+    unicode = str
+
+
+class SessionRecommender(Recommender):
+    """
+    The Session Recommender model used for recommendation.
+
+    # Arguments
+     item_ount: The number of distinct items. Positive integer.
+     item_embed: The output size of embedding layer. Positive integer.
+     rnn_hidden_layers: Units of hidden layers for the mlp model. Array of positive integers.
+     session_length: The max number of items in the sequence of a session
+     include_history: Whether to include purchase history. Boolean. Default is true.
+     mlp_hidden_layers: Units of hidden layers for the mlp model. Array of positive integers.
+     history_length: The max number of items in the sequence of historical purchase
+     """
+    def __init__(self, item_count, item_embed, rnn_hidden_layers, session_length,
+                 include_history=False, mlp_hidden_layers=[10, 5], his_length=2,
+                 bigdl_type="float"):
+        self.item_count = int(item_count)
+        self.item_embed = int(item_embed)
+        self.mlp_hidden_layers = [int(unit) for unit in mlp_hidden_layers]
+        self.rnn_hidden_layers = [int(unit) for unit in rnn_hidden_layers]
+        self.include_history = include_history
+        self.session_length = int(session_length)
+        self.his_length = int(his_length)
+        self.bigdl_type = bigdl_type
+        self.model = self.build_model()
+        super(SessionRecommender, self).__init__(None, self.bigdl_type,
+                                                 self.item_count,
+                                                 self.item_embed,
+                                                 self.rnn_hidden_layers,
+                                                 self.session_length,
+                                                 self.include_history,
+                                                 self.mlp_hidden_layers,
+                                                 self.his_length,
+                                                 self.model)
+
+    def build_model(self):
+        input_rnn = Input(shape=(self.session_length,))
+        session_table = Embedding(self.item_count + 1, self.item_embed, init="uniform")(input_rnn)
+
+        gru = GRU(self.rnn_hidden_layers[0], return_sequences=True)(session_table)
+        for hidden in range(1, len(self.rnn_hidden_layers) - 1):
+            gru = GRU(self.rnn_hidden_layers[hidden], return_sequences=True)(gru)
+        gru_last = GRU(self.rnn_hidden_layers[-1], return_sequences=False)(gru)
+        rnn = Dense(self.item_count)(gru_last)
+
+        if self.include_history:
+            input_mlp = Input(shape=(self.his_length,))
+            his_table = Embedding(self.item_count + 1, self.item_embed, init="uniform")(input_mlp)
+            embedSum = KerasLayerWrapper(Sum(dimension=2))(his_table)
+            flatten = Flatten()(embedSum)
+            mlp = Dense(self.mlp_hidden_layers[0], activation="relu")(flatten)
+            for hidden in range(1, len(self.mlp_hidden_layers)):
+                mlp = Dense(self.mlp_hidden_layers[hidden], activation="relu")(mlp)
+            mlp_last = Dense(self.item_count)(mlp)
+            merged = merge(inputs=[rnn, mlp_last], mode="sum")
+            out = Activation(activation="softmax")(merged)
+            model = Model(input=[input_rnn, input_mlp], output=out)
+        else:
+            out = Activation(activation="softmax")(rnn)
+            model = Model(input=input_rnn, output=out)
+        return model
+
+    def recommend_for_user(self, feature_rdd, max_items):
+        raise Exception("recommend_for_user: Unsupported for SessionRecommender")
+
+    def recommend_for_item(self, feature_rdd, max_users):
+        raise Exception("recommend_for_item: Unsupported for SessionRecommender")
+
+    def predict_user_item_pair(self, feature_rdd):
+        raise Exception("predict_user_item_pair: Unsupported for SessionRecommender")
+
+    def recommend_for_session(self, sessions, max_items, zero_based_label):
+        """
+        recommend for sessions given rdd of samples or list of samples.
+
+        # Arguments
+        sessions: rdd of samples or list of samples.
+        max_items:   Number of items to be recommended to each user. Positive integer.
+        zero_based_label: True if data starts from 0, False if data starts from 1
+        :return rdd of list of list(item, probability),
+        """
+        if isinstance(sessions, list):
+            sc = get_spark_context()
+            sessions_rdd = sc.parallelize(sessions)
+        elif (isinstance(sessions, RDD)):
+                sessions_rdd = sessions
+        else:
+            raise TypeError("Unsupported training data type: %s" % type(sessions))
+        results = callBigDlFunc(self.bigdl_type, "recommendForSession",
+                                self.value,
+                                sessions_rdd,
+                                max_items,
+                                zero_based_label)
+
+        if isinstance(sessions, list):
+            return results.collect()
+        else:
+            return results
+
+    @staticmethod
+    def load_model(path, weight_path=None, bigdl_type="float"):
+        """
+        Load an existing SessionRecommender model (with weights).
+
+        # Arguments
+        path: The path for the pre-defined model.
+              Local file system, HDFS and Amazon S3 are supported.
+              HDFS path should be like 'hdfs://[host]:[port]/xxx'.
+              Amazon S3 path should be like 's3a://bucket/xxx'.
+        weight_path: The path for pre-trained weights if any. Default is None.
+        """
+        jmodel = callBigDlFunc(bigdl_type, "loadSessionRecommender", path, weight_path)
+        model = KerasZooModel._do_load(jmodel, bigdl_type)
+        model.__class__ = SessionRecommender
+        return model
diff --git a/zoo/src/main/scala/com/intel/analytics/zoo/models/python/PythonZooModel.scala b/zoo/src/main/scala/com/intel/analytics/zoo/models/python/PythonZooModel.scala
@@ -310,6 +310,26 @@ class PythonZooModel[T: ClassTag](implicit ev: TensorNumeric[T]) extends PythonZ
     WideAndDeep.loadModel(path, weightPath)
   }
 
+  def createZooSessionRecommender(
+      itemCount: Int,
+      itemEmbed: Int,
+      rnnHiddenLayers: JList[Int],
+      sessionLength: Int,
+      includeHistory: Boolean,
+      mlpHiddenLayers: JList[Int],
+      historyLength: Int,
+      model: AbstractModule[Activity, Activity, T]): SessionRecommender[T] = {
+    new SessionRecommender[T](itemCount, itemEmbed, rnnHiddenLayers.asScala.toArray, sessionLength,
+      includeHistory, mlpHiddenLayers.asScala.toArray, historyLength)
+      .addModel(model.asInstanceOf[AbstractModule[Tensor[T], Tensor[T], T]])
+  }
+
+  def loadSessionRecommender(
+      path: String,
+      weightPath: String = null): SessionRecommender[T] = {
+    SessionRecommender.loadModel(path, weightPath)
+  }
+
   def toUserItemFeatureRdd(featureRdd: JavaRDD[Array[Object]]): RDD[UserItemFeature[T]] = {
     featureRdd.rdd.foreach(x =>
       require(x.length == 3, "UserItemFeature should consist of userId, itemId and sample"))
@@ -347,6 +367,17 @@ class PythonZooModel[T: ClassTag](implicit ev: TensorNumeric[T]) extends PythonZ
     toPredictionJavaRdd(predictionRdd)
   }
 
+  def recommendForSession(
+      model: SessionRecommender[T],
+      featureRdd: JavaRDD[Sample],
+      maxItems: Int,
+      zeroBasedLabel: Boolean): JavaRDD[JList[JList[Float]]] = {
+    val predictionRdd: RDD[Array[(Int, Float)]] = model
+      .recommendForSession(toJSample(featureRdd), maxItems, zeroBasedLabel)
+
+    predictionRdd.map(x => x.toList.map(y => List(y._1.toFloat, y._2).asJava).asJava).toJavaRDD()
+  }
+
   def getNegativeSamples(indexed: DataFrame): DataFrame = {
     Utils.getNegativeSamples(indexed)
   }