diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 0a09cefada..3cdde7513d 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -12,4 +12,5 @@
- [ ] I have followed the [contribution guidelines](../CONTRIBUTING.md) and code style for this project.
- [ ] I have added tests covering my contributions.
-- [ ] I have updated the documentation accordingly.
\ No newline at end of file
+- [ ] I have updated the documentation accordingly.
+- [ ] This PR is being made to `staging` and not `master`.
diff --git a/README.md b/README.md
index 2bf2d44759..c14c4ee23b 100644
--- a/README.md
+++ b/README.md
@@ -98,13 +98,14 @@ The following tests run on a Windows and Linux DSVM daily. These machines run 24
| Build Type | Branch | Status | | Branch | Status |
| --- | --- | --- | --- | --- | --- |
-| **Linux CPU** | master | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly?branchName=master)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=4792) | | staging | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly_staging?branchName=staging)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=4594) |
-| **Linux GPU** | master | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly_gpu?branchName=master)](https://msdata.visualstudio.com/DefaultCollection/AlgorithmsAndDataScience/_build/latest?definitionId=4997) | | staging | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly_gpu_staging?branchName=staging)](https://msdata.visualstudio.com/DefaultCollection/AlgorithmsAndDataScience/_build/latest?definitionId=4998) |
-| **Linux Spark** | master | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly_spark?branchName=master)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=4804) | | staging | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/Recommenders/nightly_spark_staging)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=5186) |
-| **Windows CPU** | master | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly_win?branchName=master)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=6743) | | staging | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly_staging_win?branchName=staging)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=6752) |
-| **Windows GPU** | master | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly_gpu_win?branchName=master)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=6756) | | staging | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly_gpu_staging_win?branchName=staging)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=6761) |
-| **Windows Spark** | master | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly_spark_win?branchName=master)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=6757) | | staging | [![Status](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_apis/build/status/nightly_spark_staging_win?branchName=staging)](https://msdata.visualstudio.com/AlgorithmsAndDataScience/_build/latest?definitionId=6754) |
-
+| **Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/linux-tests/dsvm_nightly_linux_cpu?branchName=master)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=67&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/linux-tests/dsvm_nightly_linux_cpu?branchName=staging)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=67&branchName=staging) |
+| **Linux GPU** | master | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/linux-tests/dsvm_nightly_linux_gpu?branchName=master)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=85&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/linux-tests/dsvm_nightly_linux_gpu?branchName=staging)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=85&branchName=staging) |
+| **Linux Spark** | master | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/linux-tests/dsvm_nightly_linux_pyspark?branchName=master)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=86&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/linux-tests/dsvm_nightly_linux_pyspark?branchName=staging)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=86&branchName=staging) |
+| **Windows CPU** | master | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/windows-tests/dsvm_nightly_win_cpu?branchName=master)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=101&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/windows-tests/dsvm_nightly_win_cpu?branchName=staging)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=101&branchName=staging) |
+| **Windows GPU** | master | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/windows-tests/dsvm_nightly_win_gpu?branchName=master)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=102&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/windows-tests/dsvm_nightly_win_gpu?branchName=staging)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=102&branchName=staging) |
+| **Windows Spark** | master | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/windows-tests/dsvm_nightly_win_pyspark?branchName=master)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=103&branchName=master) | | staging | [![Build Status](https://dev.azure.com/best-practices/recommenders/_apis/build/status/windows-tests/dsvm_nightly_win_pyspark?branchName=staging)](https://dev.azure.com/best-practices/recommenders/_build/latest?definitionId=103&branchName=staging) |
+
+
### Related projects
-
-[Microsoft AI Labs Github](https://aka.ms/ai-labs) Find other Best Practice projects, and Azure AI design patterns in our central repository.
+[Microsoft AI GitHub](https://github.com/microsoft/ai): Find other Best Practice projects and Azure AI design patterns in our central repository.
diff --git a/SETUP.md b/SETUP.md
index 6b28baa61f..012caea608 100644
--- a/SETUP.md
+++ b/SETUP.md
@@ -81,9 +81,73 @@ To install the PySpark environment:
python scripts/generate_conda_file.py --pyspark
conda env create -f reco_pyspark.yaml
-Additionally, if you want to test a particular version of spark, you may pass the --pyspark-version argument:
+> Additionally, if you want to test a particular version of Spark, you may pass the `--pyspark-version` argument:
+>
+>     python scripts/generate_conda_file.py --pyspark-version 2.4.0
+
+Then, we need to set the environment variables `PYSPARK_PYTHON` and `PYSPARK_DRIVER_PYTHON` to point to the conda python executable.
+
+Click on the following menus to see details:
+
+<details>
+<summary>Linux or MacOS</summary>
+
+To set these variables every time the environment is activated, we can follow the steps of this [guide](https://conda.io/docs/user-guide/tasks/manage-environments.html#macos-and-linux).
+First, get the path where the `reco_pyspark` environment is installed:
+
+ RECO_ENV=$(conda env list | grep reco_pyspark | awk '{print $NF}')
+
+Then, create the file `$RECO_ENV/etc/conda/activate.d/env_vars.sh` and add:
+
+ #!/bin/sh
+ RECO_ENV=$(conda env list | grep reco_pyspark | awk '{print $NF}')
+ export PYSPARK_PYTHON=$RECO_ENV/bin/python
+ export PYSPARK_DRIVER_PYTHON=$RECO_ENV/bin/python
+ export SPARK_HOME_BACKUP=$SPARK_HOME
+ unset SPARK_HOME
+
+This will export the variables every time we do `conda activate reco_pyspark`.
+To unset these variables when we deactivate the environment,
+create the file `$RECO_ENV/etc/conda/deactivate.d/env_vars.sh` and add:
+
+ #!/bin/sh
+ unset PYSPARK_PYTHON
+ unset PYSPARK_DRIVER_PYTHON
+ export SPARK_HOME=$SPARK_HOME_BACKUP
+ unset SPARK_HOME_BACKUP
+
+
- python scripts/generate_conda_file.py --pyspark-version 2.4.0
+</details>
+
+<details>
+<summary>Windows</summary>
+
+To set these variables every time the environment is activated, we can follow the steps of this [guide](https://conda.io/docs/user-guide/tasks/manage-environments.html#windows).
+First, get the path where the `reco_pyspark` environment is installed:
+
+ for /f "delims=" %A in ('conda env list ^| grep reco_pyspark ^| awk "{print $NF}"') do set "RECO_ENV=%A"
+
+Then, create the file `%RECO_ENV%\etc\conda\activate.d\env_vars.bat` and add:
+
+ @echo off
+ for /f "delims=" %%A in ('conda env list ^| grep reco_pyspark ^| awk "{print $NF}"') do set "RECO_ENV=%%A"
+ set PYSPARK_PYTHON=%RECO_ENV%\python.exe
+ set PYSPARK_DRIVER_PYTHON=%RECO_ENV%\python.exe
+ set SPARK_HOME_BACKUP=%SPARK_HOME%
+ set SPARK_HOME=
+ set PYTHONPATH_BACKUP=%PYTHONPATH%
+ set PYTHONPATH=
+
+This will set the variables every time we do `conda activate reco_pyspark`.
+To unset these variables when we deactivate the environment,
+create the file `%RECO_ENV%\etc\conda\deactivate.d\env_vars.bat` and add:
+
+ @echo off
+ set PYSPARK_PYTHON=
+ set PYSPARK_DRIVER_PYTHON=
+ set SPARK_HOME=%SPARK_HOME_BACKUP%
+ set SPARK_HOME_BACKUP=
+ set PYTHONPATH=%PYTHONPATH_BACKUP%
+ set PYTHONPATH_BACKUP=
+
+</details>
@@ -97,68 +161,12 @@ To install the environment:
python scripts/generate_conda_file.py --gpu --pyspark
conda env create -f reco_full.yaml
+Then, we need to set the environment variables `PYSPARK_PYTHON` and `PYSPARK_DRIVER_PYTHON` to point to the conda python executable.
+See the **PySpark environment** setup section above for details on how to set those variables;
+you will need to change the `reco_pyspark` string in the commands to `reco_full`.
-> **NOTE** - for PySpark environments (`reco_pyspark` and `reco_full`), we need to set the environment variables
-> `PYSPARK_PYTHON` and `PYSPARK_DRIVER_PYTHON` to point to the conda python executable.
->
-> Click on the following menus to see details:
->
->
-> Linux or MacOS
->
-> To set these variables every time the environment is activated, we can follow the steps of this [guide](https://conda.io/docs/user-guide/tasks/manage-environments.html#macos-and-linux).
-> Assuming that we have installed the environment in `/anaconda/envs/reco_pyspark`,
-> create the file `/anaconda/envs/reco_pyspark/etc/conda/activate.d/env_vars.sh` and add:
->
-> #!/bin/sh
-> export PYSPARK_PYTHON=/anaconda/envs/reco_pyspark/bin/python
-> export PYSPARK_DRIVER_PYTHON=/anaconda/envs/reco_pyspark/bin/python
-> export SPARK_HOME_BACKUP=$SPARK_HOME
-> unset SPARK_HOME
->
-> This will export the variables every time we do `conda activate reco_pyspark`.
-> To unset these variables when we deactivate the environment,
-> create the file `/anaconda/envs/reco_pyspark/etc/conda/deactivate.d/env_vars.sh` and add:
->
-> #!/bin/sh
-> unset PYSPARK_PYTHON
-> unset PYSPARK_DRIVER_PYTHON
-> export SPARK_HOME=$SPARK_HOME_BACKUP
-> unset SPARK_HOME_BACKUP
->
->
->
-> Windows
->
-> To set these variables every time the environment is activated, we can follow the steps of this [guide](https://conda.io/docs/user-guide/tasks/manage-environments.html#windows).
-> Assuming that we have installed the environment in `c:\anaconda\envs\reco_pyspark`,
-> create the file `c:\anaconda\envs\reco_pyspark\etc\conda\activate.d\env_vars.bat` and add:
->
-> @echo off
-> set PYSPARK_PYTHON=c:\anaconda\envs\reco_pyspark\python.exe
-> set PYSPARK_DRIVER_PYTHON=c:\anaconda\envs\reco_pyspark\python.exe
-> set SPARK_HOME_BACKUP=%SPARK_HOME%
-> set SPARK_HOME=
-> set PYTHONPATH_BACKUP=%PYTHONPATH%
-> set PYTHONPATH=
->
-> This will export the variables every time we do `conda activate reco_pyspark`.
-> To unset these variables when we deactivate the environment,
-> create the file `c:\anaconda\envs\reco_pyspark\etc\conda\deactivate.d\env_vars.bat` and add:
->
-> @echo off
-> set PYSPARK_PYTHON=
-> set PYSPARK_DRIVER_PYTHON=
-> set SPARK_HOME=%SPARK_HOME_BACKUP%
-> set SPARK_HOME_BACKUP=
-> set PYTHONPATH=%PYTHONPATH_BACKUP%
-> set PYTHONPATH_BACKUP=
->
->
-
-
### Register the conda environment as a kernel in Jupyter
We can register our created conda environment to appear as a kernel in the Jupyter notebooks.
diff --git a/docker/Dockerfile b/docker/Dockerfile
index bf3d18c430..479bfbcab5 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -14,7 +14,7 @@ WORKDIR ${HOME}
# Install base dependencies
RUN apt-get update && \
- apt-get install -y curl git
+ apt-get install -y curl git
# Install Anaconda
ARG ANACONDA="https://repo.continuum.io/miniconda/Miniconda3-4.6.14-Linux-x86_64.sh"
@@ -42,7 +42,6 @@ RUN mkdir ${HOME}/.jupyter && \
# CPU Stage
FROM base AS cpu
-# Setup Conda environment
RUN python recommenders/scripts/generate_conda_file.py --name base
diff --git a/notebooks/00_quick_start/README.md b/notebooks/00_quick_start/README.md
index 2d968665a8..685252abdd 100644
--- a/notebooks/00_quick_start/README.md
+++ b/notebooks/00_quick_start/README.md
@@ -15,7 +15,7 @@ data preparation, model building, and model evaluation by using the utility func
| [rbm](rbm_movielens.ipynb)| MovieLens | Python CPU, GPU | Utilizing the Restricted Boltzmann Machine (rbm) [4] to predict movie ratings in a Python+GPU (TensorFlow) environment.
| [rlrmc](rlrmc_movielens.ipynb) | Movielens | Python CPU | Utilizing the Riemannian Low-rank Matrix Completion (RLRMC) [6] to predict movie rating in a Python+CPU environment
| [sar](sar_movielens.ipynb) | MovieLens | Python CPU | Utilizing Simple Algorithm for Recommendation (SAR) algorithm to predict movie ratings in a Python+CPU environment.
-| [sar_azureml](sar_movielens_with_azureml.ipynb)| MovieLens | Python CPU | An example of how to utilize and evaluate SAR using the [Azure Machine Learning service](https://docs.microsoft.com/azure/machine-learning/service/overview-what-is-azure-ml)(AzureML). It takes the content of the [sar quickstart notebook](sar_movielens.ipynb) and demonstrates how to use the power of the cloud to manage data, switch to powerful GPU machines, and monitor runs while training a model.
+| [sar_azureml](sar_movielens_with_azureml.ipynb)| MovieLens | Python CPU | An example of how to utilize and evaluate SAR using the [Azure Machine Learning service](https://docs.microsoft.com/azure/machine-learning/service/overview-what-is-azure-ml) (AzureML). It takes the content of the [sar quickstart notebook](sar_movielens.ipynb) and demonstrates how to use the power of the cloud to manage data, switch to powerful GPU machines, and monitor runs while training a model.
| [wide-and-deep](wide_deep_movielens.ipynb) | MovieLens | Python CPU, GPU | Utilizing Wide-and-Deep Model (Wide-and-Deep) [5] to predict movie ratings in a Python+GPU (TensorFlow) environment.
| [xdeepfm](xdeepfm_criteo.ipynb) | Criteo, Synthetic Data | Python CPU, GPU | Utilizing the eXtreme Deep Factorization Machine (xDeepFM) [3] to learn both low and high order feature interactions for predicting CTR, in a Python+GPU (TensorFlow) environment.
diff --git a/notebooks/00_quick_start/sar_movielens.ipynb b/notebooks/00_quick_start/sar_movielens.ipynb
index fc36437be4..d8d19df340 100644
--- a/notebooks/00_quick_start/sar_movielens.ipynb
+++ b/notebooks/00_quick_start/sar_movielens.ipynb
@@ -41,16 +41,16 @@
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n",
+ "System version: 3.7.3 | packaged by conda-forge | (default, Jul 1 2019, 21:52:21) \n",
"[GCC 7.3.0]\n",
- "Pandas version: 0.24.1\n"
+ "Pandas version: 0.23.4\n"
]
}
],
@@ -60,12 +60,11 @@
"sys.path.append(\"../../\")\n",
"\n",
"import logging\n",
- "import time\n",
- "\n",
"import numpy as np\n",
"import pandas as pd\n",
"import papermill as pm\n",
"\n",
+ "from reco_utils.common.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.python_splitters import python_stratified_split\n",
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
@@ -91,7 +90,7 @@
},
{
"cell_type": "code",
- "execution_count": 72,
+ "execution_count": 3,
"metadata": {
"tags": [
"parameters"
@@ -115,14 +114,14 @@
},
{
"cell_type": "code",
- "execution_count": 73,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "4.93MB [00:01, 3.46MB/s] \n"
+ "100%|██████████| 4.81k/4.81k [00:02<00:00, 1.90kKB/s]\n"
]
},
{
@@ -201,7 +200,7 @@
"4 166 346 1.0 886397596"
]
},
- "execution_count": 73,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -228,7 +227,7 @@
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -237,7 +236,7 @@
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -299,7 +298,7 @@
},
{
"cell_type": "code",
- "execution_count": 76,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -334,73 +333,47 @@
},
{
"cell_type": "code",
- "execution_count": 77,
+ "execution_count": 8,
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2019-05-07 11:16:40,709 INFO Collecting user affinity matrix\n",
- "2019-05-07 11:16:40,715 INFO Calculating time-decayed affinities\n",
- "2019-05-07 11:16:40,766 INFO Creating index columns\n",
- "2019-05-07 11:16:40,782 INFO Building user affinity sparse matrix\n",
- "2019-05-07 11:16:40,787 INFO Calculating item co-occurrence\n",
- "2019-05-07 11:16:40,910 INFO Calculating item similarity\n",
- "2019-05-07 11:16:40,910 INFO Using jaccard based similarity\n",
- "2019-05-07 11:16:40,990 INFO Done training\n"
- ]
- },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Took 0.284792423248291 seconds for training.\n"
+ "Took 0.3302565817721188 seconds for training.\n"
]
}
],
"source": [
- "start_time = time.time()\n",
- "\n",
- "model.fit(train)\n",
+ "with Timer() as train_time:\n",
+ " model.fit(train)\n",
"\n",
- "train_time = time.time() - start_time\n",
- "print(\"Took {} seconds for training.\".format(train_time))"
+ "print(\"Took {} seconds for training.\".format(train_time.interval))"
]
},
{
"cell_type": "code",
- "execution_count": 78,
+ "execution_count": 9,
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2019-05-07 11:16:41,003 INFO Calculating recommendation scores\n",
- "2019-05-07 11:16:41,114 INFO Removing seen items\n"
- ]
- },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Took 0.1463017463684082 seconds for prediction.\n"
+ "Took 0.21034361701458693 seconds for prediction.\n"
]
}
],
"source": [
- "start_time = time.time()\n",
- "\n",
- "top_k = model.recommend_k_items(test, remove_seen=True)\n",
+ "with Timer() as test_time:\n",
+ " top_k = model.recommend_k_items(test, remove_seen=True)\n",
"\n",
- "test_time = time.time() - start_time\n",
- "print(\"Took {} seconds for prediction.\".format(test_time))"
+ "print(\"Took {} seconds for prediction.\".format(test_time.interval))"
]
},
{
"cell_type": "code",
- "execution_count": 79,
+ "execution_count": 10,
"metadata": {
"scrolled": true
},
@@ -435,32 +408,32 @@
"
\n",
" 0 \n",
" 1 \n",
- " 58 \n",
- " 3.049881 \n",
+ " 204 \n",
+ " 3.313306 \n",
" \n",
" \n",
" 1 \n",
" 1 \n",
- " 7 \n",
- " 3.053073 \n",
+ " 89 \n",
+ " 3.280465 \n",
" \n",
" \n",
" 2 \n",
" 1 \n",
- " 318 \n",
- " 3.059262 \n",
+ " 11 \n",
+ " 3.233867 \n",
" \n",
" \n",
" 3 \n",
" 1 \n",
- " 210 \n",
- " 3.095604 \n",
+ " 367 \n",
+ " 3.192575 \n",
" \n",
" \n",
" 4 \n",
" 1 \n",
- " 96 \n",
- " 3.124997 \n",
+ " 423 \n",
+ " 3.131517 \n",
" \n",
" \n",
"\n",
@@ -468,19 +441,20 @@
],
"text/plain": [
" userID itemID prediction\n",
- "0 1 58 3.049881\n",
- "1 1 7 3.053073\n",
- "2 1 318 3.059262\n",
- "3 1 210 3.095604\n",
- "4 1 96 3.124997"
+ "0 1 204 3.313306\n",
+ "1 1 89 3.280465\n",
+ "2 1 11 3.233867\n",
+ "3 1 367 3.192575\n",
+ "4 1 423 3.131517"
]
},
+ "execution_count": 10,
"metadata": {},
- "output_type": "display_data"
+ "output_type": "execute_result"
}
],
"source": [
- "display(top_k.head())"
+ "top_k.head()"
]
},
{
@@ -494,7 +468,7 @@
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -503,7 +477,7 @@
},
{
"cell_type": "code",
- "execution_count": 81,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -512,7 +486,7 @@
},
{
"cell_type": "code",
- "execution_count": 82,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -521,7 +495,7 @@
},
{
"cell_type": "code",
- "execution_count": 83,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -530,7 +504,7 @@
},
{
"cell_type": "code",
- "execution_count": 84,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -557,17 +531,9 @@
},
{
"cell_type": "code",
- "execution_count": 85,
+ "execution_count": 16,
"metadata": {},
"outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2019-05-07 11:16:42,926 INFO Calculating recommendation scores\n",
- "2019-05-07 11:16:43,033 INFO Removing seen items\n"
- ]
- },
{
"data": {
"text/html": [
@@ -650,7 +616,7 @@
"4 876 288 3.0 879428101 NaN"
]
},
- "execution_count": 85,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -737,8 +703,8 @@
"pm.record(\"ndcg\", eval_ndcg)\n",
"pm.record(\"precision\", eval_precision)\n",
"pm.record(\"recall\", eval_recall)\n",
- "pm.record(\"train_time\", train_time)\n",
- "pm.record(\"test_time\", test_time)"
+ "pm.record(\"train_time\", train_time.interval)\n",
+ "pm.record(\"test_time\", test_time.interval)"
]
}
],
@@ -759,7 +725,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.8"
+ "version": "3.7.3"
}
},
"nbformat": 4,
diff --git a/notebooks/00_quick_start/sar_movielens_with_azureml.ipynb b/notebooks/00_quick_start/sar_movielens_with_azureml.ipynb
index f99e1eced6..e09110cd04 100644
--- a/notebooks/00_quick_start/sar_movielens_with_azureml.ipynb
+++ b/notebooks/00_quick_start/sar_movielens_with_azureml.ipynb
@@ -48,8 +48,7 @@
"- SAR does not use item or user features, so cannot handle cold-start use cases\n",
"- SAR requires the creation of an $mxm$ dense matrix (where $m$ is the number of items). So memory consumption can be an issue with large numbers of items.\n",
"- SAR is best used for ranking items per user, as the scale of predicted ratings may be different from the input range and will differ across users.\n",
- "For more details see the deep dive notebook on SAR here: [SAR Deep Dive Notebook](../02_model/sar_deep_dive.ipynb)",
- "\n",
+ "For more details see the deep dive notebook on SAR here: [SAR Deep Dive Notebook](../02_model/sar_deep_dive.ipynb)\n",
"---\n",
"## Prerequisities\n",
" - **Azure Subscription**\n",
@@ -60,14 +59,14 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "azureml.core version: 1.0.23\n"
+ "azureml.core version: 1.0.18\n"
]
}
],
@@ -78,6 +77,7 @@
"\n",
"import os\n",
"import shutil\n",
+ "import numpy as np\n",
"from tempfile import TemporaryDirectory\n",
"\n",
"import azureml\n",
@@ -94,7 +94,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 26,
"metadata": {
"tags": [
"parameters"
@@ -106,7 +106,7 @@
"TOP_K = 10\n",
"\n",
"# Select Movielens data size: 100k, 1m, 10m, or 20m\n",
- "MOVIELENS_DATA_SIZE = '1m'"
+ "MOVIELENS_DATA_SIZE = '100k'"
]
},
{
@@ -126,19 +126,9 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Performing interactive authentication. Please follow the instructions on the terminal.\n",
- "To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code AA9E5YB5M to authenticate.\n",
- "Found the config file in: /data/home/testuser/notebooks/Recommenders/notebooks/00_quick_start/.azureml/config.json\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"ws = get_or_create_workspace(\n",
" subscription_id=\"\",\n",
@@ -158,7 +148,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@@ -178,31 +168,23 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
- "5.92MB [00:00, 13.8MB/s] \n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Uploading /tmp/tmpc98zwvek/movielens_1m_data.pkl\n",
- "Uploaded /tmp/tmpc98zwvek/movielens_1m_data.pkl, 1 files out of an estimated total of 1\n"
+ "100%|██████████| 4.81k/4.81k [00:02<00:00, 1.98kKB/s]\n"
]
},
{
"data": {
"text/plain": [
- "$AZUREML_DATAREFERENCE_3d32ed3550b24ea9af3ce37c4977b877"
+ "$AZUREML_DATAREFERENCE_57dbc7117f67479892135cec2819b78b"
]
},
- "execution_count": 16,
+ "execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
@@ -221,7 +203,7 @@
"data.to_pickle(os.path.join(tmp_dir.name, data_file_name))\n",
"\n",
"ds = ws.get_default_datastore()\n",
- "ds.upload(src_dir=tmp_dir.name, target_path=TARGET_DIR, overwrite=True, show_progress=True)"
+ "ds.upload(src_dir=tmp_dir.name, target_path=TARGET_DIR, overwrite=True, show_progress=False)"
]
},
{
@@ -261,14 +243,18 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Found existing compute target\n"
+ "Creating a new compute target...\n",
+ "Creating\n",
+ "Succeeded\n",
+ "AmlCompute wait for completion finished\n",
+ "Minimum number of nodes requested have been provisioned\n"
]
}
],
@@ -309,7 +295,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@@ -330,14 +316,14 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Writing /tmp/tmpc98zwvek/movielens-sar/train.py\n"
+ "Writing /tmp/tmp6imrlt0z/movielens-sar/train.py\n"
]
}
],
@@ -350,16 +336,21 @@
"import pandas as pd\n",
"import itertools\n",
"import logging\n",
- "import time\n",
"\n",
"from azureml.core import Run\n",
"from sklearn.externals import joblib\n",
"\n",
+ "from reco_utils.common.timer import Timer\n",
"from reco_utils.dataset import movielens\n",
- "from reco_utils.dataset.python_splitters import python_random_split\n",
+ "from reco_utils.dataset.python_splitters import python_stratified_split\n",
"from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k\n",
"from reco_utils.recommender.sar.sar_singlenode import SARSingleNode\n",
"\n",
+ "\n",
+ "logging.basicConfig(level=logging.DEBUG, \n",
+ " format='%(asctime)s %(levelname)-8s %(message)s')\n",
+ "\n",
+ "\n",
"TARGET_DIR = 'movielens'\n",
"OUTPUT_FILE_NAME = 'outputs/movielens_sar_model.pkl'\n",
"MODEL_FILE_NAME = 'movielens_sar_model.pkl'\n",
@@ -376,17 +367,7 @@
"parser.add_argument('--data-size', type=str, dest='data_size', default=10, help='Movielens data size: 100k, 1m, 10m, or 20m')\n",
"args = parser.parse_args()\n",
"\n",
- "data_pickle_path = os.path.join(args.data_folder, args.data_file)\n",
- "\n",
- "data = pd.read_pickle(path=data_pickle_path)\n",
- "\n",
- "# Log arguments to the run for tracking\n",
- "run.log(\"top-k\", args.top_k)\n",
- "run.log(\"data-size\", args.data_size)\n",
- "\n",
- "train, test = python_random_split(data)\n",
- "\n",
- "# instantiate the SAR algorithm and set the index\n",
+ "# set col names\n",
"header = {\n",
" \"col_user\": \"UserId\",\n",
" \"col_item\": \"MovieId\",\n",
@@ -394,34 +375,39 @@
" \"col_timestamp\": \"Timestamp\",\n",
"}\n",
"\n",
- "logging.basicConfig(level=logging.DEBUG, \n",
- " format='%(asctime)s %(levelname)-8s %(message)s')\n",
+ "# read data\n",
+ "data_pickle_path = os.path.join(args.data_folder, args.data_file)\n",
+ "data = pd.read_pickle(path=data_pickle_path)\n",
"\n",
+ "# Log arguments to the run for tracking\n",
+ "run.log(\"top-k\", args.top_k)\n",
+ "run.log(\"data-size\", args.data_size)\n",
+ "\n",
+ "# split dataset into train and test\n",
+ "train, test = python_stratified_split(data, ratio=0.75, col_user=header[\"col_user\"], col_item=header[\"col_item\"], seed=42)\n",
+ "\n",
+ "# instantiate the model\n",
"model = SARSingleNode(\n",
- " remove_seen=True, similarity_type=\"jaccard\", \n",
- " time_decay_coefficient=30, time_now=None, timedecay_formula=True, **header\n",
+ " similarity_type=\"jaccard\", \n",
+ " time_decay_coefficient=30, \n",
+ " time_now=None, \n",
+ " timedecay_formula=True, \n",
+ " **header\n",
")\n",
"\n",
"# train the SAR model\n",
- "start_time = time.time()\n",
+ "with Timer() as t:\n",
+ " model.fit(train)\n",
"\n",
- "model.fit(train)\n",
+ "run.log(name=\"Training time\", value=t.interval)\n",
"\n",
- "train_time = time.time() - start_time\n",
- "run.log(name=\"Training time\", value=train_time)\n",
+ "# predict top k items\n",
+ "with Timer() as t:\n",
+ " top_k = model.recommend_k_items(test, remove_seen=True)\n",
"\n",
- "start_time = time.time()\n",
+ "run.log(name=\"Prediction time\", value=t.interval)\n",
"\n",
- "top_k = model.recommend_k_items(test)\n",
- "\n",
- "test_time = time.time() - start_time\n",
- "run.log(name=\"Prediction time\", value=test_time)\n",
- "\n",
- "# TODO: remove this call when the model returns same type as input\n",
- "top_k['UserId'] = pd.to_numeric(top_k['UserId'])\n",
- "top_k['MovieId'] = pd.to_numeric(top_k['MovieId'])\n",
- "\n",
- "# evaluate\n",
+ "# compute evaluation metrics\n",
"eval_map = map_at_k(test, top_k, col_user=\"UserId\", col_item=\"MovieId\", \n",
" col_rating=\"Rating\", col_prediction=\"prediction\", \n",
" relevancy_method=\"top_k\", k=args.top_k)\n",
@@ -449,16 +435,16 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'/tmp/tmpc98zwvek/movielens-sar/reco_utils'"
+ "'/tmp/tmp6imrlt0z/movielens-sar/reco_utils'"
]
},
- "execution_count": 20,
+ "execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
@@ -466,6 +452,8 @@
"source": [
"# copy dependent python files\n",
"UTILS_DIR = os.path.join(SCRIPT_DIR, 'reco_utils')\n",
+ "if os.path.exists(UTILS_DIR):\n",
+ " shutil.rmtree(UTILS_DIR)\n",
"shutil.copytree('../../reco_utils/', UTILS_DIR)"
]
},
@@ -491,7 +479,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 39,
"metadata": {
"tags": [
"configure estimator"
@@ -526,7 +514,7 @@
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
@@ -551,13 +539,13 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
- "model_id": "5119a29a82044100be0e2f47e40aef15",
+ "model_id": "145080f4fc0e47c8a892ff7db3f3c08b",
"version_major": 2,
"version_minor": 0
},
@@ -583,9 +571,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 42,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ ""
+ ],
+ "text/plain": [
+ "Run(Experiment: movielens-sar,\n",
+ "Id: movielens-sar_1575027796_199dd2c6,\n",
+ "Type: azureml.scriptrun,\n",
+ "Status: Completed)"
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"run"
]
@@ -602,21 +607,31 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 43,
"metadata": {},
"outputs": [
{
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'top-k': 10, 'data-size': '1m', 'Training time': 2.6481945514678955, 'Prediction time': 2.9131650924682617, 'map': 0.0023399652870897034, 'ndcg': 0.031352549193757774, 'precision': 0.038807947019867554, 'recall': 0.014086226527787116}\n"
- ]
+ "data": {
+ "text/plain": [
+ "{'top-k': 10,\n",
+ " 'data-size': '100k',\n",
+ " 'Training time': 0.4077951559999633,\n",
+ " 'Prediction time': 0.13354294300000902,\n",
+ " 'map': 0.11059057578638949,\n",
+ " 'ndcg': 0.3824612290501957,\n",
+ " 'precision': 0.33075291622481445,\n",
+ " 'recall': 0.1763854474342893}"
+ ]
+ },
+ "execution_count": 43,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
"# run below after run is complete, otherwise metrics is empty\n",
"metrics = run.get_metrics()\n",
- "print(metrics)"
+ "metrics"
]
},
{
@@ -629,7 +644,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
@@ -643,21 +658,28 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"# clean up temporary directory\n",
"tmp_dir.cleanup()"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
- "display_name": "Python 3.6 - AzureML",
+ "display_name": "Python (reco_base)",
"language": "python",
- "name": "python3-azureml"
+ "name": "reco_base"
},
"language_info": {
"codemirror_mode": {
@@ -669,7 +691,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.7"
+ "version": "3.6.8"
}
},
"nbformat": 4,
diff --git a/notebooks/00_quick_start/sequential_recsys_amazondataset.ipynb b/notebooks/00_quick_start/sequential_recsys_amazondataset.ipynb
new file mode 100644
index 0000000000..97d04c884c
--- /dev/null
+++ b/notebooks/00_quick_start/sequential_recsys_amazondataset.ipynb
@@ -0,0 +1,501 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Copyright (c) Microsoft Corporation. All rights reserved. \n",
+ "\n",
+ "Licensed under the MIT License. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sequential Recommender Quick Start\n",
+ "\n",
+ "### Example: SLi_Rec : Adaptive User Modeling with Long and Short-Term Preferences for Personailzed Recommendation\n",
+ "Unlike a general recommender such as Matrix Factorization or xDeepFM (in the repo) which doesn't consider the order of the user's activities, sequential recommender systems take the sequence of the user behaviors as context and the goal is to predict the items that the user will interact in a short time (in an extreme case, the item that the user will interact next).\n",
+ "\n",
+ "This notebook aims to give you a quick example of how to train a sequential model based on a public Amazon dataset. Currently, we can support GRU4Rec \\[2\\], Caser \\[3\\] and SLi_Rec \\[1\\]. Without loss of generality, this notebook takes [SLi_Rec model](https://www.microsoft.com/en-us/research/uploads/prod/2019/07/IJCAI19-ready_v1.pdf) for example.\n",
+ "SLi_Rec \\[1\\] is a deep learning-based model aims at capturing both long and short-term user preferences for precise recommender systems. To summarize, SLi_Rec has the following key properties:\n",
+ "\n",
+ "* It adopts the attentive \"Asymmetric-SVD\" paradigm for long-term modeling;\n",
+ "* It takes both time irregularity and semantic irregularity into consideration by modifying the gating logic in LSTM.\n",
+ "* It uses an attention mechanism to dynamic fuse the long-term component and short-term component.\n",
+ "\n",
+ "In this notebook, we test SLi_Rec on a subset of the public dataset: [Amazon_reviews](http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Movies_and_TV_5.json.gz) and [Amazon_metadata](http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_Movies_and_TV.json.gz)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 0. Global Settings and Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "System version: 3.6.8 |Anaconda, Inc.| (default, Feb 21 2019, 18:30:04) [MSC v.1916 64 bit (AMD64)]\n",
+ "Tensorflow version: 1.12.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "sys.path.append(\"../../\")\n",
+ "import os\n",
+ "import logging\n",
+ "import papermill as pm\n",
+ "from tempfile import TemporaryDirectory\n",
+ "\n",
+ "import tensorflow as tf\n",
+ "import time\n",
+ "\n",
+ "from reco_utils.common.constants import SEED\n",
+ "from reco_utils.recommender.deeprec.deeprec_utils import (\n",
+ " prepare_hparams\n",
+ ")\n",
+ "from reco_utils.dataset.amazon_reviews import download_and_extract, data_preprocessing\n",
+ "from reco_utils.dataset.download_utils import maybe_download\n",
+ "from reco_utils.recommender.deeprec.models.sequential.sli_rec import SLI_RECModel\n",
+ "#### to use the other model, use one of the following lines:\n",
+ "#from reco_utils.recommender.deeprec.models.sequential.asvd import ASVDModel\n",
+ "#from reco_utils.recommender.deeprec.models.sequential.caser import CaserModel\n",
+ "#from reco_utils.recommender.deeprec.models.sequential.gru4rec import GRU4RecModel\n",
+ "\n",
+ "from reco_utils.recommender.deeprec.IO.sequential_iterator import SequentialIterator\n",
+ "\n",
+ "print(\"System version: {}\".format(sys.version))\n",
+ "print(\"Tensorflow version: {}\".format(tf.__version__))\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Parameters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "tags": [
+ "parameters"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "EPOCHS = 10\n",
+ "BATCH_SIZE = 400\n",
+ "RANDOM_SEED = SEED # Set None for non-deterministic result\n",
+ "yaml_file = '../../reco_utils/recommender/deeprec/config/sli_rec.yaml'\n",
+ "data_path = os.path.join(\"..\", \"..\", \"tests\", \"resources\", \"deeprec\", \"slirec\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. Input data format\n",
+ "The input data contains 8 columns, i.e., ` ` columns are seperated by `\"\\t\"`. item_id and category_id denote the target item and category, which means that for this instance, we want to guess whether user user_id will interact with item_id at timestamp. `` columns record the user behavior list up to ``, elements are separated by commas. `` is a binary value with 1 for positive instances and 0 for negative instances. One example for an instance is: \n",
+ "\n",
+ "`1 A1QQ86H5M2LVW2 B0059XTU1S Movies 1377561600 B002ZG97WE,B004IK30PA,B000BNX3AU,B0017ANB08,B005LAIHW2 Movies,Movies,Movies,Movies,Movies 1304294400,1304812800,1315785600,1316304000,1356998400` \n",
+ "\n",
+ "Only the SLi_Rec model is time-aware. For the other models, you can just pad some meaningless timestamp in the data files to fill up the format, the models will ignore these columns.\n",
+ "\n",
+ "We use Softmax to the loss function. In training and evalution stage, we group 1 positive instance with num_ngs negative instances. Pair-wise ranking can be regarded as a special case of Softmax ranking, where num_ngs is set to 1. \n",
+ "\n",
+ "More specifically, for training and evalation, you need to organize the data file such that each one positive instance is followd by num_ngs negative instances. Our program will take 1+num_ngs lines as a unit for Softmax calculation. num_ngs is a parameter you need to pass to the `prepare_hparams`, `fit` and `run_eval` function. `train_num_ngs` in `prepare_hparams` denotes the number of negative instances for training, where a recommended number is 4. `valid_num_ngs` and `num_ngs` in `fit` and `run_eval` denote the number in evalution. In evaluation, the model calculates metrics among the 1+num_ngs instances. For the `predict` function, since we only need to calcuate a socre for each individual instance, there is no need for num_ngs setting. More details and examples will be provided in the following sections.\n",
+ "\n",
+ "For training stage, if you don't want to prepare negative instances, you can just provide positive instances and set the parameter `need_sample=True, train_num_ngs=train_num_ngs` for function `prepare_hparams`, our model will dynamicly sample `train_num_ngs` instances as negative samples in each mini batch.\n",
+ "\n",
+ "### Amazon dataset\n",
+ "Now let's start with a public dataset containing product reviews and metadata from Amazon, which is widely used as a benchmark dataset in recommemdation systems field."
+ ]
+ },
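+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As an illustration only (the column names below are chosen here just for readability; the actual parsing is handled by the `SequentialIterator` introduced later), one such tab-separated file could be loaded for inspection with pandas:\n",
+    "\n",
+    "```python\n",
+    "import pandas as pd\n",
+    "\n",
+    "# the 8 tab-separated fields described above, in order\n",
+    "cols = [\"label\", \"user_id\", \"item_id\", \"category_id\", \"timestamp\",\n",
+    "        \"history_item_ids\", \"history_category_ids\", \"history_timestamps\"]\n",
+    "df = pd.read_csv(\"train_data\", sep=\"\\t\", names=cols)\n",
+    "print(df.head())\n",
+    "```"
+   ]
+  },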
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████████████████████████████████████████████████████████████████████████| 692k/692k [02:17<00:00, 5.02kKB/s]\n",
+ "100%|████████████████████████████████████████████████████████████████████████████| 97.5k/97.5k [00:24<00:00, 4.00kKB/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "start reviews preprocessing...\n",
+ "start meta preprocessing...\n",
+ "start create instances...\n",
+ "creating item2cate dict\n",
+ "getting sampled data...\n",
+ "start data processing...\n",
+ "data generating...\n",
+ "vocab generating...\n",
+ "start valid negative sampling\n",
+ "start test negative sampling\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "# for test\n",
+ "train_file = os.path.join(data_path, r'train_data')\n",
+ "valid_file = os.path.join(data_path, r'valid_data')\n",
+ "test_file = os.path.join(data_path, r'test_data')\n",
+ "user_vocab = os.path.join(data_path, r'user_vocab.pkl')\n",
+ "item_vocab = os.path.join(data_path, r'item_vocab.pkl')\n",
+ "cate_vocab = os.path.join(data_path, r'category_vocab.pkl')\n",
+ "output_file = os.path.join(data_path, r'output.txt')\n",
+ "\n",
+ "reviews_name = 'reviews_Movies_and_TV_5.json'\n",
+ "meta_name = 'meta_Movies_and_TV.json'\n",
+ "reviews_file = os.path.join(data_path, reviews_name)\n",
+ "meta_file = os.path.join(data_path, meta_name)\n",
+ "train_num_ngs = 4 # number of negative instances with a positive instance for training\n",
+ "valid_num_ngs = 4 # number of negative instances with a positive instance for validation\n",
+ "test_num_ngs = 9 # number of negative instances with a positive instance for testing\n",
+ "sample_rate = 0.01 # sample a small item set for training and testing here for fast example\n",
+ "\n",
+ "input_files = [reviews_file, meta_file, train_file, valid_file, test_file, user_vocab, item_vocab, cate_vocab]\n",
+ "\n",
+ "if not os.path.exists(train_file):\n",
+ " download_and_extract(reviews_name, reviews_file)\n",
+ " download_and_extract(meta_name, meta_file)\n",
+ " data_preprocessing(*input_files, sample_rate=sample_rate, valid_num_ngs=valid_num_ngs, test_num_ngs=test_num_ngs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 1.1 Prepare hyper-parameters\n",
+ "prepare_hparams() will create a full set of hyper-parameters for model training, such as learning rate, feature number, and dropout ratio. We can put those parameters in a yaml file (a complete list of parameters can be found under our config folder) , or pass parameters as the function's parameters (which will overwrite yaml settings).\n",
+ "\n",
+ "Parameters hints: \n",
+ "`need_sample` controls whether to perform dynamic negative sampling in mini-batch. \n",
+ "`train_num_ngs` indicates how many negative instances followed by one positive instances. \n",
+ "Examples: \n",
+ "(1) `need_sample=True and train_num_ngs=4`: There are only positive instances in your training file. Our model will dynamically sample 4 negative instances for each positive instances in mini-batch. Note that if need_sample is set to True, train_num_ngs should be greater than zero. \n",
+ "(2) `need_sample=False and train_num_ngs=4`: In your training file, each one positive line is followed by 4 negative lines. Note that if need_sample is set to False, you must provide a traiing file with negative instances, and train_num_ngs should match the number of negative number in your training file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "hparams = prepare_hparams(yaml_file, \n",
+ " embed_l2=0., \n",
+ " layer_l2=0., \n",
+ " learning_rate=0.001, \n",
+ " epochs=EPOCHS,\n",
+ " batch_size=BATCH_SIZE,\n",
+ " show_step=20,\n",
+ " MODEL_DIR=os.path.join(data_path, \"model/\"),\n",
+ " SUMMARIES_DIR=os.path.join(data_path, \"summary/\"),\n",
+ " user_vocab=user_vocab,\n",
+ " item_vocab=item_vocab,\n",
+ " cate_vocab=cate_vocab,\n",
+ " need_sample=True,\n",
+ " train_num_ngs=train_num_ngs, # provides the number of negative instances for each positive instance for loss computation.\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 1.2 Create data loader\n",
+ "Designate a data iterator for the model. All our sequential models use SequentialIterator. \n",
+ "data format is introduced aboved. \n",
+ "\n",
+ " Validation and testing data are files after negative sampling offline with the number of `` and ``."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "input_creator = SequentialIterator"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Create model\n",
+ "When both hyper-parameters and data iterator are ready, we can create a model:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "model = SLI_RECModel(hparams, input_creator, seed=RANDOM_SEED)\n",
+ "## of course you can create models like ASVDModel, CaserModel and GRU4RecModel in the same manner\n",
+ "\n",
+ "\n",
+ "## sometimes we don't want to train a model from scratch\n",
+ "## then we can load a pre-trained model like this: \n",
+ "#model.load_model(r'your_model_path')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now let's see what is the model's performance at this point (without starting training):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'auc': 0.5114, 'logloss': 0.6931, 'mean_mrr': 0.29, 'ndcg2': 0.4517, 'ndcg4': 0.4517, 'ndcg6': 0.4517, 'ndcg8': 0.4517, 'ndcg10': 0.4517, 'group_auc': 0.512}\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(model.run_eval(test_file, num_ngs=test_num_ngs)) # test_num_ngs is the number of negative lines after each positive line in your test_file"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "AUC=0.5 is a state of random guess. We can see that before training, the model behaves like random guessing.\n",
+ "\n",
+ "#### 2.1 Train model\n",
+ "Next we want to train the model on a training set, and check the performance on a validation dataset. Training the model is as simple as a function call:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "step 20 , total_loss: 1.6097, data_loss: 1.6097\n",
+ "step 40 , total_loss: 1.6087, data_loss: 1.6087\n",
+ "eval valid at epoch 1: auc:0.4895,logloss:0.693,mean_mrr:0.4475,ndcg2:0.5827,ndcg4:0.5827,ndcg6:0.5827,ndcg8:0.5827,ndcg10:0.5827,group_auc:0.4907\n",
+ "step 20 , total_loss: 1.6069, data_loss: 1.6069\n",
+ "step 40 , total_loss: 1.4812, data_loss: 1.4812\n",
+ "eval valid at epoch 2: auc:0.5625,logloss:0.6931,mean_mrr:0.4916,ndcg2:0.6164,ndcg4:0.6164,ndcg6:0.6164,ndcg8:0.6164,ndcg10:0.6164,group_auc:0.5422\n",
+ "step 20 , total_loss: 1.4089, data_loss: 1.4089\n",
+ "step 40 , total_loss: 1.3968, data_loss: 1.3968\n",
+ "eval valid at epoch 3: auc:0.684,logloss:0.6957,mean_mrr:0.5984,ndcg2:0.6985,ndcg4:0.6985,ndcg6:0.6985,ndcg8:0.6985,ndcg10:0.6985,group_auc:0.6787\n",
+ "step 20 , total_loss: 1.2920, data_loss: 1.2920\n",
+ "step 40 , total_loss: 1.3227, data_loss: 1.3227\n",
+ "eval valid at epoch 4: auc:0.6965,logloss:0.6827,mean_mrr:0.6145,ndcg2:0.7107,ndcg4:0.7107,ndcg6:0.7107,ndcg8:0.7107,ndcg10:0.7107,group_auc:0.6914\n",
+ "step 20 , total_loss: 1.3205, data_loss: 1.3205\n",
+ "step 40 , total_loss: 1.2936, data_loss: 1.2936\n",
+ "eval valid at epoch 5: auc:0.6986,logloss:0.6657,mean_mrr:0.6192,ndcg2:0.7142,ndcg4:0.7142,ndcg6:0.7142,ndcg8:0.7142,ndcg10:0.7142,group_auc:0.6965\n",
+ "step 20 , total_loss: 1.2575, data_loss: 1.2575\n",
+ "step 40 , total_loss: 1.2785, data_loss: 1.2785\n",
+ "eval valid at epoch 6: auc:0.7055,logloss:0.6147,mean_mrr:0.6197,ndcg2:0.7146,ndcg4:0.7146,ndcg6:0.7146,ndcg8:0.7146,ndcg10:0.7146,group_auc:0.699\n",
+ "step 20 , total_loss: 1.2735, data_loss: 1.2735\n",
+ "step 40 , total_loss: 1.2838, data_loss: 1.2838\n",
+ "eval valid at epoch 7: auc:0.7205,logloss:0.6434,mean_mrr:0.6345,ndcg2:0.7257,ndcg4:0.7257,ndcg6:0.7257,ndcg8:0.7257,ndcg10:0.7257,group_auc:0.7092\n",
+ "step 20 , total_loss: 1.1849, data_loss: 1.1849\n",
+ "step 40 , total_loss: 1.1954, data_loss: 1.1954\n",
+ "eval valid at epoch 8: auc:0.7234,logloss:0.6514,mean_mrr:0.6413,ndcg2:0.7308,ndcg4:0.7308,ndcg6:0.7308,ndcg8:0.7308,ndcg10:0.7308,group_auc:0.715\n",
+ "step 20 , total_loss: 1.2023, data_loss: 1.2023\n",
+ "step 40 , total_loss: 1.1818, data_loss: 1.1818\n",
+ "eval valid at epoch 9: auc:0.7285,logloss:0.6794,mean_mrr:0.639,ndcg2:0.7292,ndcg4:0.7292,ndcg6:0.7292,ndcg8:0.7292,ndcg10:0.7292,group_auc:0.7152\n",
+ "step 20 , total_loss: 1.1680, data_loss: 1.1680\n",
+ "step 40 , total_loss: 1.1911, data_loss: 1.1911\n",
+ "eval valid at epoch 10: auc:0.7317,logloss:0.6242,mean_mrr:0.6454,ndcg2:0.7339,ndcg4:0.7339,ndcg6:0.7339,ndcg8:0.7339,ndcg10:0.7339,group_auc:0.7181\n",
+ "[(1, {'auc': 0.4895, 'logloss': 0.693, 'mean_mrr': 0.4475, 'ndcg2': 0.5827, 'ndcg4': 0.5827, 'ndcg6': 0.5827, 'ndcg8': 0.5827, 'ndcg10': 0.5827, 'group_auc': 0.4907}), (2, {'auc': 0.5625, 'logloss': 0.6931, 'mean_mrr': 0.4916, 'ndcg2': 0.6164, 'ndcg4': 0.6164, 'ndcg6': 0.6164, 'ndcg8': 0.6164, 'ndcg10': 0.6164, 'group_auc': 0.5422}), (3, {'auc': 0.684, 'logloss': 0.6957, 'mean_mrr': 0.5984, 'ndcg2': 0.6985, 'ndcg4': 0.6985, 'ndcg6': 0.6985, 'ndcg8': 0.6985, 'ndcg10': 0.6985, 'group_auc': 0.6787}), (4, {'auc': 0.6965, 'logloss': 0.6827, 'mean_mrr': 0.6145, 'ndcg2': 0.7107, 'ndcg4': 0.7107, 'ndcg6': 0.7107, 'ndcg8': 0.7107, 'ndcg10': 0.7107, 'group_auc': 0.6914}), (5, {'auc': 0.6986, 'logloss': 0.6657, 'mean_mrr': 0.6192, 'ndcg2': 0.7142, 'ndcg4': 0.7142, 'ndcg6': 0.7142, 'ndcg8': 0.7142, 'ndcg10': 0.7142, 'group_auc': 0.6965}), (6, {'auc': 0.7055, 'logloss': 0.6147, 'mean_mrr': 0.6197, 'ndcg2': 0.7146, 'ndcg4': 0.7146, 'ndcg6': 0.7146, 'ndcg8': 0.7146, 'ndcg10': 0.7146, 'group_auc': 0.699}), (7, {'auc': 0.7205, 'logloss': 0.6434, 'mean_mrr': 0.6345, 'ndcg2': 0.7257, 'ndcg4': 0.7257, 'ndcg6': 0.7257, 'ndcg8': 0.7257, 'ndcg10': 0.7257, 'group_auc': 0.7092}), (8, {'auc': 0.7234, 'logloss': 0.6514, 'mean_mrr': 0.6413, 'ndcg2': 0.7308, 'ndcg4': 0.7308, 'ndcg6': 0.7308, 'ndcg8': 0.7308, 'ndcg10': 0.7308, 'group_auc': 0.715}), (9, {'auc': 0.7285, 'logloss': 0.6794, 'mean_mrr': 0.639, 'ndcg2': 0.7292, 'ndcg4': 0.7292, 'ndcg6': 0.7292, 'ndcg8': 0.7292, 'ndcg10': 0.7292, 'group_auc': 0.7152}), (10, {'auc': 0.7317, 'logloss': 0.6242, 'mean_mrr': 0.6454, 'ndcg2': 0.7339, 'ndcg4': 0.7339, 'ndcg6': 0.7339, 'ndcg8': 0.7339, 'ndcg10': 0.7339, 'group_auc': 0.7181})]\n",
+ "best epoch: 10\n",
+ "Time cost for training is 9.53 mins\n"
+ ]
+ }
+ ],
+ "source": [
+ "start_time = time.time()\n",
+ "model = model.fit(train_file, valid_file, valid_num_ngs=valid_num_ngs) \n",
+ "# valid_num_ngs is the number of negative lines after each positive line in your valid_file \n",
+ "# we will evaluate the performance of model on valid_file every epoch\n",
+ "end_time = time.time()\n",
+ "print('Time cost for training is {0:.2f} mins'.format((end_time-start_time)/60.0))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.2 Evaluate model\n",
+ "\n",
+ "Again, let's see what is the model's performance now (after training):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'auc': 0.7111, 'logloss': 0.6447, 'mean_mrr': 0.4673, 'ndcg2': 0.5934, 'ndcg4': 0.5934, 'ndcg6': 0.5934, 'ndcg8': 0.5934, 'ndcg10': 0.5934, 'group_auc': 0.698}\n"
+ ]
+ },
+ {
+ "data": {
+ "application/papermill.record+json": {
+ "res_syn": {
+ "auc": 0.7111,
+ "group_auc": 0.698,
+ "logloss": 0.6447,
+ "mean_mrr": 0.4673,
+ "ndcg10": 0.5934,
+ "ndcg2": 0.5934,
+ "ndcg4": 0.5934,
+ "ndcg6": 0.5934,
+ "ndcg8": 0.5934
+ }
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "res_syn = model.run_eval(test_file, num_ngs=test_num_ngs)\n",
+ "print(res_syn)\n",
+ "pm.record(\"res_syn\", res_syn)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If we want to get the full prediction scores rather than evaluation metrics, we can do this:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = model.predict(test_file, output_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# The data was downloaded in tmpdir folder. You can delete them manually if you do not need them any more."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### 2.3 Running models with large dataset\n",
+ "Here are performances using the whole amazon dataset among popular sequential models with 1,697,533 positive instances.\n",
+ " Settings for reproducing the results:\n",
+ " `learning_rate=0.001, dropout=0.3, item_embedding_dim=32, cate_embedding_dim=8, l2_norm=0, batch_size=400, \n",
+ "train_num_ngs=4, valid_num_ngs=4, test_num_ngs=49`\n",
+ "\n",
+ "\n",
+ "We compare the running time with CPU only and with GPU on the larger dataset. It appears that GPU can significantly accelerate the training. Hardware specification for running the large dataset: \n",
+ " GPU: Tesla P100-PCIE-16GB\n",
+ " CPU: 6 cores Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\n",
+ " \n",
+ "| Models | AUC | g-AUC | NDCG@2 | NDCG@10 | seconds per epoch on GPU | seconds per epoch on CPU| config |\n",
+ "| :------| :------: | :------: | :------: | :------: | :------: | :------: | :------ |\n",
+ "| ASVD | 0.8251 | 0.8178 | 0.2922 | 0.4264 | 249.5 | 440.0 | N/A |\n",
+ "| GRU4Rec | 0.8411 | 0.8332 | 0.3213 | 0.4547 | 439.0 | 4285.0 | max_seq_length=50, hidden_size=40|\n",
+ "| Caser | 0.8244 | 0.8171 | 0.283 | 0.4194 | 314.3 | 5369.9 | T=1, n_v=128, n_h=128, L=3, min_seq_length=5|\n",
+ "| SLi_Rec | 0.8631 | 0.8519 | 0.3491 | 0.4842 | 549.6 | 5014.0 | attention_size=40, max_seq_length=50, hidden_size=40|\n",
+ "\n",
+ " Note that the four models are grid searched with a coarse granularity and the results are for reference only. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Reference\n",
+ "\\[1\\] Zeping Yu, Jianxun Lian, Ahmad Mahmoody, Gongshen Liu, Xing Xie. Adaptive User Modeling with Long and Short-Term Preferences for Personailzed Recommendation. In Proceedings of the 28th International Joint Conferences on Artificial Intelligence, IJCAI’19, Pages 4213-4219. AAAI Press, 2019.\n",
+ "\n",
+ "\\[2\\] Balázs Hidasi, Alexandros Karatzoglou, Linas Baltrunas, Domonkos Tikk. Session-based Recommendations with Recurrent Neural Networks. ICLR (Poster) 2016\n",
+ "\n",
+ "\\[3\\] Tang, Jiaxi, and Ke Wang. Personalized top-n sequential recommendation via convolutional sequence embedding. Proceedings of the Eleventh ACM International Conference on Web Search and Data Mining. ACM, 2018."
+ ]
+ }
+ ],
+ "metadata": {
+ "celltoolbar": "Tags",
+ "kernelspec": {
+ "display_name": "Python (reco)",
+ "language": "python",
+ "name": "reco"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/02_model/fm_deep_dive.ipynb b/notebooks/02_model/fm_deep_dive.ipynb
new file mode 100644
index 0000000000..81e2615304
--- /dev/null
+++ b/notebooks/02_model/fm_deep_dive.ipynb
@@ -0,0 +1,894 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Copyright (c) Microsoft Corporation. All rights reserved. \n",
+ "\n",
+ "Licensed under the MIT License. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Factorization Machine Deep Dive\n",
+ "\n",
+ "Factorization machine (FM) is one of the representative algorithms that are used for building content-based recommenders model. The algorithm is powerful in terms of capturing the effects of not just the input features but also their interactions. The algorithm provides better generalization capability and expressiveness compared to other classic algorithms such as SVMs. The most recent research extends the basic FM algorithms by using deep learning techniques, which achieve remarkable improvement in a few practical use cases.\n",
+ "\n",
+ "This notebook presents a deep dive into the Factorization Machine algorithm, and demonstrates some best practices of using the contemporary FM implementations like [`xlearn`](https://github.com/aksnzhy/xlearn) for dealing with tasks like click-through rate prediction."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1 Factorization Machine"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1.1 Factorization Machine"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "FM is an algorithm that uses factorization in prediction tasks with data set of high sparsity. The algorithm was original proposed in [\\[1\\]](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf). Traditionally, the algorithms such as SVM do not perform well in dealing with highly sparse data that is usually seen in many contemporary problems, e.g., click-through rate prediction, recommendation, etc. FM handles the problem by modeling not just first-order linear components for predicting the label, but also the cross-product of the feature variables in order to capture more generalized correlation between variables and label. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In certain occasions, the data that appears in recommendation problems, such as user, item, and feature vectors, can be encoded into a one-hot representation. Under this arrangement, classical algorithms like linear regression and SVM may suffer from the following problems:\n",
+ "1. The feature vectors are highly sparse, and thus it makes it hard to optimize the parameters to fit the model efficienly\n",
+ "2. Cross-product of features will be sparse as well, and this in turn, reduces the expressiveness of a model if it is designed to capture the high-order interactions between features"
+ ]
+ },
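+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following sketch is purely illustrative (with made-up users and items) and simply shows how one-hot encoding inflates the feature space and how sparse the resulting vectors become; in real recommendation data sets with many more users and items the sparsity is far more extreme."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Tiny, made-up interaction table: each row is one (user, item) observation\n",
+ "toy = pd.DataFrame({\n",
+ "    'user': ['u1', 'u2', 'u3', 'u1'],\n",
+ "    'item': ['i1', 'i2', 'i3', 'i4']\n",
+ "})\n",
+ "\n",
+ "# One-hot encode users and items; each row now has exactly two non-zero entries\n",
+ "one_hot = pd.get_dummies(toy)\n",
+ "print(one_hot)\n",
+ "print('Fraction of non-zero entries: {:.2f}'.format(one_hot.values.mean()))"
+ ]
+ },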
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The FM algorithm is designed to tackle the above two problems by factorizing latent vectors that model the low- and high-order components. The general idea of a FM model is expressed in the following equation:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "$$\\hat{y}(\\textbf{x})=w_{0}+\\sum^{n}_{i=1}w_{i}x_{i}+\\sum^{n}_{i=1}\\sum^{n}_{j=i+1}<\\textbf{v}_{i}, \\textbf{v}_{j}>x_{i}x_{j}$$"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "where $\\hat{y}$ and $\\textbf{x}$ are the target to predict and input feature vectors, respectively. $w_{i}$ is the model parameters for the first-order component. $<\\textbf{v}_{i}, \\textbf{v}_{j}>$ is the dot product of two latent factors for the second-order interaction of feature variables, and it is defined as "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "$$<\\textbf{v}_{i}, \\textbf{v}_{j}>=\\sum^{k}_{f=1}v_{i,f}\\cdot v_{j,f}$$"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Compared to using fixed parameter for the high-order interaction components, using the factorized vectors increase generalization as well as expressiveness of the model. In addition to this, the computation complexity of the equation (above) is $O(kn)$ where $k$ and $n$ are the dimensionalities of the factorization vector and input feature vector, respectively (see [the paper](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf) for detailed discussion). In practice, usually a two-way FM model is used, i.e., only the second-order feature interactions are considered to favor computational efficiency."
+ ]
+ },
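+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick numerical check on the complexity claim (a minimal sketch with random toy data), the pairwise term $\\sum_{i<j}<\\textbf{v}_{i}, \\textbf{v}_{j}>x_{i}x_{j}$ can be computed either naively in $O(kn^{2})$ or through the equivalent $O(kn)$ reformulation $\\frac{1}{2}\\sum^{k}_{f=1}\\left[\\left(\\sum_{i}v_{i,f}x_{i}\\right)^{2}-\\sum_{i}v_{i,f}^{2}x_{i}^{2}\\right]$:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "rng = np.random.RandomState(42)\n",
+ "n, k = 10, 4                     # number of features and latent dimensions\n",
+ "x = rng.rand(n)                  # a toy (dense) feature vector\n",
+ "w0, w = rng.rand(), rng.rand(n)  # first-order parameters\n",
+ "V = rng.rand(n, k)               # latent factors, one row per feature\n",
+ "\n",
+ "# Naive O(k * n^2) evaluation of the pairwise interaction term\n",
+ "pairwise_naive = sum(V[i] @ V[j] * x[i] * x[j]\n",
+ "                     for i in range(n) for j in range(i + 1, n))\n",
+ "\n",
+ "# Equivalent O(k * n) evaluation (Rendle's reformulation)\n",
+ "pairwise_fast = 0.5 * np.sum((V.T @ x) ** 2 - (V.T ** 2) @ (x ** 2))\n",
+ "\n",
+ "y_hat = w0 + w @ x + pairwise_fast\n",
+ "print(np.isclose(pairwise_naive, pairwise_fast), y_hat)"
+ ]
+ },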
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1.2 Field-Aware Factorization Machine"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Field-aware factorization machine (FFM) is an extension to FM. It was originally introduced in [\\[2\\]](https://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf). The advantage of FFM over FM is that, it uses different factorized latent factors for different groups of features. The \"group\" is called \"field\" in the context of FFM. Putting features into fields resolves the issue that the latent factors shared by features that intuitively represent different categories of information may not well generalize the correlation. \n",
+ "\n",
+ "Different from the formula for the 2-order cross product as can be seen above in the FM equation, in the FFM settings, the equation changes to "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "$$\\theta_{\\text{FFM}}(\\textbf{w}\\textbf{x})=\\sum^{n}_{j1=1}\\sum^{n}_{j2=j1+1}<\\textbf{v}_{j1,f2}, \\textbf{v}_{j2,f1}>x_{j1}x_{j2}$$"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "where $f_1$ and $f_2$ are the fields of $j_1$ and $j_2$, respectively."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Compared to FM, the computational complexity increases to $O(n^2k)$. However, since the latent factors in FFM only need to learn the effect within the field, so the $k$ values in FFM is usually much smaller than that in FM."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1.3 FM/FFM extensions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the recent years, FM/FFM extensions were proposed to enhance the model performance further. The new algorithms leverage the powerful deep learning neural network to improve the generalization capability of the original FM/FFM algorithms. Representatives of the such algorithms are summarized as below. Some of them are implemented and demonstrated in the microsoft/recommenders repository. \n",
+ "\n",
+ "|Algorithm|Notes|References|Example in Microsoft/Recommenders|\n",
+ "|--------------------|---------------------|------------------------|\n",
+ "|DeepFM|Combination of FM and DNN where DNN handles high-order interactions|[\\[3\\]](https://arxiv.org/abs/1703.04247)|-|\n",
+ "|xDeepFM|Combination of FM, DNN, and Compressed Interaction Network, for vectorized feature interactions|[\\[4\\]](https://dl.acm.org/citation.cfm?id=3220023)|[notebook](https://github.com/microsoft/recommenders/blob/master/notebooks/00_quick_start/xdeepfm_criteo.ipynb) / [utilities](https://github.com/microsoft/recommenders/blob/master/reco_utils/recommender/deeprec/models/xDeepFM.py)|\n",
+ "|Factorization Machine Supported Neural Network|Use FM user/item weight vectors as input layers for DNN model|[\\[5\\]](https://link.springer.com/chapter/10.1007/978-3-319-30671-1_4)|-|\n",
+ "|Product-based Neural Network|An additional product-wise layer between embedding layer and fully connected layer to improve expressiveness of interactions of features across fields|[\\[6\\]](https://ieeexplore.ieee.org/abstract/document/7837964)|-|\n",
+ "|Neural Factorization Machines|Improve the factorization part of FM by using stacks of NN layers to improve non-linear expressiveness|[\\[7\\]](https://dl.acm.org/citation.cfm?id=3080777)|-|\n",
+ "|Wide and deep|Combination of linear model (wide part) and deep neural network model (deep part) for memorisation and generalization|[\\[8\\]](https://dl.acm.org/citation.cfm?id=2988454)|[notebook](https://github.com/microsoft/recommenders/blob/master/notebooks/00_quick_start/wide_deep_movielens.ipynb) / [utilities](https://github.com/microsoft/recommenders/tree/master/reco_utils/recommender/wide_deep)|"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2 Factorization Machine Implementation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.1 Implementations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following table summarizes the implementations of FM/FFM. Some of them (e.g., xDeepFM and VW) are implemented and/or demonstrated in the microsoft/recommenders repository"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "|Implementation|Language|Notes|Examples in Microsoft/Recommenders|\n",
+ "|-----------------|------------------|------------------|---------------------|\n",
+ "|[libfm](https://github.com/srendle/libfm)|C++|Implementation of FM algorithm|-|\n",
+ "|[libffm](https://github.com/ycjuan/libffm)|C++|Original implemenation of FFM algorithm. It is handy in model building, but does not support Python interface|-|\n",
+ "|[xlearn](https://github.com/aksnzhy/xlearn)|C++ with Python interface|More computationally efficient compared to libffm without loss of modeling effectiveness|[notebook](https://github.com/microsoft/recommenders/blob/master/notebooks/02_model/fm_deep_dive.ipynb)|\n",
+ "|[Vowpal Wabbit FM](https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Matrix-factorization-example)|Online library with estimator API|Easy to use by calling API|[notebook](https://github.com/microsoft/recommenders/blob/master/notebooks/02_model/vowpal_wabbit_deep_dive.ipynb) / [utilities](https://github.com/microsoft/recommenders/tree/master/reco_utils/recommender/vowpal_wabbit)\n",
+ "|[microsoft/recommenders xDeepFM](https://github.com/microsoft/recommenders/blob/master/reco_utils/recommender/deeprec/models/xDeepFM.py)|Python|Support flexible interface with different configurations of FM and FM extensions, i.e., LR, FM, and/or CIN|[notebook](https://github.com/microsoft/recommenders/blob/master/notebooks/00_quick_start/xdeepfm_criteo.ipynb) / [utilities](https://github.com/microsoft/recommenders/blob/master/reco_utils/recommender/deeprec/models/xDeepFM.py)|"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Other than `libfm` and `libffm`, all the other three can be used in a Python environment. \n",
+ "\n",
+ "* A deep dive of using Vowbal Wabbit for FM model can be found [here](https://github.com/microsoft/recommenders/blob/master/notebooks/02_model/vowpal_wabbit_deep_dive.ipynb)\n",
+ "* A quick start of Microsoft xDeepFM algorithm can be found [here](https://github.com/microsoft/recommenders/blob/master/notebooks/00_quick_start/xdeepfm_criteo.ipynb). \n",
+ "\n",
+ "Therefore, in the example below, only code examples and best practices of using `xlearn` are presented."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.2 xlearn"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Setups for using `xlearn`.\n",
+ "\n",
+ "1. `xlearn` is implemented in C++ and has Python bindings, so it can be directly installed as a Python package from PyPI. The installation of `xlearn` is enabled in the [Recommenders repo environment setup script](https://github.com/microsoft/recommenders/blob/master/scripts/generate_conda_file.py). One can follow the general setup steps to install the environment as required, in which `xlearn` is installed as well.\n",
+ "2. NOTE `xlearn` may require some base libraries installed as prerequisites in the system, e.g., `cmake`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After a succesful creation of the environment, one can load the packages to run `xlearn` in a Jupyter notebook or Python script."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n",
+ "[GCC 7.3.0]\n",
+ "Xlearn version: 0.4.0\n"
+ ]
+ }
+ ],
+ "source": [
+ "import time\n",
+ "import sys\n",
+ "sys.path.append(\"../../\")\n",
+ "import os\n",
+ "import papermill as pm\n",
+ "from tempfile import TemporaryDirectory\n",
+ "import xlearn as xl\n",
+ "from sklearn.metrics import roc_auc_score\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import seaborn as sns\n",
+ "%matplotlib notebook\n",
+ "from matplotlib import pyplot as plt\n",
+ "\n",
+ "from reco_utils.common.constants import SEED\n",
+ "from reco_utils.common.timer import Timer\n",
+ "from reco_utils.recommender.deeprec.deeprec_utils import (\n",
+ " download_deeprec_resources, prepare_hparams\n",
+ ")\n",
+ "from reco_utils.recommender.deeprec.models.xDeepFM import XDeepFMModel\n",
+ "from reco_utils.recommender.deeprec.IO.iterator import FFMTextIterator\n",
+ "from reco_utils.tuning.parameter_sweep import generate_param_grid\n",
+ "from reco_utils.dataset.pandas_df_utils import LibffmConverter\n",
+ "\n",
+ "print(\"System version: {}\".format(sys.version))\n",
+ "print(\"Xlearn version: {}\".format(xl.__version__))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the FM model building, data is usually represented in the libsvm data format. That is, `label feat1:val1 feat2:val2 ...`, where `label` is the target to predict, and `val` is the value to each feature `feat`.\n",
+ "\n",
+ "FFM algorithm requires data to be represented in the libffm format, where each vector is split into several fields with categorical/numerical features inside. That is, `label field1:feat1:val1 field2:feat2:val2 ...`."
+ ]
+ },
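+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For instance, a record with label 1 and three active features could be written as `1 3:1 7:0.5 10:1` in the libsvm format, and as `1 1:3:1 2:7:0.5 3:10:1` in the libffm format, where the leading index of each triple indicates the field that the feature belongs to (the indices here are made up purely for illustration)."
+ ]
+ },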
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the Microsoft/Recommenders utility functions, [a libffm converter](https://github.com/microsoft/recommenders/blob/290dd920d4a6a4d3bff71dd9ee7273be0c02dbbc/reco_utils/dataset/pandas_df_utils.py#L86) is provided to achieve the transformation from a tabular feature vectors to the corresponding libffm representation. For example, the following shows how to transform the format of a synthesized data by using the module of `LibffmConverter`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " rating \n",
+ " field1 \n",
+ " field2 \n",
+ " field3 \n",
+ " field4 \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " 1 \n",
+ " 1:1:1 \n",
+ " 2:4:3 \n",
+ " 3:5:1.0 \n",
+ " 4:6:1 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " 0 \n",
+ " 1:2:1 \n",
+ " 2:4:4 \n",
+ " 3:5:2.0 \n",
+ " 4:7:1 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " 0 \n",
+ " 1:3:1 \n",
+ " 2:4:5 \n",
+ " 3:5:3.0 \n",
+ " 4:8:1 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " 1 \n",
+ " 1:3:1 \n",
+ " 2:4:6 \n",
+ " 3:5:4.0 \n",
+ " 4:9:1 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " 1 \n",
+ " 1:3:1 \n",
+ " 2:4:7 \n",
+ " 3:5:5.0 \n",
+ " 4:10:1 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " rating field1 field2 field3 field4\n",
+ "0 1 1:1:1 2:4:3 3:5:1.0 4:6:1\n",
+ "1 0 1:2:1 2:4:4 3:5:2.0 4:7:1\n",
+ "2 0 1:3:1 2:4:5 3:5:3.0 4:8:1\n",
+ "3 1 1:3:1 2:4:6 3:5:4.0 4:9:1\n",
+ "4 1 1:3:1 2:4:7 3:5:5.0 4:10:1"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_feature_original = pd.DataFrame({\n",
+ " 'rating': [1, 0, 0, 1, 1],\n",
+ " 'field1': ['xxx1', 'xxx2', 'xxx4', 'xxx4', 'xxx4'],\n",
+ " 'field2': [3, 4, 5, 6, 7],\n",
+ " 'field3': [1.0, 2.0, 3.0, 4.0, 5.0],\n",
+ " 'field4': ['1', '2', '3', '4', '5']\n",
+ "})\n",
+ "\n",
+ "converter = LibffmConverter().fit(df_feature_original, col_rating='rating')\n",
+ "df_out = converter.transform(df_feature_original)\n",
+ "df_out"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "There are in total 4 fields and 10 features.\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('There are in total {0} fields and {1} features.'.format(converter.field_count, converter.feature_count))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To illustrate the use of `xlearn`, the following example uses the [Criteo data set](https://labs.criteo.com/category/dataset/), which has already been processed in the libffm format, for building and evaluating a FFM model built by using `xlearn`. Sometimes, it is important to know the total numbers of fields and features. When building a FFM model, `xlearn` can count these numbers automatically."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "tags": [
+ "parameters"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Parameters\n",
+ "YAML_FILE_NAME = \"xDeepFM.yaml\"\n",
+ "TRAIN_FILE_NAME = \"cretio_tiny_train\"\n",
+ "VALID_FILE_NAME = \"cretio_tiny_valid\"\n",
+ "TEST_FILE_NAME = \"cretio_tiny_test\"\n",
+ "MODEL_FILE_NAME = \"model.out\"\n",
+ "OUTPUT_FILE_NAME = \"output.txt\"\n",
+ "\n",
+ "LEARNING_RATE = 0.2\n",
+ "LAMBDA = 0.002\n",
+ "# The metrics for binary classification options are \"acc\", \"prec\", \"f1\" and \"auc\"\n",
+ "# for regression, options are \"rmse\", \"mae\", \"mape\"\n",
+ "METRIC = \"auc\" \n",
+ "EPOCH = 10\n",
+ "OPT_METHOD = \"sgd\" # options are \"sgd\", \"adagrad\" and \"ftrl\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 10.3k/10.3k [00:01<00:00, 8.67kKB/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "tmpdir = TemporaryDirectory()\n",
+ "\n",
+ "data_path = tmpdir.name\n",
+ "yaml_file = os.path.join(data_path, YAML_FILE_NAME)\n",
+ "train_file = os.path.join(data_path, TRAIN_FILE_NAME)\n",
+ "valid_file = os.path.join(data_path, VALID_FILE_NAME)\n",
+ "test_file = os.path.join(data_path, TEST_FILE_NAME)\n",
+ "model_file = os.path.join(data_path, MODEL_FILE_NAME)\n",
+ "output_file = os.path.join(data_path, OUTPUT_FILE_NAME)\n",
+ "\n",
+ "if not os.path.exists(yaml_file):\n",
+ " download_deeprec_resources(r'https://recodatasets.blob.core.windows.net/deeprec/', data_path, 'xdeepfmresources.zip')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following steps are from the [official documentation of `xlearn`](https://xlearn-doc.readthedocs.io/en/latest/index.html) for building a model. To begin with, we do not modify any training parameter values. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "NOTE, if `xlearn` is run through command line, the training process can be displayed in the console."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Training task\n",
+ "ffm_model = xl.create_ffm() # Use field-aware factorization machine (ffm)\n",
+ "ffm_model.setTrain(train_file) # Set the path of training dataset\n",
+ "ffm_model.setValidate(valid_file) # Set the path of validation dataset\n",
+ "\n",
+ "# Parameters:\n",
+ "# 0. task: binary classification\n",
+ "# 1. learning rate: 0.2\n",
+ "# 2. regular lambda: 0.002\n",
+ "# 3. evaluation metric: auc\n",
+ "# 4. number of epochs: 10\n",
+ "# 5. optimization method: sgd\n",
+ "param = {\"task\":\"binary\", \n",
+ " \"lr\": LEARNING_RATE, \n",
+ " \"lambda\": LAMBDA, \n",
+ " \"metric\": METRIC,\n",
+ " \"epoch\": EPOCH,\n",
+ " \"opt\": OPT_METHOD\n",
+ " }\n",
+ "\n",
+ "# Start to train\n",
+ "# The trained model will be stored in model.out\n",
+ "with Timer() as time_train:\n",
+ " ffm_model.fit(param, model_file)\n",
+ "\n",
+ "# Prediction task\n",
+ "ffm_model.setTest(test_file) # Set the path of test dataset\n",
+ "ffm_model.setSigmoid() # Convert output to 0-1\n",
+ "\n",
+ "# Start to predict\n",
+ "# The output result will be stored in output.txt\n",
+ "with Timer() as time_predict:\n",
+ " ffm_model.predict(model_file, output_file)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The output are the predicted labels (i.e., 1 or 0) for the testing data set. AUC score is calculated to evaluate the model performance."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(output_file) as f:\n",
+ " predictions = f.readlines()\n",
+ "\n",
+ "with open(test_file) as f:\n",
+ " truths = f.readlines()\n",
+ "\n",
+ "truths = np.array([float(truth.split(' ')[0]) for truth in truths])\n",
+ "predictions = np.array([float(prediction.strip('')) for prediction in predictions])\n",
+ "\n",
+ "auc_score = roc_auc_score(truths, predictions)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.7498803439718372"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "auc_score"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/papermill.record+json": {
+ "auc_score": 0.7498803439718372
+ }
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "pm.record('auc_score', auc_score)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training takes 10.77s and predicting takes 0.93s.\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Training takes {0:.2f}s and predicting takes {1:.2f}s.'.format(time_train.interval, time_predict.interval))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It can be seen that the model building/scoring process is fast and the model performance is good. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.3 Hyperparameter tuning of `xlearn`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following presents a naive approach to tune the parameters of `xlearn`, which is using grid-search of parameter values to find the optimal combinations. It is worth noting that the original [FFM paper](https://www.csie.ntu.edu.tw/~cjlin/papers/ffm.pdf) gave some hints in terms of the impact of parameters on the sampled Criteo dataset. \n",
+ "\n",
+ "The following are the parameters that can be tuned in the `xlearn` implementation of FM/FFM algorithm."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "|Parameter|Description|Default value|Notes|\n",
+ "|-------------|-----------------|------------------|-----------------|\n",
+ "|`lr`|Learning rate|0.2|Higher learning rate helps fit a model more efficiently but may also result in overfitting.|\n",
+ "|`lambda`|Regularization parameter|0.00002|The value needs to be selected empirically to avoid overfitting.|\n",
+ "|`k`|Dimensionality of the latent factors|4|In FFM the effect of k is not that significant as the algorithm itself considers field where `k` can be small to capture the effect of features within each of the fields.|\n",
+ "|`init`|Model initialization|0.66|-|\n",
+ "|`epoch`|Number of epochs|10|Using a larger epoch size will help converge the model to its optimal point|"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "param_dict = {\n",
+ " \"lr\": [0.0001, 0.001, 0.01],\n",
+ " \"lambda\": [0.001, 0.01, 0.1]\n",
+ "}\n",
+ "\n",
+ "param_grid = generate_param_grid(param_dict)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "auc_scores = []\n",
+ "\n",
+ "with Timer() as time_tune:\n",
+ " for param in param_grid:\n",
+ " ffm_model = xl.create_ffm() \n",
+ " ffm_model.setTrain(train_file) \n",
+ " ffm_model.setValidate(valid_file)\n",
+ " ffm_model.fit(param, model_file)\n",
+ "\n",
+ " ffm_model.setTest(test_file) \n",
+ " ffm_model.setSigmoid() \n",
+ " ffm_model.predict(model_file, output_file)\n",
+ "\n",
+ " with open(output_file) as f:\n",
+ " predictions = f.readlines()\n",
+ "\n",
+ " with open(test_file) as f:\n",
+ " truths = f.readlines()\n",
+ "\n",
+ " truths = np.array([float(truth.split(' ')[0]) for truth in truths])\n",
+ " predictions = np.array([float(prediction.strip('')) for prediction in predictions])\n",
+ "\n",
+ " auc_scores.append(roc_auc_score(truths, predictions))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Tuning by grid search takes 4.6 min\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Tuning by grid search takes {0:.2} min'.format(time_tune.interval / 60))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " Lambda \n",
+ " 0.001 \n",
+ " 0.010 \n",
+ " 0.100 \n",
+ " \n",
+ " \n",
+ " LR \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0.0001 \n",
+ " 0.5482 \n",
+ " 0.6122 \n",
+ " 0.7210 \n",
+ " \n",
+ " \n",
+ " 0.0010 \n",
+ " 0.5456 \n",
+ " 0.6101 \n",
+ " 0.7246 \n",
+ " \n",
+ " \n",
+ " 0.0100 \n",
+ " 0.5406 \n",
+ " 0.6147 \n",
+ " 0.7238 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Lambda 0.001 0.010 0.100\n",
+ "LR \n",
+ "0.0001 0.5482 0.6122 0.7210\n",
+ "0.0010 0.5456 0.6101 0.7246\n",
+ "0.0100 0.5406 0.6147 0.7238"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "auc_scores = [float('%.4f' % x) for x in auc_scores]\n",
+ "auc_scores_array = np.reshape(auc_scores, (len(param_dict[\"lr\"]), len(param_dict[\"lambda\"]))) \n",
+ "\n",
+ "auc_df = pd.DataFrame(\n",
+ " data=auc_scores_array, \n",
+ " index=pd.Index(param_dict[\"lr\"], name=\"LR\"), \n",
+ " columns=pd.Index(param_dict[\"lambda\"], name=\"Lambda\")\n",
+ ")\n",
+ "auc_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAELCAYAAADawD2zAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3dd3gVVeLG8e9JCCC9Ewi95IKKCgioKEWQIkXFhiIqoqCiP5qIoiIqNlxcG7jKiouuAq5KUzoCKyqCoFK99JIQSOgEFkg5vz9mCAkpBMIkwLyf55mHmTNnzpm5Q95Mzp0711hrERGRi19IXu+AiIjkDgW+iIhPKPBFRHxCgS8i4hMKfBERn1Dgi4j4hAL/7LUDgsAG4JkM1j8IxAF/uNPDp6wvBkQDH6QquwdYCawAZgJl3PK3gL/c8klAiXNxAD53uvMHcBewBlgNfJmqfCawH/julPpfuG2uAsYCYW55N5xztwL4Gbgy57svJwQCgXaBQCAYCAQ2BAKBdOcyEAj8PRAI/OFO6wKBwH63/KpAIPBLIBBYHQgEVgQCgbtTbfOE254NBAJlTm3zQqXAPzuhwCigPXApTlBfmkG9icBV7vTPU9a9AixMtZwPeBdoCVyBEw5PuOvmAJe75euAZ8/FQfhYds5fbZzXuSlwGdAv1bq3gO4ZtPsFUAeoB1zCyV/ym4HmOOfvFeDjc3EQAoFAIN25DAQCac5lMBjsHwwGrwoGg1cB7wPfuquOAPcHg8HLcC4A3gkEAicupn4CWgNbc+Ewco0C/+w0xrky3AQcByYAt5zB9g2B8sDsVGXGnQq7/xYDdrjrZgOJ7vxioNLZ7rgA2Tt/j+AEyT53OTbVunnAoQzanQ5Yd1rCyfP0c6p2dP7OrcbAhmAwuCkYDGbnZ/EeYDxAMBhcFwwG17vzO3DOcVl3+fdgMLjFyx3PCwr8sxMBbE+1HOWWnep2nCv1r4HKblkIMBIYdErdBOAxnCGdHThXK59k0OZDwIyz3XEBsnf+It3pJ5yQbncG7Yfh/AUwM4N1PdH5O5ey+7NIIBCoClQHfshgXWMgP7DRg308b5jcfrSCMeZja22vTNb1AnoBXJK/bMMCYcVydd+y65bb2nNjqxvo+8QQAO7ueisNrr6CwU+9nFKnZKkSHI4/wvHjx+nR8x5u7XIzt3ToziO9u3PJJQV5750x3NOtC/Ub1OPpgS+RL18+vp48ln5PPs+WzdsYMfJFdu2KY+SI0SltDhz0GFc1qEf3ex7P9WM+E38rcU1e70KWqnVoTKUWV7BokDPKVuv2ppS5qiaLX/gspc5N/xpIcmISPzz6PoUrlKLjty/wbatnOH7wCADh19alXu+bmfPgyHTtXz+iJwlHjvHrsH+nKa9wXV2ue/VBvrvtFY7tj/fwCHOm++LBeb0L2TZr4c/8vPRPXnrqMQCmzVnIyr82MOTJnunqfjJ+Ert27023Lm7PPh4a8CLDBz/BlZdGplnX9t7HmPDhm5Qsfn5mUUbyV6pnMluXz4sOjTGlMlsF3JzZdtbaj3HHN0sWqXXePuRnR/ROIipVSFmuGBHOzpjYNHX27d2fMj/u04kMe/lpABo1voprr2tEz0e6UbhIIcLC8nM4/ghTpzgXg1s2bwNg8rfT6Tegd0obXe+9jTbtbuTWjhkNHcuZOBKzl8IVTv4XLRReiiM796WpczhmL3HLN2ATk4jfHseBjTEUqx7O7j83Zdl2/f63UbBUURYNHpumvGTdylw/4mFmdX/rvA77C035MqXZGbc7ZXlX3B7KlS6ZYd2ZC37iuf9Le+9E/OEj9BnyGk881DVd2F+MvBrSiQN+A5almn5zp3Ie9Zlrli9bQc2aValStRJhYWF0uaMDM6bPS1OnfPmyKfPtO7QiGHT+UuzVcyD16jbjysta8MKQN5g4fhIvvfgWMTt2EahTi9JlnCBqcWPTlG1atW5G3wG9uffu3vzvf0dz6SgvXnF/bqJY9XCKVC5LSFgoNW65hm1zlqeps3XWMipc57z3V6BkEYrVCOfQ1tiMmksReU8LIprXY/4ToyDVX86FK5am9Zh+LOz7Dw5u3nnuD8jHLq9Ti63RMUTF7CIhIYEZ83+ixXWN0tXbvD2ag4cOc+WlgZSyhIQE+r04gk5tmtO2+XW5udt5xpMrfJw3w1pZa7edusIYsz2D+heUpKQknh74Et9M/pTQ0FC++Pw//LV2Pc8+35c/lq9ixvR59H7sAdp1aEVSYiL79h2gz6NPZ9nmzp2xjHj9fb6f9SWJCYls37aDx91tRox8kQIF8jNp6r8A+G3pHwzoO9Trw7xo2aRkfnlhHO2+eBoTEsK6iQvZvy6aBk/dzu4/N7NtznKiF6ygUrN6dPnhTWxyMkuHj0+5Mu/wzQsUr1WBsMIF6br0PX58agzRC1fS9PUexEftptOUYQBsmbGUP96ZTP3+t1GgRBGue+1BAJITk5jaQefvXMgXGsqQJx/m0cHDSUpO5rb2N1KrWmU++HQClwVq0tIN/xk/LKJdy6YYc3K0Y+aCX1i2Yi37D8YzZdYCAIY/3Yc6tarzxbffM3biFPbs3c/tjwzkhsYNUoaNLmSejOEbY/oAi6y1f2aw7klr7funa+N8HtKRrJ3vY/iStQtpDF/Sy/UxfGvtqCzWnTbsRUTk3PNqSAdjTB2c+2EjcO5L3gFMtdau9apPERHJnCdv2hpjBuN8AMLgfABlqTs/3hiT2cfYRUTEQ15d4fcELrPWJqQuNMa8jfNckjc86ldERDLh1W2ZyUDFDMoruOtERCSXeXWF3w+YZ4xZz8mPPVcBanHygWAiIpKLvLpLZ6YxJhLnwUYROOP3UcBSa22SF32KiEjWPLtLh5NPDbQ4wzgn/hURkTzg1bN02gCjgfU4X/IBziNhaxljHrfWzs50YxER8YRXV/jvAq2ttVtSFxpjquM8M7yuR/2KiEgmvLpLJx/OmP2pojn5tW8iIpKLvLrCHwssNcZM4ORdOpWBrmT8pR4iIuIxr+7Sed0YMwXoDFzLybt0ullr13jRp4iIZM2zu3TcYF/jfhmKtdbuO902IiLiHa+epVPFGDPBGBML/AosMcbEumXVvOhTRESy5tWbthOBSUAFa21ta21tnMcqTMZ5qJqIiOQyrwK/jLV2YupP1Vprk6y1E4DSHvUpIiJZ8GoMf5kxZjQwjrR36TwA/O5RnyIikgWvAv9+nEckv8TJZ+lsB6ah2zJFRPKEV7dlHgc+dCcRETkPeDWGnyljTMfc7lNERPIg8IFGedCniIjv5cWXmL/oVZ8iIpI5fYm5iIhP6EvMRUR8Ql9iLiLiE/oScxERn9CXmIuI+ISXj0dOBhZ71b6IiJyZvLgPX0RE8oACX0TEJxT4IiI+ocAXEfEJBb6IiE8o8EVEfEKBLyLiEwp8ERGfUOCLiPiEAl9ExCcU+CIiPqHAFxHxCQW+iIhPKPBFRHxCgS8i4hMKfBERn1Dgi4j4hAJfRMQnFPgiIj6hwBcR8QnPvsQ8p44nJ+b
1LshZSjB5vQeSE6Zg4bzeBfGIrvBFRHxCgS8i4hMKfBERn1Dgi4j4hAJfRMQnFPgiIj6hwBcR8QkFvoiITyjwRUR8QoEvIuITCnwREZ9Q4IuI+IQCX0TEJxT4IiI+ocAXEfEJBb6IiE8o8EVEfEKBLyLiEwp8ERGfUOCLiPiEAl9ExCcU+CIiPqHAFxHxCQW+iIhPKPBFRHxCgS8i4hMKfBERn1Dgi4j4hAJfRMQnFPgiIj6hwBcR8QkFvoiITyjwRUR8QoEvIuITCnwREZ9Q4IuI+IQCX0TEJxT4IiI+ocAXEfEJBb6IiE8o8EVEfEKBLyLiEwp8ERGfUOCLiPiEAl9ExCcU+CIiPqHAFxHxCQW+iIhPKPBFRHxCgS8i4hP58noHLlQ33dScEW8NJTQ0lHH/msjIkR+mWX/ffXcw/NVniYnZBcA//jGOcf+amLK+aNEiLP99LlOnzmLggBcBmDFzAuHhZTl69BgAnTt1Jy5uDwBdunRgyHP9sNayauVaevTomxuHedGq3OIKmg7rjgkNYe34Bfwxelq6OjU7NqFh/y5gLXvWbmPek6MBuPnzpylfvyY7l65jRo+RKfWLVi5L61F9KFiiCHGrtvBD3w9JTkiiQpMA173YndJ1KzO3zwdsmr40147zYrRo8W+88c4/SEpO5vZO7Xi4+11p1r/57kcsWb4CgKPHjrF3335+mfU1f63byCt/+4D4w0cICQ2h1/1dad+6eZptX3t7NJOmz2Hp3EkpZTPn/ZfRY/+NwRCoXYMRwwZ7f5AeOW3gG2NCgZLW2t3ucn7gQaC/tbaut7t3fgoJCeHtv79Mp473ER29kx9/nMr338/hr782pKn3zTffpYT5qYYOHciiH39NV/7QQ/34ffnKNGU1a1bjqUGP07rV7ezff5CyZUufu4PxIRNiuH74A3x37xscjtlLl+9eZuucZexbvyOlTvFq5anfpxOTu7zE8QNHKFi6WMq6P//xPfkuyc+l3W5M0+41z3ZlxT9nsnHqYm54rQd1urZgzefziI/ew/wBH3Fl75tz7RgvVklJSQwfOYox77xGeLky3P1wX1pe34Sa1aum1Bnct3fK/Bf/mcLa9RsBKFiwAK+98BRVK0cQG7eHu3o+SdMmDSlWtAgAq9au42D84TT9bd0ezT8/n8jnH46keLGi7Nm3PxeO0jtZDukYY7oCe4EVxpiFxpiWwCagPdAtF/bvvHT11VexaeNWtmzZTkJCAl9/PY2OHdtke/ur6l9O2XJlmDfvx2zV79GjKx999Bn79x8ESLnql7NT7qqaHNyyi0Pb4khOSGLj1MVUa9MwTZ2697Zk1bi5HD9wBICjew6mrIv+aTUJ8UfTtVux6aVs+n4JAOu+/pHqbZ02D0XtZu9f28Farw7JN1auXUeVShWpHFGBsLAw2rdqzg8/Ls60/vS5C7m5dQsAqlWpRNXKEQCUK1uaUiVLsG//AcD5RTJy1CcMfLxnmu2/njqTrl06UbxYUQBKlyzhwVHlntON4T8PNLTWVgT6AzOBJ621t1lrl59Nh8aYHmez3fmkYsXyREWfvBqMjo6hQsXy6erdemt7fv11Bv/+YjQRERUAMMbw+uvP89yQ1zJs+6N/vMUvi6cz+JknU8pq1a5B7VrVmTvva+YvmMRNNzXPcFvJnsLhJYnfsTdlOT5mL4XDS6apU7xGOCVqhHPrt0O5bcowKre4Iss2C5YswvGDR7BJyZm2KTkXG7eb8HJlU5bLlytDbCYXQDt27iI6ZidNGl6Zbt3KNUESEhKp7P5cfvnNNFpefw1ly5RKU2/r9mi2bo/mvkcHcu8j/Vi0+LdzeDS573SBf9xauwHADfjN1tpJp9nmdF7KbIUxppcx5jdjzG+JiYdy2I13jDHpyuwpV2/Tp8+lbp3radKkPfPn/8SYMc5Yb6/e3Zk9az7R0THp2njoob40btyOm1rfSdPrGnHvvV0AyJcvlJq1qtOubVcefOBJRo1+g+LFi6XbXrIpw/OXdjkkNJTi1cOZeterzH1iFM1HPEz+YoVy1KbkXEavaQYvPQAz5i6kTYvrCQ0NTVMet3svz778FsOH9CckJITYuD3Mnv8j997ROV0biUlJbI2K5tMP3mTES8/w4hvvcPBQ/Lk4lDxxujH8csaYAamWi6Retta+ndFGxpgVmbRngPSXwifb+xj4GKBwoWrn7Y9LdPROKkVUTFmOiKjAzpjYNHX27j051vfp2PG88orzRk+Txg24rmkjHunVncKFC5E/fxiH448wdOibxOxw3uCNjz/MV19NpeHVV/Lll98SHb2TpUt+JzExka1bo1i/bhM1a1Vj+bLMXmbJyuGYvRSpePJKrkiFUhzZtS9NnfiYvcT+voHkxCQObY9j/8YYilcPJ+7PTRm2eXTvIfIXK4QJDcEmJWfYpuRc+XJl2Bkbl7K8K3Y3Zctk/J7WjLkLeW5gnzRl8YcP8/igoTzZ6wGuvNx5C3Lt+o1si4rh5rsfAuDo0WO0v+shZnw1lvJly3DlZXUIy5ePShXDqValElujoqlXN+DREXrrdFf4Y4CiqabUy0Wy2K48cD/QKYPpgh+AXrbsT2rWqkbVqpUICwvjjjs68f33c9LUCQ8/+Wdnh443EQw6bxw99FA/6gSacmnd63luyGt8+eW3DB36JqGhoZQu7QwB5MuXj3btb2TNmnUAfDdtNs2aXQtA6dIlqVW7Ols2b8uNQ70oxf65ieLVwilauSwhYaHU7HwNW+akHaHcMnsZFa+9FHCGa0rUCOfg1tiMmkux4+c11OjQGIDIO25gy+yzGvWULFxeJ5JtUTuI2rGThIQEZsxbSMvrr0lXb/PWKA4eiueqy0/eV5KQkEDfZ1+hc7tWtL3xhpTy5tc1ZuG0L5n9zThmfzOOggULMOOrsQC0anYtS5b/CcC+/QfYsj2ayhUreHyU3snyCt9am9XwS78sNv0OKGKt/SOD7RZke+/OU0lJSQwcMJQpUz8jNDSUzz77irVr1/P8C/1Zvnwl07+fy2OP9eDmDq1JSkxi77799O71VJZtFiiQnylTPyMsXz5CQkNZMP8nPh07HoA5cxbSqtUN/LZsDslJSTw35PU0f0HImbFJySx6YRwd/v00JjSE4MSF7FsXzdUDbyduxWa2zlnO9gUrqNSsHnfNexObnMwvr47n2H7nT/lbvnmBEjUrEFa4IPcteY8Fg8YQtXAli1+fwE2jnqDxoDvZvWoLaycsAKDslTVoO6YfBYoXomrr+lw94Ha+av1MHr4CF658+UIZ0v8xeg94nqSkJG7r2IZaNarywZjPuKxOJC1vcMJ/+twFtG/dPM3w68wffmTZH6vYf+AQk6fPBeDV5wZQJ7Jmpv01bdKQn5csp3O3XoSGhDKwT09KXMDDqebUsedsb2jMNmttlXO8PynO5yEdydrI0tfn9S5IDvT8/eW83gXJgbAyNTJ5VyNnH7zKtFEA4/xqbQxEABbYASyxZ/sbRkREciQngZ9pcBtj2gCjgfVAtFtcCahljHncWjs7B/2KiMhZyDLwjTGHyDjYDXBJFp
u+C7S21m45pb3qwHTAl5/QFRHJS6d707ZoDtqNyqA8Ggg7yzZFRCQHvHp42lhgqTFmArDdLasMdAU+8ahPERHJgieBb6193RgzBegMXIszBBQFdLPWrvGiTxERyZpnj0d2g32NMaaUs2j1sUMRkTzkyRegGGOqGGMmGGNigV+BJcaYWLesmhd9iohI1rz6xquJwCSggrW2trW2NlABmAxM8KhPERHJgleBX8ZaO9Fam3SiwFqbZK2dAOjbO0RE8oBXY/jLjDGjgXGkvUvnAeB3j/oUEZEseBX49wM9cZ59H4Fzl852YBq6LVNEJE94dVvmceBDdxIRkfOAV2P4mTLGdMztPkVEJA8CH2iUB32KiPieZx+8MsbUAW4h7eORp1prX/SqTxERyZxXH7wajHO/vQGWAEvd+fHGGH3Vj4hIHvDqCr8ncJm1NiF1oTHmbWA18IZH/YqISCa8GsNPBipmUF7BXSciIrnMqyv8fsA8Y8x6Tn7wqgpQC3jCoz5FRCQLXt2HP9MYE8nJ77Q98XjkpakftyAiIrnHy8cjJwOLvWpfRETOTF7chy8iInlAgS8i4hMKfBERn1Dgi4j4hAJfRMQnFPgiIj6hwBcR8QkFvoiITyjwRUR8QoEvIuITCnwREZ9Q4IuI+IQCX0TEJxT4IiI+ocAXEfEJBb6IiE8o8EVEfEKBLyLiEwp8ERGfUOCLiPiEAl9ExCcU+CIiPqHAFxHxCQW+iIhPKPBFRHxCgS8i4hMKfBERn8iX1zuQmWOJCXm9C3KWiiTl9R5IjiTrBF6sdIUvIuITCnwREZ9Q4IuI+IQCX0TEJxT4IiI+ocAXEfEJBb6IiE8o8EVEfEKBLyLiEwp8ERGfUOCLiPiEAl9ExCcU+CIiPqHAFxHxCQW+iIhPKPBFRHxCgS8i4hMKfBERn1Dgi4j4hAJfRMQnFPgiIj6hwBcR8QkFvoiITyjwRUR8QoEvIuITCnwREZ9Q4IuI+IQCX0TEJxT4IiI+ocAXEfEJBb6IiE8o8EVEfEKBLyLiEwp8ERGfUOCLiPiEAl9ExCcU+CIiPqHAFxHxCQW+iIhPKPBFRHxCgS8i4hMKfBERn1Dgi4j4hAJfRMQnFPgiIj6hwBcR8QkFvoiITyjwRUR8QoEvIuITCnwREZ/Il9c7cKFq26YFb7/9MqEhIYz9dDwj3hqVZv393e/izTeeJ3rHTgBGj/6UsZ+OT1lftGgRVq1YwOQpM+nb73kAGtSvxyef/J1LChZkxswf6D9gaEr9Po/34PHHe5CYmMiMGfN45tlXc+EoL14VWlxBo1e6Y0JC2DB+Aas/mJauTpVOTbhiYBewln1rtvFTn9EA3PjF05RpUJPYJetY8MDIdNtdPfx+at7djIm1Hwag4bBulG96KQD5CuanYJlifFW3t4dHd3Fb9Osy3nj3Y5KSk7m9Yxsevu/ONOvffG8MS35fAcDRo8fYu/8Av8yYyF/rN/HKyFHEH/4fISEh9Lr/Ltq3agbAC2+8y+q/1mMtVKtckVeH9KdQoUuI2RXLkFf/zqH4wyQlJdP/0Qdodm2jXD/mc0WBfxZCQkJ4791XaXfzPURFxbD4l+lM+242a9euT1Pvq/9MTQnzU700bBD//XFxmrJRH7zOY48NZvGvy/hu6ue0a9uSmbPm06L5dXTu1Jb6DVpz/PhxypYt7dmx+YEJMTR+7QHmdX2DIzF7aT/9ZaJmLePA+h0pdYpWL8/lT3Zi9i0vcfzAEQqULpaybs2H3xN6SX5q33djurZLXVGd/MUKpSlbNuyLlPnAQzdR8vJq5/6gfCIpKYnhb3/ImL8PJ7xsae5+pD8tmzahZvUqKXUG/98jKfNffD2Ntes3AlCwQAFee24AVStHELt7D3f17EfTxg0oVrQIg598hCKFnfM24v0xfPntdzx83518NG4ibVveQNfbbmbj5m089vQwZv/nwg18DemchcaN6rNx4xY2b95GQkICX301hc6d2mZ7+wb161G+fFnmzPlvSll4eDmKFivK4l+XAfD5F1/TuXM7AHr3vp8Rb43i+PHjAMTF7TmHR+M/pevX5NCWXcRviyM5IYktUxZTqW3DNHVqdWvJun/N5fiBIwAc23MwZd3ORatJjD+arl0TYmjwwj38PnxCpn1Xu/Vatkz+5Rwdif+sXLuOKhEVqFwxnLCwMNq3asYPixZnWn/6vIXc3Lo5ANWqRFC1cgQA5cqUplTJ4uzbfwAgJeyttRw9dhxjDADGGA4fcf4PHDp8mLJlSnl2bLkh1wPfGFMnt/s81ypGhLM96uTVYFR0DBUrhqer1+W2m1m+bA4TJ3xMpUoVAec/0FsjhjL4meFp6kZUDCc6KiZlOToqhgi3zdq1a3D99Y35edE0fpj7NVc3vNKLw/KNQuElObJjb8rykZi9FKpQMk2dYjXCKVojnDZThtJ22jAqtLjitO1G9mhD1Ozl/C92f4brC0eUpkjlcuxatDpnB+BjsXF7CC9XNmW5fNkyxO7O+AJox85YonfsokmD9Odu5ZogCYmJVI6okFL2/Gvv0PyW7mzeFsW9t3cE4PEe9/Ld7Pm06vIAjw8axpB+j57jI8pl1tpcnYBtWazrBfzmTr1ye9/OYLrTWvvPVMvdrbXvn1KntLW2gDv/qLX2B3f+CWvt0+7xPWit/cAtb2StnZtq+xustdPc+VXW2vestcZa29hau9mdz+vX4UKdsnP+vrPWTrLWhllrq1tro6y1JU6sf+qpp/7m1jlRv6K1dpG1Np+7HJ9Bv4Mz6EfTGUyRkZF3RkZG/jPVcvfIyMgMX9PIyMjBGa0rWLDgoMjIyGBkZOQ1GWwTGhkZOToyMrKHuzwgMjJyoDt/bWRk5JrIyMiQvH4dznbyZAzfGPNeZquAEpltZ639GPjYi306x6KAyqmWKwE7TqmT+rJjDPCmO38tcENUVFR54BCQH4gH3nXbyajNKOBbwAJLgGSgDBCX0wPxqeycvyhgMZAAbAaCQG1gKcDatWs7AanftKkP1AI2uMuF3Plaqep0BfqckyPwr+ycuxPSvd6BQKBY+fLlhwEPBoPBdGNBwWAwKRAITAQGAZ8CPYF27rpfAoFAQZyfvdgcHkee8GpIpwewClh2yvQbcNyjPnPTUpwf/uo4gd0VmHpKnQqp5jsDa935bkCVSpUqrQSeAj4DngFicH4BXIPzi/F+YIq7zWTgxDuEkW6fu8/d4fhOds7fZKClO18G53XflEWb3wPhQDV3OkLasA8AJQEN4OfMUqB2IBCoHggEMjt3BAKBdK+3W3/SoUOH9gSDwf+kKjeBQKDWiXmgE/CXu3ob0MpdVxcoyAV8oeXVXTpLgVXW2p9PXWGMGeZRn7kpEXgCmAWEAmOB1cDLOL/UpgL/hxP0icBe4MFstPsY8C/gEmCGO+G2Pxbnl+hx4AGcq305O9k5f7OANsAaIAnniu/EX20/fv755zWAKjhXnD3d+lm5B5iAzluOBIPBxEAgkObcBYPB1YFA4GXgt2AweCL87wEmBIPB1K/3XUCzYsWKJQQCgT/csgeBFcC4QCBQDOdi6
0+cn0WAgcCYQCDQH+fcPXhKmxcUY+2533djTCngqLX2yDlv/CJhjOnlDmHJBUjn78Ll53PnSeCn6cAJf2ut3edpRyIikiVPxvCNMVWMMROMMXHAr8BSY0ysW1bNiz5FRCRrXr1pOxGYBIRba2tba2vhvIk5GWccU0REcplXgV/GWjvRWpt0osBam2StnQBcdM8FMMa0M8YEjTEbjDHPZLC+gDFmorv+19R/5RhjnnXLg8aYtqnKx7p/Fa3KnaMQOPtzaYwpbYyZb4yJN8Z8kNv7Lell41w2M8YsN8YkGmPuyIt9zG1eBf4yY8xoY0wTY0xFd2pijBkN/O5Rn3nCGBMKjALaA5cC9xhjLj2lWk9gn/uXzt9x78l363UFLsO513e02x44d+u08/wAJEVOziVwFHgB51ZbyWPZPJfbcO7S+TJ39y7veBX49wMrgZdwbp+aDQzDua2wu0d95pXGwFCul9MAAAO1SURBVAZr7SZr7XGcIatbTqlzCzDOnf8aaGWch3XcAkyw1h6z1m7G+aBOYwBr7X9xbueU3HPW59Jae9hauwgn+CXvnfZcWmu3WGtX4HyQ0Rc8uQ/ffYE/dKeLXQSwPdVyFNAkszrW2kRjzAGcoa0InE9zpt42wrtdldPIybnUB+HOL9k5l76TFw9P65jbfXrMZFB26r2umdXJzraSe3JyLuX8ovOUgbx4PPKF+zDpjGX3uSyVAYwx+YDiOMM1Z/JcEPFeTs6lnF/0s5UBzwLfGFPHGDPYGPOeMeZdd76utfZFr/rMI0uB2saY6saYzJ7tMRXncQgAdwA/WOcTb1OBru6dH9Vxnu+yJJf2W9LLybmU80t2zqXvePXBq8E4b5IYnABb6s6Pz+j2qAuZtTb1c1nWAl9Za1cbY142xnR2q30ClDbGbAAG4DwsDWvtauArnOe1zAT6nLiV1RgzHufBTwFjTJQxpmduHpcf5eRcAhhjtgBvAw+65+zUu0Ikl2TnXBpjGhljooA7gY+MMRf9FxV49SyddcBl1tqEU8rzA6uttbXPeaciIpIlr4Z0koGKGZRXwEe3QImInE+8ejxyP2CeMWY9J2+NqoLzfPAnPOpTRESy4NnTMo0xITgffojAGb+PApamftyCiIjkHs8fjywiIueHvLgPX0RE8oACX3zDGBPvQZtbjDFl8qJvkTOlwBcR8Qmv7tIRuSAYYzoBzwP5cb6kvJu1dpcxZhhQHedW4kicD1ldg/O43WigU6rPmQwyxrR05++11m5wPzn9Jc7P2MxU/RUBpgAlgTDgeWvtFG+PUsShK3zxu0XANdba+jifDn861bqaQAecx+r+G5hvra0H/M8tP+GgtbYx8AHwjlv2LvChtbYRsDNV3aPAbdbaBkBLYKT7qGwRzynwxe8qAbOMMSuBQThfRnPCDPcqfiUQyskr9ZVAtVT1xqf691p3vmmq8s9T1TXAa8aYFcBcnNuWy5+TIxE5DQW++N37wAfulXtvoGCqdccArLXJQEKqh6Qlk3Y41GZj/oRuQFmgobX2KmDXKX2KeEaBL35XHGdMHk4+BfNM3Z3q31/c+Z9wntAITsin7i/WWpvgjvtXPcs+Rc6Y3rQVPynkPh3xhLdxvnrzP8aYaJxvH6t+Fu0WMMb8inMBdY9b1hf40hjTF/gmVd0vgGnGmN+AP4C/zqI/kbOiT9qKiPiEhnRERHxCgS8i4hMKfBERn1Dgi4j4hAJfRMQnFPgiIj6hwBcR8Yn/B5WB+yFi76KKAAAAAElFTkSuQmCC\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "fig, ax = plt.subplots()\n",
+ "sns.heatmap(auc_df, cbar=False, annot=True, fmt=\".4g\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "More advanced tuning methods like Bayesian Optimization can be used for searching for the optimal model efficiently. The benefit of using, for example, `HyperDrive` from Azure Machine Learning Services, for tuning the parameters, is that, the tuning tasks can be distributed across nodes of a cluster and the optimization can be run concurrently to save the total cost.\n",
+ "\n",
+ "* Details about how to tune hyper parameters by using Azure Machine Learning Services can be found [here](https://github.com/microsoft/recommenders/tree/master/notebooks/04_model_select_and_optimize).\n",
+ "* Note, to enable the tuning task on Azure Machine Learning Services by using HyperDrive, one needs a Docker image to containerize the environment where `xlearn` can be run. The Docker file provided [here](https://github.com/microsoft/recommenders/tree/master/docker) can be used for such purpose."
+ ]
+ },
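+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The cell below is a minimal, illustrative sketch only and is not executed in this notebook: it assumes an existing Azure ML workspace configuration file, a compute target named `cpu-cluster`, and a hypothetical training script `train_xlearn.py` that accepts `--lr` and `--lambda` arguments, trains the FFM model, and logs the test AUC with `run.log('auc', ...)`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Illustrative sketch only -- assumes a workspace config file, a compute target\n",
+ "# named 'cpu-cluster', and a hypothetical training script 'train_xlearn.py'\n",
+ "# that logs its test AUC under the metric name 'auc'.\n",
+ "from azureml.core import Experiment, Workspace\n",
+ "from azureml.train.estimator import Estimator\n",
+ "from azureml.train.hyperdrive import (\n",
+ "    BayesianParameterSampling, HyperDriveConfig, PrimaryMetricGoal, uniform\n",
+ ")\n",
+ "\n",
+ "ws = Workspace.from_config()\n",
+ "estimator = Estimator(\n",
+ "    source_directory='.',\n",
+ "    entry_script='train_xlearn.py',\n",
+ "    compute_target='cpu-cluster',\n",
+ "    pip_packages=['xlearn', 'scikit-learn']\n",
+ ")\n",
+ "sampling = BayesianParameterSampling({\n",
+ "    '--lr': uniform(0.0001, 0.01),\n",
+ "    '--lambda': uniform(0.001, 0.1)\n",
+ "})\n",
+ "hd_config = HyperDriveConfig(\n",
+ "    estimator=estimator,\n",
+ "    hyperparameter_sampling=sampling,\n",
+ "    primary_metric_name='auc',\n",
+ "    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,\n",
+ "    max_total_runs=20,\n",
+ "    max_concurrent_runs=4\n",
+ ")\n",
+ "run = Experiment(ws, 'xlearn-ffm-tuning').submit(hd_config)"
+ ]
+ },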
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.4 Clean up"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tmpdir.cleanup()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## References"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ " \n",
+ "1. Rendle, Steffen. \"Factorization machines.\" 2010 IEEE International Conference on Data Mining. IEEE, 2010.\n",
+ "2. Juan, Yuchin, et al. \"Field-aware factorization machines for CTR prediction.\" Proceedings of the 10th ACM Conference on Recommender Systems. ACM, 2016.\n",
+ "3. Guo, Huifeng, et al. \"DeepFM: a factorization-machine based neural network for CTR prediction.\" arXiv preprint arXiv:1703.04247 (2017).\n",
+ "4. Lian, Jianxun, et al. \"xdeepfm: Combining explicit and implicit feature interactions for recommender systems.\" Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. ACM, 2018.\n",
+ "5. Qu, Yanru, et al. \"Product-based neural networks for user response prediction.\" 2016 IEEE 16th International Conference on Data Mining (ICDM). IEEE, 2016.\n",
+ "6. Zhang, Weinan, Tianming Du, and Jun Wang. \"Deep learning over multi-field categorical data.\" European conference on information retrieval. Springer, Cham, 2016.\n",
+ "7. He, Xiangnan, and Tat-Seng Chua. \"Neural factorization machines for sparse predictive analytics.\" Proceedings of the 40th International ACM SIGIR conference on Research and Development in Information Retrieval. ACM, 2017.\n",
+ "8. Cheng, Heng-Tze, et al. \"Wide & deep learning for recommender systems.\" Proceedings of the 1st workshop on deep learning for recommender systems. ACM, 2016.\n",
+ "9. Langford, John, Lihong Li, and Alex Strehl. \"Vowpal wabbit online learning project.\" (2007)."
+ ]
+ }
+ ],
+ "metadata": {
+ "celltoolbar": "Tags",
+ "kernelspec": {
+ "display_name": "Python 3.6 (Recommender)",
+ "language": "python",
+ "name": "reco_base"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/03_evaluate/README.md b/notebooks/03_evaluate/README.md
index e2b564b9c3..b30448851a 100644
--- a/notebooks/03_evaluate/README.md
+++ b/notebooks/03_evaluate/README.md
@@ -17,6 +17,7 @@ Two approaches for evaluating model performance are demonstrated along with thei
* Recall - this measures the proportion of relevant items that are recommended
* Normalized Discounted Cumulative Gain (NDCG) - evaluates how well the predicted items for a user are ranked based on relevance
* Mean Average Precision (MAP) - average precision for each user normalized over all users
+3. Classification metrics: These are used to evaluate binary labels
* Arear Under Curver (AUC) - integral area under the receiver operating characteristic curve
* Logistic loss (Logloss) - the negative log-likelihood of the true labels given the predictions of a classifier
diff --git a/notebooks/05_operationalize/als_movie_o16n.ipynb b/notebooks/05_operationalize/als_movie_o16n.ipynb
index bdbe3fc79b..b4e5e20527 100644
--- a/notebooks/05_operationalize/als_movie_o16n.ipynb
+++ b/notebooks/05_operationalize/als_movie_o16n.ipynb
@@ -15,10 +15,9 @@
"source": [
"# Building a Real-time Recommendation API\n",
"\n",
- "This reference architecture shows the full lifecycle of building a recommendation system. It walks through the creation of appropriate azure resources, training a recommendation model using Azure Databricks and deploying it as an API. It uses Azure Cosmos DB, Azure Machine Learning, and Azure Kubernetes Service. \n",
+ "This reference architecture shows the full lifecycle of building a recommendation system. It walks through the creation of appropriate azure resources, training a recommendation model using a Virtual Machine or Databricks, and deploying it as an API. It uses Azure Cosmos DB, Azure Machine Learning, and Azure Kubernetes Service. \n",
"\n",
"This architecture can be generalized for many recommendation engine scenarios, including recommendations for products, movies, and news. \n",
- "\n",
"### Architecture\n",
"![architecture](https://recodatasets.blob.core.windows.net/images/reco-arch.png \"Architecture\")\n",
"\n",
@@ -28,16 +27,18 @@
"\n",
"### Components\n",
"This architecture consists of the following key components:\n",
- "* [Azure Databricks](https://docs.microsoft.com/en-us/azure/azure-databricks/what-is-azure-databricks) is used as a development environment to prepare input data and train the recommender model on a Spark cluster. Azure Databricks also provides an interactive workspace to run and collaborate on notebooks for any data processing or machine learning tasks. \n",
+ "* [Azure Databricks](https://docs.microsoft.com/en-us/azure/azure-databricks/what-is-azure-databricks)1) is used as a development environment to prepare input data and train the recommender model on a Spark cluster. Azure Databricks also provides an interactive workspace to run and collaborate on notebooks for any data processing or machine learning tasks.\n",
"* [Azure Kubernetes Service](https://docs.microsoft.com/en-us/azure/aks/intro-kubernetes)(AKS) is used to deploy and operationalize a machine learning model service API on a Kubernetes cluster. AKS hosts the containerized model, providing scalability that meets throughput requirements, identity and access management, and logging and health monitoring. \n",
"* [Azure Cosmos DB](https://docs.microsoft.com/en-us/azure/cosmos-db/introduction) is a globally distributed database service used to store the top 10 recommended movies for each user. Azure Cosmos DB is ideal for this scenario as it provides low latency (10 ms at 99th percentile) to read the top recommended items for a given user. \n",
"* [Azure Machine Learning Service](https://docs.microsoft.com/en-us/azure/machine-learning/service/) is a service used to track and manage machine learning models, and then package and deploy these models to a scalable Azure Kubernetes Service environment.\n",
"\n",
+ "1) Here, we are just giving an example of using Azure Databricks. Any platforms listed in [SETUP](https://github.com/microsoft/recommenders/blob/master/SETUP.md) can be used as well. \n",
+ "\n",
"\n",
"### Table of Contents.\n",
"0. [File Imports](#0-File-Imports)\n",
"1. [Service Creation](#1-Service-Creation)\n",
- "2. [Training](#2-Training)\n",
+ "2. [Training and evaluation](#2-Training)\n",
"3. [Operationalization](#3.-Operationalize-the-Recommender-Service)"
]
},
@@ -46,12 +47,9 @@
"metadata": {},
"source": [
"## Setup\n",
+ "To run this notebook on Azure Databricks, you should setup Azure Databricks by following the appropriate sections in the repository [SETUP instructions](https://github.com/microsoft/recommenders/blob/master/SETUP.md) and import this notebook into your Azure Databricks Workspace (see instructions [here](https://docs.azuredatabricks.net/user-guide/notebooks/notebook-manage.html#import-a-notebook)).\n",
"\n",
- "This notebook should be run on Azure Databricks. To import this notebook into your Azure Databricks Workspace, see instructions [here](https://docs.azuredatabricks.net/user-guide/notebooks/notebook-manage.html#import-a-notebook).\n",
- "\n",
- "Setup for Azure Databricks should be completed by following the appropriate sections in the repository's [SETUP file](../../SETUP.md). \n",
- "\n",
- "Please note: This notebook **REQUIRES** that you add the dependencies to support operationalization. See the [SETUP file](../../SETUP.md) for details."
+ "Please note: This notebook **REQUIRES** that you add the dependencies to support **operationalization**. See [SETUP](https://github.com/microsoft/recommenders/blob/master/SETUP.md) for details.\n"
]
},
{
@@ -63,66 +61,126 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Azure SDK version: 1.0.69\n"
+ ]
+ }
+ ],
"source": [
- "import numpy as np\n",
"import os\n",
- "import pandas as pd\n",
- "import pprint\n",
- "import shutil\n",
- "import time, timeit\n",
+ "import sys\n",
+ "sys.path.append(\"../../\")\n",
+ "import time\n",
"import urllib\n",
- "import yaml\n",
- "import json\n",
- "import uuid\n",
- "import matplotlib\n",
- "import matplotlib.pyplot as plt\n",
"\n",
"from azure.common.client_factory import get_client_from_cli_profile\n",
- "from azure.mgmt.compute import ComputeManagementClient\n",
"import azure.mgmt.cosmosdb\n",
"import azureml.core\n",
"from azureml.core import Workspace\n",
- "from azureml.core.run import Run\n",
- "from azureml.core.experiment import Experiment\n",
"from azureml.core.model import Model\n",
- "from azureml.core.image import ContainerImage\n",
"from azureml.core.compute import AksCompute, ComputeTarget\n",
+ "from azureml.core.compute_target import ComputeTargetException\n",
"from azureml.core.webservice import Webservice, AksWebservice\n",
- "\n",
- "\n",
- "import pydocumentdb\n",
+ "from azureml.exceptions import WebserviceException\n",
+ "from azureml.core import Environment\n",
+ "from azureml.core.environment import CondaDependencies\n",
+ "from azureml.core.model import InferenceConfig\n",
+ "from azureml.core.environment import SparkPackage\n",
"import pydocumentdb.document_client as document_client\n",
- "\n",
- "import pyspark\n",
- "from pyspark.ml.feature import StringIndexer\n",
"from pyspark.ml.recommendation import ALS\n",
- "from pyspark.sql import Row\n",
"from pyspark.sql.types import StructType, StructField\n",
- "from pyspark.sql.types import StringType, FloatType, IntegerType, LongType\n",
+ "from pyspark.sql.types import FloatType, IntegerType, LongType\n",
"\n",
+ "from reco_utils.common.timer import Timer\n",
+ "from reco_utils.common.spark_utils import start_or_get_spark\n",
"from reco_utils.dataset import movielens\n",
"from reco_utils.dataset.cosmos_cli import find_collection, read_collection, read_database, find_database\n",
+ "from reco_utils.dataset.download_utils import maybe_download\n",
"from reco_utils.dataset.spark_splitters import spark_random_split\n",
"from reco_utils.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation\n",
+ "from reco_utils.common.notebook_utils import is_databricks\n",
"\n",
- "print(\"PySpark version:\", pyspark.__version__)\n",
"print(\"Azure SDK version:\", azureml.core.VERSION)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In JupyterHub, environment variables defined in `.../etc/conda/activate.d` may not be activated. If so, run the following cell to set PySpark environment variables. Make sure your conda environment path is `/anaconda/envs/reco_pyspark` or change the paths in the script."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "os.environ[\"PYSPARK_PYTHON\"]=\"/anaconda/envs/reco_pyspark/bin/python\"\n",
+ "os.environ[\"PYSPARK_DRIVER_PYTHON\"]=\"/anaconda/envs/reco_pyspark/bin/python\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ "
SparkContext
\n",
+ "\n",
+ "
Spark UI
\n",
+ "\n",
+ "
\n",
+ " Version \n",
+ " v2.3.1
\n",
+ " Master \n",
+ " local[*]
\n",
+ " AppName \n",
+ " ALS
\n",
+ " \n",
+ "
\n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Start spark session if needed\n",
+ "if not is_databricks():\n",
+ " cosmos_connector = (\n",
+ " \"https://search.maven.org/remotecontent?filepath=com/microsoft/azure/\"\n",
+ " \"azure-cosmosdb-spark_2.3.0_2.11/1.3.3/azure-cosmosdb-spark_2.3.0_2.11-1.3.3-uber.jar\"\n",
+ " )\n",
+ " jar_filepath = maybe_download(url=cosmos_connector, filename=\"cosmos.jar\")\n",
+ " spark = start_or_get_spark(\"ALS\", memory=\"10g\", jars=[jar_filepath])\n",
+ " sc = spark.sparkContext\n",
+ "display(sc)"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1 Service Creation\n",
- "Modify the **Subscription ID** to the subscription you would like to deploy to.\n",
+ "Modify the **Subscription ID** to the subscription you would like to deploy to and set the resource name variables.\n",
"\n",
"#### Services created by this notebook:\n",
- "1. [Azure ML Service](https://docs.databricks.com/user-guide/libraries.html)\n",
+ "1. [Azure ML Service](https://azure.microsoft.com/en-us/services/machine-learning-service/)\n",
"1. [Azure Cosmos DB](https://azure.microsoft.com/en-us/services/cosmos-db/)\n",
"1. [Azure Container Registery](https://docs.microsoft.com/en-us/azure/container-registry/)\n",
"1. [Azure Container Instances](https://docs.microsoft.com/en-us/azure/container-instances/)\n",
@@ -132,67 +190,107 @@
"1. [Azure Kubernetes Service (AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Add your Azure subscription ID**"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "execution_count": 4,
+ "metadata": {},
"outputs": [],
"source": [
- "# Select the services names\n",
- "short_uuid = str(uuid.uuid4())[:4]\n",
- "prefix = \"reco\" + short_uuid\n",
- "data = \"mvl\"\n",
- "algo = \"als\"\n",
+ "# Add your subscription ID\n",
+ "subscription_id = \"\"\n",
"\n",
- "# location to store the secrets file for cosmosdb\n",
- "secrets_path = '/dbfs/FileStore/dbsecrets.json'\n",
- "ws_config_path = '/dbfs/FileStore'\n",
+ "# Set your workspace name\n",
+ "workspace_name = \"o16n-test\"\n",
+ "resource_group = \"{}-rg\".format(workspace_name)\n",
"\n",
- "# Add your subscription ID\n",
- "subscription_id = \"\"\n"
+ "# Set your region to deploy Azure ML workspace\n",
+ "location = \"eastus\"\n",
+ "\n",
+ "# AzureML service and Azure Kubernetes Service prefix\n",
+ "service_name = \"mvl-als\""
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Login for Azure CLI so that AzureML can use Azure CLI login credentials\n",
+ "!az login"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
"outputs": [],
"source": [
- "# Resource group and workspace\n",
- "resource_group = prefix + \"_\" + data\n",
- "workspace_name = prefix + \"_\"+data+\"_aml\"\n",
- "workspace_region = \"westus2\"\n",
- "print(\"Resource group:\", resource_group)\n",
+ "# Change subscription if needed\n",
+ "!az account set --subscription {subscription_id}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Check account\n",
+ "!az account show"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# CosmosDB\n",
+ "account_name = \"{}-ds-sql\".format(workspace_name)\n",
+ "# account_name for CosmosDB cannot have \"_\" and needs to be less than 31 chars\n",
+ "account_name = account_name.replace(\"_\", \"-\")[:31]\n",
+ "cosmos_database = \"recommendations\"\n",
+ "cosmos_collection = \"user_recommendations_als\"\n",
+ "\n",
+ "# AzureML resource names\n",
+ "model_name = \"{}-reco.mml\".format(service_name)\n",
+ "aks_name = \"{}-aks\".format(service_name)\n",
+ "container_image_name = \"{}-img\".format(service_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# top k items to recommend\n",
+ "TOP_K = 10\n",
"\n",
- "# Columns\n",
+ "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n",
+ "MOVIELENS_DATA_SIZE = '100k'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"userCol = \"UserId\"\n",
"itemCol = \"MovieId\"\n",
"ratingCol = \"Rating\"\n",
"\n",
- "# CosmosDB\n",
- "location = workspace_region\n",
- "account_name = resource_group + \"-ds-sql\"\n",
- "# account_name for CosmosDB cannot have \"_\" and needs to be less than 31 chars\n",
- "account_name = account_name.replace(\"_\",\"-\")[0:min(31,len(prefix))]\n",
- "DOCUMENTDB_DATABASE = \"recommendations\"\n",
- "DOCUMENTDB_COLLECTION = \"user_recommendations_\" + algo\n",
- "\n",
- "# AzureML\n",
- "history_name = 'spark-ml-notebook'\n",
- "model_name = data+\"-\"+algo+\"-reco.mml\" #NOTE: The name of a asset must be only letters or numerals, not contain spaces, and under 30 characters\n",
- "service_name = data + \"-\" + algo\n",
- "experiment_name = data + \"_\"+ algo +\"_Experiment\"\n",
- "# Name here must be <= 16 chars and only include letters, numbers and \"-\"\n",
- "aks_name = prefix.replace(\"_\",\"-\")[0:min(12,len(prefix))] + '-aks'\n",
- "# add a name for the container\n",
- "container_image_name = '-'.join([data, algo])\n",
- "\n",
- "train_data_path = data + \"Train\"\n",
- "test_data_path = data + \"Test\""
+ "train_data_path = \"train\"\n",
+ "test_data_path = \"test\""
]
},
{
@@ -205,40 +303,58 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING - Warning: Falling back to use azure cli login credentials.\n",
+ "If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.\n",
+ "Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.\n"
+ ]
+ }
+ ],
"source": [
- "ws = Workspace.create(name = workspace_name,\n",
- " subscription_id = subscription_id,\n",
- " resource_group = resource_group, \n",
- " location = workspace_region,\n",
- " exist_ok=True)\n",
- "\n",
- "# persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n",
- "ws.write_config(ws_config_path)"
+ "ws = Workspace.create(\n",
+ " name=workspace_name,\n",
+ " subscription_id=subscription_id,\n",
+ " resource_group=resource_group, \n",
+ " location=location,\n",
+ " exist_ok=True\n",
+ ")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 1.2 Create a Cosmos DB resource to store recommendation results:"
+ "### 1.2 Create a Cosmos DB to store recommendation results\n",
+ "\n",
+ "This step will take some time to create CosmosDB resources."
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Database found\n",
+ "Collection found\n"
+ ]
+ }
+ ],
"source": [
- "## explicitly pass subscription_id in case user has multiple subscriptions\n",
- "client = get_client_from_cli_profile(azure.mgmt.cosmosdb.CosmosDB,\n",
- " subscription_id=subscription_id)\n",
+ "# explicitly pass subscription_id in case user has multiple subscriptions\n",
+ "client = get_client_from_cli_profile(\n",
+ " azure.mgmt.cosmosdb.CosmosDB,\n",
+ " subscription_id=subscription_id\n",
+ ")\n",
"\n",
"async_cosmosdb_create = client.database_accounts.create_or_update(\n",
" resource_group,\n",
@@ -252,51 +368,46 @@
")\n",
"account = async_cosmosdb_create.result()\n",
"\n",
- "my_keys = client.database_accounts.list_keys(\n",
- " resource_group,\n",
- " account_name\n",
- ")\n",
- "\n",
+ "my_keys = client.database_accounts.list_keys(resource_group, account_name)\n",
"master_key = my_keys.primary_master_key\n",
"endpoint = \"https://\" + account_name + \".documents.azure.com:443/\"\n",
"\n",
- "#db client\n",
+ "# DB client\n",
"client = document_client.DocumentClient(endpoint, {'masterKey': master_key})\n",
"\n",
- "if find_database(client, DOCUMENTDB_DATABASE) == False:\n",
- " db = client.CreateDatabase({ 'id': DOCUMENTDB_DATABASE })\n",
+ "if not find_database(client, cosmos_database):\n",
+ " db = client.CreateDatabase({'id': cosmos_database })\n",
+ " print(\"Database created\")\n",
"else:\n",
- " db = read_database(client, DOCUMENTDB_DATABASE)\n",
+ " db = read_database(client, cosmos_database)\n",
+ " print(\"Database found\")\n",
+ "\n",
"# Create collection options\n",
- "options = {\n",
- " 'offerThroughput': 11000\n",
- "}\n",
+ "options = dict(offerThroughput=11000)\n",
"\n",
"# Create a collection\n",
- "collection_definition = { 'id': DOCUMENTDB_COLLECTION, 'partitionKey': {'paths': ['/id'],'kind': 'Hash'} }\n",
- "if find_collection(client,DOCUMENTDB_DATABASE, DOCUMENTDB_COLLECTION) ==False:\n",
- " collection = client.CreateCollection(db['_self'], collection_definition, options)\n",
- "else:\n",
- " collection = read_collection(client, DOCUMENTDB_DATABASE, DOCUMENTDB_COLLECTION)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "secrets = {\n",
- " \"Endpoint\": endpoint,\n",
- " \"Masterkey\": master_key,\n",
- " \"Database\": DOCUMENTDB_DATABASE,\n",
- " \"Collection\": DOCUMENTDB_COLLECTION,\n",
- " \"Upsert\": \"true\"\n",
+ "collection_definition = {\n",
+ " 'id': cosmos_collection,\n",
+ " 'partitionKey': {'paths': ['/id'],'kind': 'Hash'}\n",
"}\n",
- "with open(secrets_path, \"w\") as file:\n",
- " json.dump(secrets, file)"
+ "if not find_collection(client, cosmos_database, cosmos_collection):\n",
+ " collection = client.CreateCollection(\n",
+ " db['_self'], \n",
+ " collection_definition,\n",
+ " options\n",
+ " )\n",
+ " print(\"Collection created\")\n",
+ "else:\n",
+ " collection = read_collection(client, cosmos_database, cosmos_collection)\n",
+ " print(\"Collection found\")\n",
+ " \n",
+ "dbsecrets = dict(\n",
+ " Endpoint=endpoint, \n",
+ " Masterkey=master_key, \n",
+ " Database=cosmos_database, \n",
+ " Collection=cosmos_collection, \n",
+ " Upsert=True\n",
+ ")"
]
},
{
@@ -305,50 +416,67 @@
"source": [
"## 2 Training\n",
"\n",
- "Next, we will train an [Alternating Least Squares model](https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html) is trained using the [MovieLens](https://grouplens.org/datasets/movielens/) dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "# top k items to recommend\n",
- "TOP_K = 10\n",
+ "Next, we train an [Alternating Least Squares model](https://spark.apache.org/docs/latest/ml-collaborative-filtering.html) on [MovieLens](https://grouplens.org/datasets/movielens/) dataset.\n",
"\n",
- "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n",
- "MOVIELENS_DATA_SIZE = '100k'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
"### 2.1 Download the MovieLens dataset"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 4.81k/4.81k [00:00<00:00, 16.8kKB/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------+-------+------+\n",
+ "|UserId|MovieId|Rating|\n",
+ "+------+-------+------+\n",
+ "| 196| 242| 3.0|\n",
+ "| 186| 302| 3.0|\n",
+ "| 22| 377| 1.0|\n",
+ "| 244| 51| 2.0|\n",
+ "| 166| 346| 1.0|\n",
+ "| 298| 474| 4.0|\n",
+ "| 115| 265| 2.0|\n",
+ "| 253| 465| 5.0|\n",
+ "| 305| 451| 3.0|\n",
+ "| 6| 86| 3.0|\n",
+ "| 62| 257| 2.0|\n",
+ "| 286| 1014| 5.0|\n",
+ "| 200| 222| 5.0|\n",
+ "| 210| 40| 3.0|\n",
+ "| 224| 29| 3.0|\n",
+ "| 303| 785| 3.0|\n",
+ "| 122| 387| 5.0|\n",
+ "| 194| 274| 2.0|\n",
+ "| 291| 1042| 4.0|\n",
+ "| 234| 1184| 2.0|\n",
+ "+------+-------+------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
"# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n",
"schema = StructType(\n",
" (\n",
- " StructField(\"UserId\", IntegerType()),\n",
- " StructField(\"MovieId\", IntegerType()),\n",
- " StructField(\"Rating\", FloatType()),\n",
- " StructField(\"Timestamp\", LongType()),\n",
+ " StructField(userCol, IntegerType()),\n",
+ " StructField(itemCol, IntegerType()),\n",
+ " StructField(ratingCol, FloatType()),\n",
" )\n",
")\n",
"\n",
- "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, dbutils=dbutils)\n",
+ "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema)\n",
"data.show()"
]
},
@@ -362,43 +490,41 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "N train 75031\n",
+ "N test 24969\n"
+ ]
+ }
+ ],
"source": [
"train, test = spark_random_split(data, ratio=0.75, seed=42)\n",
- "print (\"N train\", train.cache().count())\n",
- "print (\"N test\", test.cache().count())"
+ "print(\"N train\", train.cache().count())\n",
+ "print(\"N test\", test.cache().count())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 2.3 Train the ALS model on the training data, and get the top-k recommendations for our testing data\n",
+ "### 2.3 Train the ALS model on the training data\n",
"\n",
"To predict movie ratings, we use the rating data in the training set as users' explicit feedback. The hyperparameters used to estimate the model are set based on [this page](http://mymedialite.net/examples/datasets.html).\n",
"\n",
- "Under most circumstances, you would explore the hyperparameters and choose an optimal set based on some criteria. For additional details on this process, please see additional information in the deep dives [here](../04_model_select_and_optimize/hypertune_spark_deep_dive.ipynb)."
+ "Under most circumstances, you would explore the hyperparameters and choose an optimal set based on some criteria. For additional details on this process, please see additional information in the deep dives [here](https://github.com/microsoft/recommenders/blob/master/notebooks/04_model_select_and_optimize/tuning_spark_als.ipynb)."
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "execution_count": 12,
+ "metadata": {},
"outputs": [],
"source": [
- "header = {\n",
- " \"userCol\": \"UserId\",\n",
- " \"itemCol\": \"MovieId\",\n",
- " \"ratingCol\": \"Rating\",\n",
- "}\n",
- "\n",
- "\n",
"als = ALS(\n",
" rank=10,\n",
" maxIter=15,\n",
@@ -407,16 +533,16 @@
" regParam=0.05,\n",
" coldStartStrategy='drop',\n",
" nonnegative=True,\n",
- " **header\n",
+ " userCol=userCol,\n",
+ " itemCol=itemCol,\n",
+ " ratingCol=ratingCol,\n",
")"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "execution_count": 13,
+ "metadata": {},
"outputs": [],
"source": [
"model = als.fit(train)"
@@ -426,6 +552,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
+ "### 2.4 Get top-k recommendations for our testing data\n",
+ "\n",
"In the movie recommendation use case, recommending movies that have been rated by the users do not make sense. Therefore, the rated movies are removed from the recommended items.\n",
"\n",
"In order to achieve this, we recommend all movies to all users, and then remove the user-movie pairs that exist in the training dataset."
@@ -433,47 +561,98 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------+-------+----------+\n",
+ "|UserId|MovieId|prediction|\n",
+ "+------+-------+----------+\n",
+ "| 148| 148| 2.9885666|\n",
+ "| 463| 148| 2.4566634|\n",
+ "| 471| 148| 5.211588|\n",
+ "| 496| 148| 2.7297416|\n",
+ "| 833| 148| 1.3390744|\n",
+ "| 243| 148| 2.865699|\n",
+ "| 392| 148| 2.7495515|\n",
+ "| 540| 148| 3.3569064|\n",
+ "| 623| 148| 2.7875235|\n",
+ "| 737| 148| 2.05034|\n",
+ "| 858| 148| 2.3909347|\n",
+ "| 897| 148| 3.0353885|\n",
+ "| 31| 148| 1.9389185|\n",
+ "| 516| 148| 3.8283033|\n",
+ "| 85| 148| 2.354567|\n",
+ "| 137| 148| 4.645254|\n",
+ "| 251| 148| 2.9032419|\n",
+ "| 451| 148| 3.7471225|\n",
+ "| 580| 148| 3.8034196|\n",
+ "| 808| 148| 3.6879075|\n",
+ "+------+-------+----------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
"# Get the cross join of all user-item pairs and score them.\n",
- "users = train.select('UserId').distinct()\n",
- "items = train.select('MovieId').distinct()\n",
+ "users = train.select(userCol).distinct()\n",
+ "items = train.select(itemCol).distinct()\n",
"user_item = users.crossJoin(items)\n",
- "dfs_pred = model.transform(user_item)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
+ "dfs_pred = model.transform(user_item)\n",
"dfs_pred.show()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------+-------+----------+\n",
+ "|UserId|MovieId|prediction|\n",
+ "+------+-------+----------+\n",
+ "| 1| 587| 3.1521533|\n",
+ "| 1| 869| 2.416863|\n",
+ "| 1| 1208| 3.1338184|\n",
+ "| 1| 1677| 3.0138044|\n",
+ "| 2| 80| 2.7144198|\n",
+ "| 2| 303| 4.0214634|\n",
+ "| 2| 472| 2.5440526|\n",
+ "| 2| 582| 4.103753|\n",
+ "| 2| 838| 1.0517997|\n",
+ "| 2| 975| 2.737259|\n",
+ "| 2| 1260| 3.8516903|\n",
+ "| 2| 1325| 1.1345804|\n",
+ "| 2| 1381| 3.3377702|\n",
+ "| 2| 1530| 2.2663925|\n",
+ "| 3| 22| 3.2361977|\n",
+ "| 3| 57| 2.531235|\n",
+ "| 3| 89| 3.7172291|\n",
+ "| 3| 367| 2.3831084|\n",
+ "| 3| 1091| 1.6094522|\n",
+ "| 3| 1167| 3.595107|\n",
+ "+------+-------+----------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
"# Remove seen items.\n",
"dfs_pred_exclude_train = dfs_pred.alias(\"pred\").join(\n",
" train.alias(\"train\"),\n",
- " (dfs_pred['UserId'] == train['UserId']) & (dfs_pred['MovieId'] == train['MovieId']),\n",
+ " (dfs_pred[userCol]==train[userCol]) & (dfs_pred[itemCol]==train[itemCol]),\n",
" how='outer'\n",
")\n",
- "\n",
- "top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[\"train.Rating\"].isNull()) \\\n",
- " .select('pred.' + 'UserId', 'pred.' + 'MovieId', 'pred.' + \"prediction\")\n",
+ "top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[\"train.\"+ratingCol].isNull()) \\\n",
+ " .select(\"pred.\"+userCol, \"pred.\"+itemCol, \"pred.prediction\")\n",
"\n",
"top_all.show()"
]
@@ -482,92 +661,149 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 2.4 Evaluate how well ALS performs\n",
+ "### 2.5 Evaluate how well ALS performs\n",
"\n",
- "Evaluate model performance using metrics such as Precision@K, Recall@K, [MAP](https://en.wikipedia.org/wiki/Evaluation_measures_\\(information_retrieval\\)) or [nDCG](https://en.wikipedia.org/wiki/Discounted_cumulative_gain). For a full guide on what metrics to evaluate your recommender with, consult [this guide](https://github.com/Microsoft/Recommenders/blob/master/notebooks/03_evaluate/evaluation.ipynb)."
+ "Evaluate model performance using metrics such as Precision@K, Recall@K, [MAP@K](https://en.wikipedia.org/wiki/Evaluation_measures_\\(information_retrieval\\) or [nDCG@K](https://en.wikipedia.org/wiki/Discounted_cumulative_gain). For a full guide on what metrics to evaluate your recommender with, consult [this guide](https://github.com/Microsoft/Recommenders/blob/master/notebooks/03_evaluate/evaluation.ipynb)."
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+------+-------+------+\n",
+ "|UserId|MovieId|Rating|\n",
+ "+------+-------+------+\n",
+ "| 1| 2| 3.0|\n",
+ "| 1| 3| 4.0|\n",
+ "| 1| 4| 3.0|\n",
+ "| 1| 14| 5.0|\n",
+ "| 1| 17| 3.0|\n",
+ "| 1| 27| 2.0|\n",
+ "| 1| 29| 1.0|\n",
+ "| 1| 35| 1.0|\n",
+ "| 1| 36| 2.0|\n",
+ "| 1| 51| 4.0|\n",
+ "| 1| 52| 4.0|\n",
+ "| 1| 54| 3.0|\n",
+ "| 1| 56| 4.0|\n",
+ "| 1| 60| 5.0|\n",
+ "| 1| 64| 5.0|\n",
+ "| 1| 69| 3.0|\n",
+ "| 1| 77| 4.0|\n",
+ "| 1| 83| 3.0|\n",
+ "| 1| 85| 3.0|\n",
+ "| 1| 88| 4.0|\n",
+ "+------+-------+------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ],
"source": [
+ "cols = {\n",
+ " 'col_user': userCol,\n",
+ " 'col_item': itemCol,\n",
+ " 'col_rating': ratingCol,\n",
+ " 'col_prediction': \"prediction\",\n",
+ "}\n",
+ "\n",
"test.show()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "rank_eval = SparkRankingEvaluation(test, top_all, k = TOP_K, col_user=\"UserId\", col_item=\"MovieId\", \n",
- " col_rating=\"Rating\", col_prediction=\"prediction\", \n",
- " relevancy_method=\"top_k\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model:\tALS\n",
+ "Top K:\t10\n",
+ "MAP:\t0.002841\n",
+ "NDCG:\t0.029330\n",
+ "Precision@K:\t0.033298\n",
+ "Recall@K:\t0.010961\n"
+ ]
+ }
+ ],
"source": [
"# Evaluate Ranking Metrics\n",
+ "rank_eval = SparkRankingEvaluation(\n",
+ " test, \n",
+ " top_all, \n",
+ " k=TOP_K,\n",
+ " **cols\n",
+ ")\n",
"\n",
- "print(\"Model:\\tALS\",\n",
- " \"Top K:\\t%d\" % rank_eval.k,\n",
- " \"MAP:\\t%f\" % rank_eval.map_at_k(),\n",
- " \"NDCG:\\t%f\" % rank_eval.ndcg_at_k(),\n",
- " \"Precision@K:\\t%f\" % rank_eval.precision_at_k(),\n",
- " \"Recall@K:\\t%f\" % rank_eval.recall_at_k(), sep='\\n')"
+ "print(\n",
+ " \"Model:\\tALS\",\n",
+ " \"Top K:\\t%d\" % rank_eval.k,\n",
+ " \"MAP:\\t%f\" % rank_eval.map_at_k(),\n",
+ " \"NDCG:\\t%f\" % rank_eval.ndcg_at_k(),\n",
+ " \"Precision@K:\\t%f\" % rank_eval.precision_at_k(),\n",
+ " \"Recall@K:\\t%f\" % rank_eval.recall_at_k(), sep='\\n'\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model:\tALS rating prediction\n",
+ "RMSE:\t0.95\n",
+ "MAE:\t0.743691\n",
+ "Explained variance:\t0.284494\n",
+ "R squared:\t0.279619\n"
+ ]
+ }
+ ],
"source": [
"# Evaluate Rating Metrics\n",
- "\n",
"prediction = model.transform(test)\n",
- "rating_eval = SparkRatingEvaluation(test, prediction, col_user=\"UserId\", col_item=\"MovieId\", \n",
- " col_rating=\"Rating\", col_prediction=\"prediction\")\n",
+ "rating_eval = SparkRatingEvaluation(\n",
+ " test, \n",
+ " prediction, \n",
+ " **cols\n",
+ ")\n",
"\n",
- "print(\"Model:\\tALS rating prediction\",\n",
- " \"RMSE:\\t%.2f\" % rating_eval.rmse(),\n",
- " \"MAE:\\t%f\" % rating_eval.mae(),\n",
- " \"Explained variance:\\t%f\" % rating_eval.exp_var(),\n",
- " \"R squared:\\t%f\" % rating_eval.rsquared(), sep='\\n')"
+ "print(\n",
+ " \"Model:\\tALS rating prediction\",\n",
+ " \"RMSE:\\t%.2f\" % rating_eval.rmse(),\n",
+ " \"MAE:\\t%f\" % rating_eval.mae(),\n",
+ " \"Explained variance:\\t%f\" % rating_eval.exp_var(),\n",
+ " \"R squared:\\t%f\" % rating_eval.rsquared(), sep='\\n'\n",
+ ")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### 2.5 Save the model"
+ "### 2.6 Save the model"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "execution_count": 19,
+ "metadata": {},
"outputs": [],
"source": [
- "model.write().overwrite().save(model_name)\n",
- "model_local = \"file:\" + os.getcwd() + \"/\" + model_name\n",
- "dbutils.fs.cp(model_name, model_local, True)"
+ "(model\n",
+ " .write()\n",
+ " .overwrite()\n",
+ " .save(model_name))"
]
},
{
@@ -589,17 +825,62 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "+---+--------------------+\n",
+ "| id| MovieId|\n",
+ "+---+--------------------+\n",
+ "|471|[1240, 1038, 867,...|\n",
+ "|463|[1195, 958, 1126,...|\n",
+ "|833|[179, 730, 1597, ...|\n",
+ "|496|[320, 1589, 767, ...|\n",
+ "|148|[320, 59, 253, 74...|\n",
+ "|540|[1512, 1642, 1367...|\n",
+ "|392|[1512, 511, 1367,...|\n",
+ "|243|[1512, 1367, 1642...|\n",
+ "|623|[958, 50, 172, 95...|\n",
+ "|737|[1512, 1524, 206,...|\n",
+ "|897|[1368, 320, 1643,...|\n",
+ "|858|[1266, 1195, 1240...|\n",
+ "| 31|[320, 1344, 836, ...|\n",
+ "|516|[1195, 1642, 1005...|\n",
+ "|580|[1512, 1098, 1015...|\n",
+ "|251|[1643, 1154, 127,...|\n",
+ "|451|[1154, 1368, 1218...|\n",
+ "| 85|[1512, 1367, 1643...|\n",
+ "|137|[1154, 169, 1639,...|\n",
+ "|808|[958, 867, 1449, ...|\n",
+ "+---+--------------------+\n",
+ "only showing top 20 rows\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "recs = model.recommendForAllUsers(10)\n",
+ "recs_topk = recs.withColumn(\"id\", recs[userCol].cast(\"string\")) \\\n",
+ " .select(\"id\", \"recommendations.\" + itemCol)\n",
+ "recs_topk.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
"outputs": [],
"source": [
- "with open(secrets_path) as json_data:\n",
- " writeConfig = json.load(json_data)\n",
- " recs = model.recommendForAllUsers(10)\n",
- " recs.withColumn(\"id\",recs[userCol].cast(\"string\")).select(\"id\", \"recommendations.\"+ itemCol)\\\n",
- " .write.format(\"com.microsoft.azure.cosmosdb.spark\").mode('overwrite').options(**writeConfig).save()"
+ "# Save data to CosmosDB\n",
+ "(recs_topk.coalesce(1)\n",
+ " .write\n",
+ " .format(\"com.microsoft.azure.cosmosdb.spark\")\n",
+ " .mode('overwrite')\n",
+ " .options(**dbsecrets)\n",
+ " .save())"
]
},
{
@@ -615,87 +896,43 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "execution_count": 26,
+ "metadata": {},
"outputs": [],
"source": [
"score_sparkml = \"\"\"\n",
- "\n",
"import json\n",
+ "import pydocumentdb.document_client as document_client\n",
+ "\n",
"def init(local=False):\n",
" global client, collection\n",
" try:\n",
- " # Query them in SQL\n",
- " import pydocumentdb.document_client as document_client\n",
- "\n",
- " MASTER_KEY = '{key}'\n",
- " HOST = '{endpoint}'\n",
- " DATABASE_ID = \"{database}\"\n",
- " COLLECTION_ID = \"{collection}\"\n",
- " database_link = 'dbs/' + DATABASE_ID\n",
- " collection_link = database_link + '/colls/' + COLLECTION_ID\n",
- " \n",
- " client = document_client.DocumentClient(HOST, {'masterKey': MASTER_KEY})\n",
- " collection = client.ReadCollection(collection_link=collection_link)\n",
+ " client = document_client.DocumentClient('{endpoint}', dict(masterKey='{key}'))\n",
+ " collection = client.ReadCollection(collection_link='dbs/{database}/colls/{collection}')\n",
" except Exception as e:\n",
- " collection = e\n",
- "def run(input_json): \n",
+ " collection = e\n",
"\n",
+ "def run(input_json):\n",
" try:\n",
- " import json\n",
- "\n",
- " id = json.loads(json.loads(input_json)[0])['id']\n",
- " query = {'query': 'SELECT * FROM c WHERE c.id = \"' + str(id) +'\"' } #+ str(id)\n",
- "\n",
- " options = {'partitionKey':str(id)}\n",
- " document_link = 'dbs/{DOCUMENTDB_DATABASE}/colls/{DOCUMENTDB_COLLECTION}/docs/{0}'.format(id)\n",
- " result = client.ReadDocument(document_link, options);\n",
- " \n",
+ " # Query them in SQL\n",
+ " id = str(json.loads(json.loads(input_json)[0])['id'])\n",
+ " query = dict(query='SELECT * FROM c WHERE c.id = \"' + id +'\"')\n",
+ " options = dict(partitionKey=str(id))\n",
+ " document_link = 'dbs/{database}/colls/{collection}/docs/' + id\n",
+ " result = client.ReadDocument(document_link, options); \n",
" except Exception as e:\n",
" result = str(e)\n",
- " return json.dumps(str(result)) #json.dumps({{\"result\":result}})\n",
- "\"\"\"\n",
- "\n",
+ " return json.dumps(str(result))\n",
+ "\"\"\".format(key=dbsecrets['Masterkey'], \n",
+ " endpoint=dbsecrets['Endpoint'], \n",
+ " database=dbsecrets['Database'], \n",
+ " collection=dbsecrets['Collection'])\n",
"\n",
- "with open(secrets_path) as json_data:\n",
- " writeConfig = json.load(json_data)\n",
- " score_sparkml = score_sparkml.replace(\"{key}\",writeConfig['Masterkey']).replace(\"{endpoint}\",writeConfig['Endpoint']).replace(\"{database}\",writeConfig['Database']).replace(\"{collection}\",writeConfig['Collection']).replace(\"{DOCUMENTDB_DATABASE}\",DOCUMENTDB_DATABASE).replace(\"{DOCUMENTDB_COLLECTION}\", DOCUMENTDB_COLLECTION)\n",
+ "# test validity of python string\n",
+ "exec(score_sparkml)\n",
"\n",
- " exec(score_sparkml)\n",
- "\n",
- " with open(\"score_sparkml.py\", \"w\") as file:\n",
- " file.write(score_sparkml)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Next, create a environment config file with the dependencies needed:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
- "source": [
- "%%writefile myenv_sparkml.yml\n",
- "\n",
- "name: myenv\n",
- "channels:\n",
- " - defaults\n",
- "dependencies:\n",
- " - pip:\n",
- " - numpy==1.14.2\n",
- " - scikit-learn==0.19.1\n",
- " - pandas\n",
- " - azureml-core\n",
- " - pydocumentdb"
+ "with open(\"score_sparkml.py\", \"w\") as file:\n",
+ " file.write(score_sparkml)"
]
},
{
@@ -707,16 +944,25 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Registering model mvl-als-reco.mml\n",
+ "mvl-als-reco.mml AML trained model 1\n"
+ ]
+ }
+ ],
"source": [
- "mymodel = Model.register(model_path = model_name, # this points to a local file\n",
- " model_name = model_name, # this is the name the model is registered as, am using same name for both path and name. \n",
- " description = \"ADB trained model\",\n",
- " workspace = ws)\n",
+ "mymodel = Model.register(\n",
+ " model_path=model_name, # this points to a local file\n",
+ " model_name=model_name, # this is the name the model is registered as\n",
+ " description=\"AML trained model\",\n",
+ " workspace=ws\n",
+ ")\n",
"\n",
"print(mymodel.name, mymodel.description, mymodel.version)"
]
@@ -732,68 +978,81 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "#### 3.3.1 Create a container for your model service:"
+ "#### 3.3.1 Create an Environment for your model:"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
+ "execution_count": 28,
+ "metadata": {},
"outputs": [],
"source": [
- "# Create Image for Web Service\n",
- "models = [mymodel]\n",
- "runtime = \"spark-py\"\n",
- "conda_file = 'myenv_sparkml.yml'\n",
- "driver_file = \"score_sparkml.py\"\n",
- "\n",
- "# image creation\n",
- "from azureml.core.image import ContainerImage\n",
- "myimage_config = ContainerImage.image_configuration(execution_script = driver_file, \n",
- " runtime = runtime, \n",
- " conda_file = conda_file)\n",
- "\n",
- "image = ContainerImage.create(name = container_image_name,\n",
- " # this is the model object\n",
- " models = [mymodel],\n",
- " image_config = myimage_config,\n",
- " workspace = ws)\n",
- "\n",
- "# Wait for the create process to complete\n",
- "image.wait_for_creation(show_output = True)"
+ "env = Environment(name='sparkmlenv')\n",
+ "\n",
+ "# Specify a public image from microsoft/mmlspark as base image\n",
+ "env.docker.base_image=\"microsoft/mmlspark:0.15\"\n",
+ "\n",
+ "pip = [\n",
+ " 'azureml-defaults', \n",
+ " 'numpy==1.14.2', \n",
+ " 'scikit-learn==0.19.1', \n",
+ " 'pandas', \n",
+ " 'pydocumentdb'\n",
+ "]\n",
+ "\n",
+ "# Add dependencies needed for inferencing\n",
+ "env.python.conda_dependencies = CondaDependencies.create(pip_packages=pip)\n",
+ "env.inferencing_stack_version = \"latest\"\n",
+ "\n",
+ "# Add spark packages\n",
+ "env.spark.precache_packages = True\n",
+ "env.spark.repositories = [\"https://mmlspark.azureedge.net/maven\"]\n",
+ "env.spark.packages= [\n",
+ " SparkPackage(\"com.microsoft.ml.spark\", \"mmlspark_2.11\", \"0.15\"),\n",
+ " SparkPackage(\"com.microsoft.azure\", artifact=\"azure-storage\", version=\"2.0.0\"),\n",
+ " SparkPackage(group=\"org.apache.hadoop\", artifact=\"hadoop-azure\", version=\"2.7.0\")\n",
+ "]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "#### 3.3.2 Create an AKS Cluster to run your container (this may take 20-25 minutes):"
+ "#### 3.3.2 Create an AKS Cluster to run your container\n",
+ "This may take 20 to 30 minutes depending on the cluster size."
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Creating..............................................................................................................................................................................\n",
+ "SucceededProvisioning operation finished, operation \"Succeeded\"\n",
+ "Succeeded\n"
+ ]
+ }
+ ],
"source": [
- "from azureml.core.compute import AksCompute, ComputeTarget\n",
- "\n",
- "# Use the default configuration (can also provide parameters to customize)\n",
- "prov_config = AksCompute.provisioning_configuration()\n",
- "\n",
- "# Create the cluster\n",
- "aks_target = ComputeTarget.create(workspace = ws, \n",
- " name = aks_name, \n",
- " provisioning_configuration = prov_config)\n",
- "\n",
- "aks_target.wait_for_completion(show_output = True)\n",
- "\n",
- "print(aks_target.provisioning_state)\n",
- "print(aks_target.provisioning_errors)"
+ "# Verify that cluster does not exist already\n",
+ "try:\n",
+ " aks_target = ComputeTarget(workspace=ws, name=aks_name)\n",
+ " print(\"Found existing cluster, use it.\")\n",
+ "except ComputeTargetException:\n",
+ " # Create the cluster using the default configuration (can also provide parameters to customize)\n",
+ " prov_config = AksCompute.provisioning_configuration()\n",
+ " aks_target = ComputeTarget.create(\n",
+ " workspace=ws, \n",
+ " name=aks_name, \n",
+ " provisioning_configuration=prov_config\n",
+ " )\n",
+ " aks_target.wait_for_completion(show_output = True)\n",
+ " print(aks_target.provisioning_state)\n",
+ " # To check any error logs, print(aks_target.provisioning_errors)"
]
},
{
@@ -805,27 +1064,43 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Running........................................................................................................................................\n",
+ "SucceededAKS service creation operation finished, operation \"Succeeded\"\n"
+ ]
+ }
+ ],
"source": [
- "#Set the web service configuration (using default here with app insights)\n",
+ "# Create an Inferencing Configuration with your environment and scoring script\n",
+ "inference_config = InferenceConfig(\n",
+ " environment=env,\n",
+ " entry_script=\"score_sparkml.py\"\n",
+ ")\n",
+ "\n",
+ "# Set the web service configuration (using default here with app insights)\n",
"aks_config = AksWebservice.deploy_configuration(enable_app_insights=True)\n",
"\n",
- "# Webservice creation using single command, there is a variant to use image directly as well.\n",
+ "# Webservice creation using single command\n",
"try:\n",
- " aks_service = Webservice.deploy_from_image(\n",
- " workspace=ws, \n",
- " name=service_name,\n",
- " deployment_config = aks_config,\n",
- " image = image,\n",
- " deployment_target = aks_target\n",
- " )\n",
+ " aks_service = Model.deploy(\n",
+ " workspace=ws,\n",
+ " models=[mymodel],\n",
+ " name=service_name,\n",
+ " inference_config=inference_config,\n",
+ " deployment_config=aks_config,\n",
+ " deployment_target=aks_target\n",
+ " )\n",
" aks_service.wait_for_deployment(show_output=True)\n",
- "except Exception:\n",
- " aks_service = Webservice.list(ws)[0]"
+ "except WebserviceException:\n",
+ " # Retrieve existing service.\n",
+ " aks_service = Webservice(ws, name=service_name)\n",
+ " print(\"Retrieved existing service\")"
]
},
{
@@ -839,12 +1114,40 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {
- "collapsed": true
- },
- "outputs": [],
+ "execution_count": 159,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{\n",
+ " \"MovieId\": [\n",
+ " 320,\n",
+ " 1589,\n",
+ " 767,\n",
+ " 1104,\n",
+ " 1167,\n",
+ " 1137,\n",
+ " 919,\n",
+ " 1344,\n",
+ " 1400,\n",
+ " 61\n",
+ " ],\n",
+ " \"id\": \"496\",\n",
+ " \"_rid\": \"BbQDALSshkBiAQAAAAAACA==\",\n",
+ " \"_self\": \"dbs/BbQDAA==/colls/BbQDALSshkA=/docs/BbQDALSshkBiAQAAAAAACA==/\",\n",
+ " \"_etag\": \"0400d3e4-0000-0100-0000-5dec0dc70000\",\n",
+ " \"_attachments\": \"attachments/\",\n",
+ " \"_ts\": 1575751111\n",
+ "}\n",
+ "Full run took 0.13 seconds\n"
+ ]
+ }
+ ],
"source": [
+ "import json\n",
+ "\n",
"scoring_url = aks_service.scoring_uri\n",
"service_key = aks_service.get_keys()[0]\n",
"\n",
@@ -854,22 +1157,34 @@
"req.add_header(\"Authorization\",\"Bearer {}\".format(service_key))\n",
"req.add_header(\"Content-Type\",\"application/json\")\n",
"\n",
- "tic = time.time()\n",
- "with urllib.request.urlopen(req) as result:\n",
- " res = result.readlines()\n",
- " print(res)\n",
+ "with Timer() as t: \n",
+ " with urllib.request.urlopen(req) as result:\n",
+ " res = result.read()\n",
+ " resj = json.loads(\n",
+ " # Cleanup to parse into a json object\n",
+ " res.decode(\"utf-8\")\n",
+ " .replace(\"\\\\\", \"\")\n",
+ " .replace('\"', \"\")\n",
+ " .replace(\"'\", '\"')\n",
+ " )\n",
+ " print(json.dumps(resj, indent=4))\n",
" \n",
- "toc = time.time()\n",
- "t2 = toc - tic\n",
- "print(\"Full run took %.2f seconds\" % (toc - tic))"
+ "print(\"Full run took %.2f seconds\" % t.interval)"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python [conda env:Anaconda3]",
+ "display_name": "reco_pyspark",
"language": "python",
- "name": "conda-env-Anaconda3-py"
+ "name": "reco_pyspark"
},
"language_info": {
"codemirror_mode": {
@@ -884,7 +1199,16 @@
"version": "3.6.8"
},
"name": "ALS_Movie_Example",
- "notebookId": 3793436040750096
+ "notebookId": 3793436040750096,
+ "pycharm": {
+ "stem_cell": {
+ "cell_type": "raw",
+ "metadata": {
+ "collapsed": false
+ },
+ "source": []
+ }
+ }
},
"nbformat": 4,
"nbformat_minor": 1
diff --git a/reco_utils/dataset/amazon_reviews.py b/reco_utils/dataset/amazon_reviews.py
new file mode 100644
index 0000000000..10a386dd65
--- /dev/null
+++ b/reco_utils/dataset/amazon_reviews.py
@@ -0,0 +1,423 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+import re
+import shutil
+import warnings
+import pandas as pd
+import gzip
+import random
+import _pickle as cPickle
+from reco_utils.dataset.download_utils import maybe_download, download_path
+
+
+def data_preprocessing(
+ reviews_file,
+ meta_file,
+ train_file,
+ valid_file,
+ test_file,
+ user_vocab,
+ item_vocab,
+ cate_vocab,
+ sample_rate=0.01,
+ valid_num_ngs=4,
+ test_num_ngs=9,
+):
+ """Create data for training, validation and testing from original dataset
+
+ Args:
+ reviews_file (str): Reviews dataset downloaded from former operations.
+ meta_file (str): Meta dataset downloaded from former operations.
+ """
+ reviews_output = _reviews_preprocessing(reviews_file)
+ meta_output = _meta_preprocessing(meta_file)
+ instance_output = _create_instance(reviews_output, meta_output)
+ _create_item2cate(instance_output)
+ sampled_instance_file = _get_sampled_data(instance_output, sample_rate=sample_rate)
+ preprocessed_output = _data_processing(sampled_instance_file)
+ _data_generating(preprocessed_output, train_file, valid_file, test_file)
+ _create_vocab(train_file, user_vocab, item_vocab, cate_vocab)
+ _negative_sampling_offline(
+ sampled_instance_file, valid_file, test_file, valid_num_ngs, test_num_ngs
+ )
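+
+ # A minimal usage sketch (hypothetical file names; any Amazon 5-core category works):
+ #     data_preprocessing(
+ #         "reviews_Movies_and_TV_5.json", "meta_Movies_and_TV.json",
+ #         "train_data", "valid_data", "test_data",
+ #         "user_vocab.pkl", "item_vocab.pkl", "category_vocab.pkl",
+ #         sample_rate=0.01, valid_num_ngs=4, test_num_ngs=9,
+ #     )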
+
+
+def _create_vocab(train_file, user_vocab, item_vocab, cate_vocab):
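+ """Build user, item and category vocabularies from the training file, mapping each id
+ to an integer index in descending order of frequency (index 0 of the item and category
+ vocabularies is reserved for "default_mid" / "default_cat"), and pickle the three
+ dictionaries to user_vocab, item_vocab and cate_vocab.
+ """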
+
+ f_train = open(train_file, "r")
+
+ user_dict = {}
+ item_dict = {}
+ cat_dict = {}
+
+ print("vocab generating...")
+ for line in f_train:
+ arr = line.strip("\n").split("\t")
+ uid = arr[1]
+ mid = arr[2]
+ cat = arr[3]
+ mid_list = arr[5]
+ cat_list = arr[6]
+
+ if uid not in user_dict:
+ user_dict[uid] = 0
+ user_dict[uid] += 1
+ if mid not in item_dict:
+ item_dict[mid] = 0
+ item_dict[mid] += 1
+ if cat not in cat_dict:
+ cat_dict[cat] = 0
+ cat_dict[cat] += 1
+ if len(mid_list) == 0:
+ continue
+ for m in mid_list.split(","):
+ if m not in item_dict:
+ item_dict[m] = 0
+ item_dict[m] += 1
+ for c in cat_list.split(","):
+ if c not in cat_dict:
+ cat_dict[c] = 0
+ cat_dict[c] += 1
+
+ sorted_user_dict = sorted(user_dict.items(), key=lambda x: x[1], reverse=True)
+ sorted_item_dict = sorted(item_dict.items(), key=lambda x: x[1], reverse=True)
+ sorted_cat_dict = sorted(cat_dict.items(), key=lambda x: x[1], reverse=True)
+
+ uid_voc = {}
+ index = 0
+ for key, value in sorted_user_dict:
+ uid_voc[key] = index
+ index += 1
+
+ mid_voc = {}
+ mid_voc["default_mid"] = 0
+ index = 1
+ for key, value in sorted_item_dict:
+ mid_voc[key] = index
+ index += 1
+
+ cat_voc = {}
+ cat_voc["default_cat"] = 0
+ index = 1
+ for key, value in sorted_cat_dict:
+ cat_voc[key] = index
+ index += 1
+
+ cPickle.dump(uid_voc, open(user_vocab, "wb"))
+ cPickle.dump(mid_voc, open(item_vocab, "wb"))
+ cPickle.dump(cat_voc, open(cate_vocab, "wb"))
+
+
+def _negative_sampling_offline(
+ instance_input_file, valid_file, test_file, valid_neg_nums=4, test_neg_nums=49
+):
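+ """Rewrite the validation and test files in place, appending after every positive line
+ `valid_neg_nums` (resp. `test_neg_nums`) negative lines whose items are sampled at
+ random, proportionally to item popularity in the sampled instance file.
+ """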
+
+ columns = ["label", "user_id", "item_id", "timestamp", "cate_id"]
+ ns_df = pd.read_csv(instance_input_file, sep="\t", names=columns)
+ items_with_popular = list(ns_df["item_id"])
+
+ global item2cate
+
+ # valid negative sampling
+ print("start valid negative sampling")
+ with open(valid_file, "r") as f:
+ valid_lines = f.readlines()
+ write_valid = open(valid_file, "w")
+ for line in valid_lines:
+ write_valid.write(line)
+ words = line.strip().split("\t")
+ positive_item = words[2]
+ count = 0
+ neg_items = set()
+ while count < valid_neg_nums:
+ neg_item = random.choice(items_with_popular)
+ if neg_item == positive_item or neg_item in neg_items:
+ continue
+ count += 1
+ neg_items.add(neg_item)
+ words[0] = "0"
+ words[2] = neg_item
+ words[3] = item2cate[neg_item]
+ write_valid.write("\t".join(words) + "\n")
+
+ # test negative sampling
+ print("start test negative sampling")
+ with open(test_file, "r") as f:
+ test_lines = f.readlines()
+ write_test = open(test_file, "w")
+ for line in test_lines:
+ write_test.write(line)
+ words = line.strip().split("\t")
+ positive_item = words[2]
+ count = 0
+ neg_items = set()
+ while count < test_neg_nums:
+ neg_item = random.choice(items_with_popular)
+ if neg_item == positive_item or neg_item in neg_items:
+ continue
+ count += 1
+ neg_items.add(neg_item)
+ words[0] = "0"
+ words[2] = neg_item
+ words[3] = item2cate[neg_item]
+ write_test.write("\t".join(words) + "\n")
+
+
+def _data_generating(input_file, train_file, valid_file, test_file, min_sequence=1):
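+ """Split the preprocessed instances into train, valid and test files, attaching to each
+ instance the user's preceding item, category and timestamp sequences (comma-separated).
+ Instances with fewer than `min_sequence` history clicks are skipped.
+ """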
+ f_input = open(input_file, "r")
+ f_train = open(train_file, "w")
+ f_valid = open(valid_file, "w")
+ f_test = open(test_file, "w")
+ print("data generating...")
+ last_user_id = None
+ for line in f_input:
+ line_split = line.strip().split("\t")
+ tfile = line_split[0]
+ label = int(line_split[1])
+ user_id = line_split[2]
+ movie_id = line_split[3]
+ date_time = line_split[4]
+ category = line_split[5]
+
+ if tfile == "train":
+ fo = f_train
+ elif tfile == "valid":
+ fo = f_valid
+ elif tfile == "test":
+ fo = f_test
+ if user_id != last_user_id:
+ movie_id_list = []
+ cate_list = []
+ dt_list = []
+ else:
+ history_clk_num = len(movie_id_list)
+ cat_str = ""
+ mid_str = ""
+ dt_str = ""
+ for c1 in cate_list:
+ cat_str += c1 + ","
+ for mid in movie_id_list:
+ mid_str += mid + ","
+ for dt_time in dt_list:
+ dt_str += dt_time + ","
+ if len(cat_str) > 0:
+ cat_str = cat_str[:-1]
+ if len(mid_str) > 0:
+ mid_str = mid_str[:-1]
+ if len(dt_str) > 0:
+ dt_str = dt_str[:-1]
+ if history_clk_num >= min_sequence:
+ fo.write(
+ line_split[1]
+ + "\t"
+ + user_id
+ + "\t"
+ + movie_id
+ + "\t"
+ + category
+ + "\t"
+ + date_time
+ + "\t"
+ + mid_str
+ + "\t"
+ + cat_str
+ + "\t"
+ + dt_str
+ + "\n"
+ )
+ last_user_id = user_id
+ if label:
+ movie_id_list.append(movie_id)
+ cate_list.append(category)
+ dt_list.append(date_time)
+
+
+def _create_item2cate(instance_file):
+ print("creating item2cate dict")
+ global item2cate
+ instance_df = pd.read_csv(
+ instance_file,
+ sep="\t",
+ names=["label", "user_id", "item_id", "timestamp", "cate_id"],
+ )
+ item2cate = instance_df.set_index("item_id")["cate_id"].to_dict()
+
+
+def _get_sampled_data(instance_file, sample_rate):
+ print("getting sampled data...")
+ global item2cate
+ output_file = instance_file + "_" + str(sample_rate)
+ columns = ["label", "user_id", "item_id", "timestamp", "cate_id"]
+ ns_df = pd.read_csv(instance_file, sep="\t", names=columns)
+ items_num = ns_df["item_id"].nunique()
+ items_with_popular = list(ns_df["item_id"])
+ items_sample, count = set(), 0
+ while count < int(items_num * sample_rate):
+ random_item = random.choice(items_with_popular)
+ if random_item not in items_sample:
+ items_sample.add(random_item)
+ count += 1
+ ns_df_sample = ns_df[ns_df["item_id"].isin(items_sample)]
+ ns_df_sample.to_csv(output_file, sep="\t", index=None, header=None)
+ return output_file
+
+
+def _meta_preprocessing(meta_readfile):
+ print("start meta preprocessing...")
+ meta_writefile = meta_readfile + "_output"
+ meta_r = open(meta_readfile, "r")
+ meta_w = open(meta_writefile, "w")
+ for line in meta_r:
+ line_new = eval(line)
+ meta_w.write(line_new["asin"] + "\t" + line_new["categories"][0][-1] + "\n")
+ meta_r.close()
+ meta_w.close()
+ return meta_writefile
+
+
+def _reviews_preprocessing(reviews_readfile):
+ print("start reviews preprocessing...")
+ reviews_writefile = reviews_readfile + "_output"
+ reviews_r = open(reviews_readfile, "r")
+ reviews_w = open(reviews_writefile, "w")
+ for line in reviews_r:
+ line_new = eval(line.strip())
+ reviews_w.write(
+ str(line_new["reviewerID"])
+ + "\t"
+ + str(line_new["asin"])
+ + "\t"
+ + str(line_new["unixReviewTime"])
+ + "\n"
+ )
+ reviews_r.close()
+ reviews_w.close()
+ return reviews_writefile
+
+
+def _create_instance(reviews_file, meta_file):
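+ """Join each review with its item's category ("default_cat" when the item has no
+ metadata) and write one positive instance per review, sorted by timestamp within each
+ user, in the format: label \t user \t item \t timestamp \t category.
+ """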
+ print("start create instances...")
+ dirs, _ = os.path.split(reviews_file)
+ output_file = os.path.join(dirs, "instance_output")
+
+ f_reviews = open(reviews_file, "r")
+ user_dict = {}
+ item_list = []
+ for line in f_reviews:
+ line = line.strip()
+ reviews_things = line.split("\t")
+ if reviews_things[0] not in user_dict:
+ user_dict[reviews_things[0]] = []
+ user_dict[reviews_things[0]].append((line, float(reviews_things[-1])))
+ item_list.append(reviews_things[1])
+
+ f_meta = open(meta_file, "r")
+ meta_dict = {}
+ for line in f_meta:
+ line = line.strip()
+ meta_things = line.split("\t")
+ if meta_things[0] not in meta_dict:
+ meta_dict[meta_things[0]] = meta_things[1]
+
+ f_output = open(output_file, "w")
+ for user_behavior in user_dict:
+ sorted_user_behavior = sorted(user_dict[user_behavior], key=lambda x: x[1])
+ for line, _ in sorted_user_behavior:
+ user_things = line.split("\t")
+ asin = user_things[1]
+ if asin in meta_dict:
+ f_output.write("1" + "\t" + line + "\t" + meta_dict[asin] + "\n")
+ else:
+ f_output.write("1" + "\t" + line + "\t" + "default_cat" + "\n")
+
+ f_reviews.close()
+ f_meta.close()
+ f_output.close()
+ return output_file
+
+
+def _data_processing(input_file):
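+ """Tag each user's instances with their data split: the last interaction of a user goes
+ to the test set, the second-to-last to the validation set, and the rest to training
+ (a per-user leave-two-out split).
+ """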
+ print("start data processing...")
+ dirs, _ = os.path.split(input_file)
+ output_file = os.path.join(dirs, "preprocessed_output")
+
+ f_input = open(input_file, "r")
+ f_output = open(output_file, "w")
+ user_count = {}
+ for line in f_input:
+ line = line.strip()
+ user = line.split("\t")[1]
+ if user not in user_count:
+ user_count[user] = 0
+ user_count[user] += 1
+ f_input.seek(0)
+ i = 0
+ last_user = None
+ for line in f_input:
+ line = line.strip()
+ user = line.split("\t")[1]
+ if user == last_user:
+ if i < user_count[user] - 2:
+ f_output.write("train" + "\t" + line + "\n")
+ elif i < user_count[user] - 1:
+ f_output.write("valid" + "\t" + line + "\n")
+ else:
+ f_output.write("test" + "\t" + line + "\n")
+ else:
+ last_user = user
+ i = 0
+ if i < user_count[user] - 2:
+ f_output.write("train" + "\t" + line + "\n")
+ elif i < user_count[user] - 1:
+ f_output.write("valid" + "\t" + line + "\n")
+ else:
+ f_output.write("test" + "\t" + line + "\n")
+ i += 1
+ return output_file
+
+
+def download_and_extract(name, dest_path):
+ """Downloads and extracts Amazon reviews and meta datafiles if they don’t already exist"""
+ dirs, _ = os.path.split(dest_path)
+ if not os.path.exists(dirs):
+ os.makedirs(dirs)
+
+ file_path = os.path.join(dirs, name)
+ if not os.path.exists(file_path):
+ _download_reviews(name, dest_path)
+ _extract_reviews(file_path, dest_path)
+
+ return file_path
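+
+ # Example usage (hypothetical paths; any Amazon reviews/meta category file name works):
+ #     reviews_path = download_and_extract("reviews_Movies_and_TV_5.json", "data/reviews_Movies_and_TV_5.json")
+ #     meta_path = download_and_extract("meta_Movies_and_TV.json", "data/meta_Movies_and_TV.json")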
+
+
+def _download_reviews(name, dest_path):
+ """Downloads Amazon reviews datafile.
+
+ Args:
+ dest_path (str): File path for the downloaded file
+ """
+
+ url = (
+ "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/"
+ + name
+ + ".gz"
+ )
+
+ dirs, file = os.path.split(dest_path)
+ maybe_download(url, file + ".gz", work_directory=dirs)
+
+
+def _extract_reviews(file_path, zip_path):
+ """Extract Amazon reviews and meta datafiles from the raw zip files.
+
+ To extract all files,
+ use ZipFile's extractall(path) instead.
+
+ Args:
+ file_path (str): Destination path for datafile
+ zip_path (str): zipfile path
+ """
+ with gzip.open(zip_path + ".gz", "rb") as zf, open(file_path, "wb") as f:
+ shutil.copyfileobj(zf, f)
diff --git a/reco_utils/recommender/deeprec/IO/sequential_iterator.py b/reco_utils/recommender/deeprec/IO/sequential_iterator.py
new file mode 100644
index 0000000000..bb03d5c30f
--- /dev/null
+++ b/reco_utils/recommender/deeprec/IO/sequential_iterator.py
@@ -0,0 +1,478 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+import numpy as np
+import json
+import pickle as pkl
+import random
+import os
+import time
+
+from reco_utils.recommender.deeprec.IO.iterator import BaseIterator
+from reco_utils.recommender.deeprec.deeprec_utils import load_dict
+
+__all__ = ["SequentialIterator"]
+
+
+class SequentialIterator(BaseIterator):
+ def __init__(self, hparams, graph, col_spliter="\t"):
+ """Initialize an iterator. Create necessary placeholders for the model.
+
+ Args:
+ hparams (obj): Global hyperparameters. The settings used here are user_vocab, item_vocab, cate_vocab, max_seq_length and batch_size.
+ graph (obj): The running graph. All created placeholders will be added to this graph.
+ col_spliter (str): Column delimiter within one line.
+ """
+ self.col_spliter = col_spliter
+ user_vocab, item_vocab, cate_vocab = (
+ hparams.user_vocab,
+ hparams.item_vocab,
+ hparams.cate_vocab,
+ )
+ self.userdict, self.itemdict, self.catedict = (
+ load_dict(user_vocab),
+ load_dict(item_vocab),
+ load_dict(cate_vocab),
+ )
+
+ self.max_seq_length = hparams.max_seq_length
+ self.batch_size = hparams.batch_size
+ self.iter_data = dict()
+
+ self.graph = graph
+ with self.graph.as_default():
+ self.labels = tf.placeholder(tf.float32, [None, 1], name="label")
+ self.users = tf.placeholder(tf.int32, [None], name="users")
+ self.items = tf.placeholder(tf.int32, [None], name="items")
+ self.cates = tf.placeholder(tf.int32, [None], name="cates")
+ self.item_history = tf.placeholder(
+ tf.int32, [None, self.max_seq_length], name="item_history"
+ )
+ self.item_cate_history = tf.placeholder(
+ tf.int32, [None, self.max_seq_length], name="item_cate_history"
+ )
+ self.mask = tf.placeholder(
+ tf.int32, [None, self.max_seq_length], name="mask"
+ )
+ self.time = tf.placeholder(tf.float32, [None], name="time")
+ self.time_diff = tf.placeholder(
+ tf.float32, [None, self.max_seq_length], name="time_diff"
+ )
+ self.time_from_first_action = tf.placeholder(
+ tf.float32, [None, self.max_seq_length], name="time_from_first_action"
+ )
+ self.time_to_now = tf.placeholder(
+ tf.float32, [None, self.max_seq_length], name="time_to_now"
+ )
+
+ def parse_file(self, input_file):
+ """Parse the file to a list ready to be used for downstream tasks
+
+ Args:
+ input_file: One of train, valid or test file which has never been parsed.
+
+ Returns:
+ list: A list with parsing result
+ """
+ with open(input_file, "r") as f:
+ lines = f.readlines()
+ res = []
+ for line in lines:
+ if not line:
+ continue
+ res.append(self.parser_one_line(line))
+ return res
+
+ def parser_one_line(self, line):
+ """Parse one string line into feature values.
+ a line was saved as the following format:
+ label \t user_hash \t item_hash \t item_cate \t operation_time \t item_history_sequence \t item_cate_history_sequence \t time_history_sequence
+
+ Args:
+ line (str): a string indicating one instance
+
+ Returns:
+ tuple: Parsed results: label, user_id, item_id, item_cate, item_history_sequence,
+ cate_history_sequence, current_time, time_diff, time_from_first_action, time_to_now.
+
+ """
+ words = line.strip().split(self.col_spliter)
+ label = int(words[0])
+ user_id = self.userdict[words[1]] if words[1] in self.userdict else 0
+ item_id = self.itemdict[words[2]] if words[2] in self.itemdict else 0
+ item_cate = self.catedict[words[3]] if words[3] in self.catedict else 0
+ current_time = float(words[4])
+
+ item_history_sequence = []
+ cate_history_sequence = []
+ time_history_sequence = []
+
+ item_history_words = words[5].strip().split(",")
+ for item in item_history_words:
+ item_history_sequence.append(
+ self.itemdict[item] if item in self.itemdict else 0
+ )
+
+ cate_history_words = words[6].strip().split(",")
+ for cate in cate_history_words:
+ cate_history_sequence.append(
+ self.catedict[cate] if cate in self.catedict else 0
+ )
+
+ time_history_words = words[7].strip().split(",")
+ time_history_sequence = [float(i) for i in time_history_words]
+
+ time_range = 3600 * 24
+
+ time_diff = []
+ for i in range(len(time_history_sequence) - 1):
+ diff = (
+ time_history_sequence[i + 1] - time_history_sequence[i]
+ ) / time_range
+ diff = max(diff, 0.5)
+ time_diff.append(diff)
+ last_diff = (current_time - time_history_sequence[-1]) / time_range
+ last_diff = max(last_diff, 0.5)
+ time_diff.append(last_diff)
+ time_diff = np.log(time_diff)
+
+ time_from_first_action = []
+ first_time = time_history_sequence[0]
+ time_from_first_action = [
+ (t - first_time) / time_range for t in time_history_sequence[1:]
+ ]
+ time_from_first_action = [max(t, 0.5) for t in time_from_first_action]
+ last_diff = (current_time - first_time) / time_range
+ last_diff = max(last_diff, 0.5)
+ time_from_first_action.append(last_diff)
+ time_from_first_action = np.log(time_from_first_action)
+
+ time_to_now = []
+ time_to_now = [(current_time - t) / time_range for t in time_history_sequence]
+ time_to_now = [max(t, 0.5) for t in time_to_now]
+ time_to_now = np.log(time_to_now)
+
+ return (
+ label,
+ user_id,
+ item_id,
+ item_cate,
+ item_history_sequence,
+ cate_history_sequence,
+ current_time,
+ time_diff,
+ time_from_first_action,
+ time_to_now,
+ )
+
+ def load_data_from_file(self, infile, batch_num_ngs=0, min_seq_length=1):
+ """Read and parse data from a file.
+
+ Args:
+ infile (str): Text input file. Each line in this file is an instance.
+ batch_num_ngs (int): The number of negative samples per positive instance in a batch.
+ 0 means that no negative sampling is done here.
+ min_seq_length (int): The minimum sequence length.
+ Sequences shorter than min_seq_length are ignored.
+
+ Returns:
+ obj: An iterator that yields parsed results in the format of a graph feed_dict.
+ """
+ label_list = []
+ user_list = []
+ item_list = []
+ item_cate_list = []
+ item_history_batch = []
+ item_cate_history_batch = []
+ time_list = []
+ time_diff_list = []
+ time_from_first_action_list = []
+ time_to_now_list = []
+
+ cnt = 0
+
+ if infile not in self.iter_data:
+ lines = self.parse_file(infile)
+ self.iter_data[infile] = lines
+ else:
+ lines = self.iter_data[infile]
+
+ if batch_num_ngs > 0:
+ random.shuffle(lines)
+
+ for line in lines:
+ if not line:
+ continue
+
+ (
+ label,
+ user_id,
+ item_id,
+ item_cate,
+ item_history_sequence,
+ item_cate_history_sequence,
+ current_time,
+ time_diff,
+ time_from_first_action,
+ time_to_now,
+ ) = line
+ if len(item_history_sequence) < min_seq_length:
+ continue
+
+ label_list.append(label)
+ user_list.append(user_id)
+ item_list.append(item_id)
+ item_cate_list.append(item_cate)
+ item_history_batch.append(item_history_sequence)
+ item_cate_history_batch.append(item_cate_history_sequence)
+ time_list.append(current_time)
+ time_diff_list.append(time_diff)
+ time_from_first_action_list.append(time_from_first_action)
+ time_to_now_list.append(time_to_now)
+
+ cnt += 1
+ if cnt == self.batch_size:
+ res = self._convert_data(
+ label_list,
+ user_list,
+ item_list,
+ item_cate_list,
+ item_history_batch,
+ item_cate_history_batch,
+ time_list,
+ time_diff_list,
+ time_from_first_action_list,
+ time_to_now_list,
+ batch_num_ngs,
+ )
+ batch_input = self.gen_feed_dict(res)
+ yield batch_input if batch_input else None
+ label_list = []
+ user_list = []
+ item_list = []
+ item_cate_list = []
+ item_history_batch = []
+ item_cate_history_batch = []
+ time_list = []
+ time_diff_list = []
+ time_from_first_action_list = []
+ time_to_now_list = []
+ cnt = 0
+ if cnt > 0:
+ res = self._convert_data(
+ label_list,
+ user_list,
+ item_list,
+ item_cate_list,
+ item_history_batch,
+ item_cate_history_batch,
+ time_list,
+ time_diff_list,
+ time_from_first_action_list,
+ time_to_now_list,
+ batch_num_ngs,
+ )
+ batch_input = self.gen_feed_dict(res)
+ yield batch_input if batch_input else None
+
+ def _convert_data(
+ self,
+ label_list,
+ user_list,
+ item_list,
+ item_cate_list,
+ item_history_batch,
+ item_cate_history_batch,
+ time_list,
+ time_diff_list,
+ time_from_first_action_list,
+ time_to_now_list,
+ batch_num_ngs,
+ ):
+ """Convert data into numpy arrays that are good for further model operation.
+
+ Args:
+ label_list (list): a list of ground-truth labels.
+ user_list (list): a list of user indexes.
+ item_list (list): a list of item indexes.
+ item_cate_list (list): a list of category indexes.
+ item_history_batch (list): a list of item history indexes.
+ item_cate_history_batch (list): a list of category history indexes.
+ time_list (list): a list of current timestamps.
+ time_diff_list (list): a list of time intervals between consecutive operations.
+ time_from_first_action_list (list): a list of times elapsed since the first operation.
+ time_to_now_list (list): a list of times elapsed up to the current timestamp.
+ batch_num_ngs (int): The number of negative sampling while training in mini-batch.
+
+ Returns:
+ dict: A dictionary, contains multiple numpy arrays that are convenient for further operation.
+ """
+ if batch_num_ngs:
+ instance_cnt = len(label_list)
+ if instance_cnt < 5:
+ return
+
+ label_list_all = []
+ item_list_all = []
+ item_cate_list_all = []
+ user_list_all = np.asarray(
+ [[user] * (batch_num_ngs + 1) for user in user_list], dtype=np.int32
+ ).flatten()
+ time_list_all = np.asarray(
+ [[t] * (batch_num_ngs + 1) for t in time_list], dtype=np.float32
+ ).flatten()
+
+ history_lengths = [len(item_history_batch[i]) for i in range(instance_cnt)]
+ max_seq_length_batch = self.max_seq_length
+ item_history_batch_all = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch)
+ ).astype("int32")
+ item_cate_history_batch_all = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch)
+ ).astype("int32")
+ time_diff_batch = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch)
+ ).astype("float32")
+ time_from_first_action_batch = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch)
+ ).astype("float32")
+ time_to_now_batch = np.zeros(
+ (instance_cnt * (batch_num_ngs + 1), max_seq_length_batch)
+ ).astype("float32")
+ mask = np.zeros(
+ (instance_cnt * (1 + batch_num_ngs), max_seq_length_batch)
+ ).astype("float32")
+
+ for i in range(instance_cnt):
+ this_length = min(history_lengths[i], max_seq_length_batch)
+ for index in range(batch_num_ngs + 1):
+ item_history_batch_all[
+ i * (batch_num_ngs + 1) + index, :this_length
+ ] = np.asarray(item_history_batch[i][-this_length:], dtype=np.int32)
+ item_cate_history_batch_all[
+ i * (batch_num_ngs + 1) + index, :this_length
+ ] = np.asarray(
+ item_cate_history_batch[i][-this_length:], dtype=np.int32
+ )
+ mask[i * (batch_num_ngs + 1) + index, :this_length] = 1.0
+ time_diff_batch[
+ i * (batch_num_ngs + 1) + index, :this_length
+ ] = np.asarray(time_diff_list[i][-this_length:], dtype=np.float32)
+ time_from_first_action_batch[
+ i * (batch_num_ngs + 1) + index, :this_length
+ ] = np.asarray(
+ time_from_first_action_list[i][-this_length:], dtype=np.float32
+ )
+ time_to_now_batch[
+ i * (batch_num_ngs + 1) + index, :this_length
+ ] = np.asarray(time_to_now_list[i][-this_length:], dtype=np.float32)
+
+ for i in range(instance_cnt):
+ positive_item = item_list[i]
+ label_list_all.append(1)
+ item_list_all.append(positive_item)
+ item_cate_list_all.append(item_cate_list[i])
+ count = 0
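+ # Negative sampling: draw items from other instances in this batch, skipping any
+ # draw that collides with the positive item, until batch_num_ngs negatives are collected.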
+ while batch_num_ngs:
+ random_value = random.randint(0, instance_cnt - 1)
+ negative_item = item_list[random_value]
+ if negative_item == positive_item:
+ continue
+ label_list_all.append(0)
+ item_list_all.append(negative_item)
+ item_cate_list_all.append(item_cate_list[random_value])
+ count += 1
+ if count == batch_num_ngs:
+ break
+
+ res = {}
+ res["labels"] = np.asarray(label_list_all, dtype=np.float32).reshape(-1, 1)
+ res["users"] = user_list_all
+ res["items"] = np.asarray(item_list_all, dtype=np.int32)
+ res["cates"] = np.asarray(item_cate_list_all, dtype=np.int32)
+ res["item_history"] = item_history_batch_all
+ res["item_cate_history"] = item_cate_history_batch_all
+ res["mask"] = mask
+ res["time"] = time_list_all
+ res["time_diff"] = time_diff_batch
+ res["time_from_first_action"] = time_from_first_action_batch
+ res["time_to_now"] = time_to_now_batch
+ return res
+
+ else:
+ instance_cnt = len(label_list)
+ history_lengths = [len(item_history_batch[i]) for i in range(instance_cnt)]
+ max_seq_length_batch = self.max_seq_length
+ item_history_batch_all = np.zeros(
+ (instance_cnt, max_seq_length_batch)
+ ).astype("int32")
+ item_cate_history_batch_all = np.zeros(
+ (instance_cnt, max_seq_length_batch)
+ ).astype("int32")
+ time_diff_batch = np.zeros((instance_cnt, max_seq_length_batch)).astype(
+ "float32"
+ )
+ time_from_first_action_batch = np.zeros(
+ (instance_cnt, max_seq_length_batch)
+ ).astype("float32")
+ time_to_now_batch = np.zeros((instance_cnt, max_seq_length_batch)).astype(
+ "float32"
+ )
+ mask = np.zeros((instance_cnt, max_seq_length_batch)).astype("float32")
+
+ for i in range(instance_cnt):
+ this_length = min(history_lengths[i], max_seq_length_batch)
+ item_history_batch_all[i, :this_length] = item_history_batch[i][
+ -this_length:
+ ]
+ item_cate_history_batch_all[i, :this_length] = item_cate_history_batch[
+ i
+ ][-this_length:]
+ mask[i, :this_length] = 1.0
+ time_diff_batch[i, :this_length] = time_diff_list[i][-this_length:]
+ time_from_first_action_batch[
+ i, :this_length
+ ] = time_from_first_action_list[i][-this_length:]
+ time_to_now_batch[i, :this_length] = time_to_now_list[i][-this_length:]
+
+ res = {}
+ res["labels"] = np.asarray(label_list, dtype=np.float32).reshape(-1, 1)
+ res["users"] = np.asarray(user_list, dtype=np.float32)
+ res["items"] = np.asarray(item_list, dtype=np.int32)
+ res["cates"] = np.asarray(item_cate_list, dtype=np.int32)
+ res["item_history"] = item_history_batch_all
+ res["item_cate_history"] = item_cate_history_batch_all
+ res["mask"] = mask
+ res["time"] = np.asarray(time_list, dtype=np.float32)
+ res["time_diff"] = time_diff_batch
+ res["time_from_first_action"] = time_from_first_action_batch
+ res["time_to_now"] = time_to_now_batch
+ return res
+
+ def gen_feed_dict(self, data_dict):
+ """Construct a dictionary that maps graph elements to values.
+
+ Args:
+ data_dict (dict): a dictionary that maps string name to numpy arrays.
+
+ Returns:
+ dict: a dictionary that maps graph elements to numpy arrays.
+
+ """
+ if not data_dict:
+ return dict()
+ feed_dict = {
+ self.labels: data_dict["labels"],
+ self.users: data_dict["users"],
+ self.items: data_dict["items"],
+ self.cates: data_dict["cates"],
+ self.item_history: data_dict["item_history"],
+ self.item_cate_history: data_dict["item_cate_history"],
+ self.mask: data_dict["mask"],
+ self.time: data_dict["time"],
+ self.time_diff: data_dict["time_diff"],
+ self.time_from_first_action: data_dict["time_from_first_action"],
+ self.time_to_now: data_dict["time_to_now"],
+ }
+ return feed_dict
diff --git a/reco_utils/recommender/deeprec/config/asvd.yaml b/reco_utils/recommender/deeprec/config/asvd.yaml
new file mode 100644
index 0000000000..b07b2048ef
--- /dev/null
+++ b/reco_utils/recommender/deeprec/config/asvd.yaml
@@ -0,0 +1,53 @@
+#data
+#data format:sequential model
+data:
+ user_vocab : ./tests/resources/deeprec/slirec/user_vocab.pkl # the map file of user to id
+ item_vocab : ./tests/resources/deeprec/slirec/item_vocab.pkl # the map file of item to id
+ cate_vocab : ./tests/resources/deeprec/slirec/category_vocab.pkl # the map file of category to id
+
+#model
+model:
+ method : classification # classification or regression
+ model_type : ASVD
+ layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes.
+ activation : [relu, relu] # activation function for DNN
+ user_dropout: True
+ dropout : [0.3, 0.3] #drop out values for DNN layer
+ item_embedding_dim : 32
+ cate_embedding_dim : 8
+ user_embedding_dim : 16
+
+#train
+#init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform
+train:
+ init_method: tnormal # method for initializing model parameters
+ init_value : 0.01 # stddev values for initializing model parameters
+ embed_l2 : 0.0001 # l2 regularization for embedding parameters
+ embed_l1 : 0.0000 # l1 regularization for embedding parameters
+ layer_l2 : 0.0001 # l2 regularization for hidden layer parameters
+ layer_l1 : 0.0000 # l1 regularization for hidden layer parameters
+ cross_l2 : 0.0000 # l2 regularization for cross layer parameters
+ cross_l1 : 0.000 # l1 regularization for cross layer parameters
+ learning_rate : 0.001
+ loss : softmax # softmax, log_loss, cross_entropy_loss, or square_loss
+ optimizer : lazyadam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam
+ epochs : 50 # number of epochs for training
+ batch_size : 400 # batch size
+ enable_BN : True # whether to use batch normalization in hidden layers
+ EARLY_STOP : 10 # early stopping patience (number of epochs)
+ max_seq_length : 50 # the maximum number of records in the history sequence
+
+#show info
+#metric :'auc','logloss', 'group_auc'
+info:
+ show_step : 100 # print training information every show_step mini-batches
+ save_model: True # whether to save models
+ save_epoch : 1 # if save_model is set to True, save the model every save_epoch epochs
+ metrics : ['auc','logloss'] #metrics for evaluation.
+ pairwise_metrics : ['mean_mrr', 'ndcg', "group_auc"]
+ MODEL_DIR : ./tests/resources/deeprec/asvd/model/asvd_model/
+ SUMMARIES_DIR : ./tests/resources/deeprec/asvd/summary/asvd_summary/
+ write_tfevents : True
+
+
+
diff --git a/reco_utils/recommender/deeprec/config/caser.yaml b/reco_utils/recommender/deeprec/config/caser.yaml
new file mode 100644
index 0000000000..6b67a322bd
--- /dev/null
+++ b/reco_utils/recommender/deeprec/config/caser.yaml
@@ -0,0 +1,58 @@
+#data
+#data format:sequential model
+data:
+ user_vocab : ./tests/resources/deeprec/slirec/user_vocab.pkl # the map file of user to id
+ item_vocab : ./tests/resources/deeprec/slirec/item_vocab.pkl # the map file of item to id
+ cate_vocab : ./tests/resources/deeprec/slirec/category_vocab.pkl # the map file of category to id
+
+#model
+model:
+ method : classification # classification or regression
+ model_type : Caser
+ layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes.
+ activation : [relu, relu] # activation function for DNN
+ user_dropout: True
+ dropout : [0.3, 0.3] #drop out values for DNN layer
+ item_embedding_dim : 32
+ cate_embedding_dim : 8
+ user_embedding_dim : 16
+
+#train
+#init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform
+train:
+ init_method: tnormal # method for initializing model parameters
+ init_value : 0.01 # stddev values for initializing model parameters
+ embed_l2 : 0.0001 # l2 regularization for embedding parameters
+ embed_l1 : 0.0000 # l1 regularization for embedding parameters
+ layer_l2 : 0.0001 # l2 regularization for hidden layer parameters
+ layer_l1 : 0.0000 # l1 regularization for hidden layer parameters
+ cross_l2 : 0.0000 # l2 regularization for cross layer parameters
+ cross_l1 : 0.000 # l1 regularization for cross layer parameters
+ learning_rate : 0.001
+ loss : softmax # softmax, log_loss, cross_entropy_loss, or square_loss
+ optimizer : lazyadam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam
+ epochs : 50 # number of epochs for training
+ batch_size : 400 # batch size
+ enable_BN : True # whether to use batch normalization in hidden layers
+ EARLY_STOP : 10 # early stopping patience (number of epochs)
+ max_seq_length : 50 # the maximum number of records in the history sequence
+ T : 1 # number of target items to predict
+ L : 3 # maximum height of the horizontal convolution filters (heights 1..L are used)
+ n_v : 128 # number of vertical convolution filters
+ n_h : 128 # number of horizontal convolution filters
+ min_seq_length : 5 # the minimum number of records in the history sequence
+
+#show info
+#metric :'auc','logloss', 'group_auc'
+info:
+ show_step : 100 # print training information every show_step mini-batches
+ save_model: True # whether to save models
+ save_epoch : 1 # if save_model is set to True, save the model every save_epoch epochs
+ metrics : ['auc','logloss'] #metrics for evaluation.
+ pairwise_metrics : ['mean_mrr', 'ndcg', "group_auc"]
+ MODEL_DIR : ./tests/resources/deeprec/caser/model/caser_model/
+ SUMMARIES_DIR : ./tests/resources/deeprec/caser/summary/caser_summary/
+ write_tfevents : True
+
+
+
diff --git a/reco_utils/recommender/deeprec/config/gru4rec.yaml b/reco_utils/recommender/deeprec/config/gru4rec.yaml
new file mode 100644
index 0000000000..0c1f5fe377
--- /dev/null
+++ b/reco_utils/recommender/deeprec/config/gru4rec.yaml
@@ -0,0 +1,54 @@
+#data
+#data format:sequential model
+data:
+ user_vocab : ./tests/resources/deeprec/slirec/user_vocab.pkl # the map file of user to id
+ item_vocab : ./tests/resources/deeprec/slirec/item_vocab.pkl # the map file of item to id
+ cate_vocab : ./tests/resources/deeprec/slirec/category_vocab.pkl # the map file of category to id
+
+#model
+model:
+ method : classification # classification or regression
+ model_type : GRU4Rec
+ layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes.
+ activation : [relu, relu] # activation function for DNN
+ user_dropout: True
+ dropout : [0.3, 0.3] #drop out values for DNN layer
+ item_embedding_dim : 32
+ cate_embedding_dim : 8
+ user_embedding_dim : 16
+
+#train
+#init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform
+train:
+ init_method: tnormal # method for initializing model parameters
+ init_value : 0.01 # stddev values for initializing model parameters
+ embed_l2 : 0.0001 # l2 regularization for embedding parameters
+ embed_l1 : 0.0000 # l1 regularization for embedding parameters
+ layer_l2 : 0.0001 # l2 regularization for hidden layer parameters
+ layer_l1 : 0.0000 # l1 regularization for hidden layer parameters
+ cross_l2 : 0.0000 # l2 regularization for cross layer parameters
+ cross_l1 : 0.000 # l1 regularization for cross layer parameters
+ learning_rate : 0.001
+ loss : softmax # softmax, log_loss, cross_entropy_loss, or square_loss
+ optimizer : lazyadam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam
+ epochs : 50 # number of epochs for training
+ batch_size : 400 # batch size
+ enable_BN : True # whether to use batch normalization in hidden layers
+ EARLY_STOP : 10 # early stopping patience (number of epochs)
+ max_seq_length : 50 # the maximum number of records in the history sequence
+ hidden_size : 40 # the number of hidden units in the RNN
+
+#show info
+#metric :'auc','logloss', 'group_auc'
+info:
+ show_step : 100 # print training information every show_step mini-batches
+ save_model: True # whether to save models
+ save_epoch : 1 # if save_model is set to True, save the model every save_epoch epochs
+ metrics : ['auc','logloss'] #metrics for evaluation.
+ pairwise_metrics : ['mean_mrr', 'ndcg', "group_auc"]
+ MODEL_DIR : ./tests/resources/deeprec/gru4rec/model/gru4rec_model/
+ SUMMARIES_DIR : ./tests/resources/deeprec/gru4rec/summary/gru4rec_summary/
+ write_tfevents : True
+
+
+
diff --git a/reco_utils/recommender/deeprec/config/sli_rec.yaml b/reco_utils/recommender/deeprec/config/sli_rec.yaml
new file mode 100644
index 0000000000..edc5555dd2
--- /dev/null
+++ b/reco_utils/recommender/deeprec/config/sli_rec.yaml
@@ -0,0 +1,56 @@
+#data
+#data format:sequential model
+data:
+ user_vocab : ./tests/resources/deeprec/slirec/user_vocab.pkl # the map file of user to id
+ item_vocab : ./tests/resources/deeprec/slirec/item_vocab.pkl # the map file of item to id
+ cate_vocab : ./tests/resources/deeprec/slirec/category_vocab.pkl # the map file of category to id
+
+#model
+model:
+ method : classification # classification or regression
+ model_type : sli_rec
+ layer_sizes : [100, 64] # layers' size of DNN. In this example, DNN has two layers, and each layer has 100 hidden nodes.
+ att_fcn_layer_sizes : [80, 40]
+ activation : [relu, relu] # activation function for DNN
+ user_dropout: True
+ dropout : [0.3, 0.3] #drop out values for DNN layer
+ item_embedding_dim : 32
+ cate_embedding_dim : 8
+ user_embedding_dim : 16
+
+#train
+#init_method: normal,tnormal,uniform,he_normal,he_uniform,xavier_normal,xavier_uniform
+train:
+ init_method: tnormal # method for initializing model parameters
+ init_value : 0.01 # stddev values for initializing model parameters
+ embed_l2 : 0.0001 # l2 regularization for embedding parameters
+ embed_l1 : 0.0000 # l1 regularization for embedding parameters
+ layer_l2 : 0.0001 # l2 regularization for hidden layer parameters
+ layer_l1 : 0.0000 # l1 regularization for hidden layer parameters
+ cross_l2 : 0.0000 # l2 regularization for cross layer parameters
+ cross_l1 : 0.000 # l1 regularization for cross layer parameters
+ learning_rate : 0.001
+ loss : softmax # softmax, log_loss, cross_entropy_loss, or square_loss
+ optimizer : lazyadam # adam, adadelta, sgd, ftrl, gd, padagrad, pgd, rmsprop, lazyadam
+ epochs : 50 # number of epochs for training
+ batch_size : 400 # batch size
+ enable_BN : True # whether to use batch normalization in hidden layers
+ EARLY_STOP : 10 # early stopping patience (number of epochs)
+ max_seq_length : 50 # the maximum number of records in the history sequence
+ hidden_size : 40 # the number of hidden units in the RNN
+ attention_size : 40 # the dimension of the attention layer
+
+#show info
+#metric :'auc','logloss', 'group_auc'
+info:
+ show_step : 100 # print training information every show_step mini-batches
+ save_model: True # whether to save models
+ save_epoch : 1 # if save_model is set to True, save the model every save_epoch epochs
+ metrics : ['auc','logloss'] #metrics for evaluation.
+ pairwise_metrics : ['mean_mrr', 'ndcg', "group_auc"]
+ MODEL_DIR : ./tests/resources/deeprec/slirec/model/slirec_model/
+ SUMMARIES_DIR : ./tests/resources/deeprec/slirec/summary/slirec_summary/
+ write_tfevents : True
+
+
+
diff --git a/reco_utils/recommender/deeprec/deeprec_utils.py b/reco_utils/recommender/deeprec/deeprec_utils.py
index a99cdcb0d2..bd0e5d4991 100644
--- a/reco_utils/recommender/deeprec/deeprec_utils.py
+++ b/reco_utils/recommender/deeprec/deeprec_utils.py
@@ -16,6 +16,8 @@
import yaml
import zipfile
from reco_utils.dataset.download_utils import maybe_download
+import json
+import pickle as pkl
def flat_config(config):
@@ -63,6 +65,22 @@ def check_type(config):
"n_item",
"n_user_attr",
"n_item_attr",
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "user_embedding_dim",
+ "max_seq_length",
+ "hidden_size",
+ "T",
+ "L",
+ "n_v",
+ "n_h",
+ "min_seq_length",
+ "attention_size",
+ "epochs",
+ "batch_size",
+ "show_step",
+ "save_epoch",
+ "train_num_ngs",
]
for param in int_parameters:
if param in config and not isinstance(config[param], int):
@@ -93,12 +111,15 @@ def check_type(config):
"optimizer",
"init_method",
"attention_activation",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
]
for param in str_parameters:
if param in config and not isinstance(config[param], str):
raise TypeError("Parameters {0} must be str".format(param))
- list_parameters = ["layer_sizes", "activation", "dropout"]
+ list_parameters = ["layer_sizes", "activation", "dropout", "att_fcn_layer_sizes"]
for param in list_parameters:
if param in config and not isinstance(config[param], list):
raise TypeError("Parameters {0} must be list".format(param))
@@ -151,18 +172,62 @@ def check_nn_config(f_config):
"data_format",
"dropout",
]
- else:
+ if f_config["model_type"] in ["gru4rec", "GRU4REC", "GRU4Rec"]:
required_parameters = [
- "FIELD_COUNT",
- "FEATURE_COUNT",
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "max_seq_length",
+ "loss",
"method",
- "dim",
- "layer_sizes",
- "activation",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
+ "hidden_size",
+ ]
+ elif f_config["model_type"] in ["caser", "CASER", "Caser"]:
+ required_parameters = [
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "user_embedding_dim",
+ "max_seq_length",
"loss",
- "data_format",
- "dropout",
+ "method",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
+ "T",
+ "L",
+ "n_v",
+ "n_h",
+ "min_seq_length",
]
+ elif f_config["model_type"] in ["asvd", "ASVD"]:
+ required_parameters = [
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "max_seq_length",
+ "loss",
+ "method",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
+ ]
+ elif f_config["model_type"] in ["slirec", "sli_rec", "SLI_REC", "Sli_rec"]:
+ required_parameters = [
+ "item_embedding_dim",
+ "cate_embedding_dim",
+ "max_seq_length",
+ "loss",
+ "method",
+ "user_vocab",
+ "item_vocab",
+ "cate_vocab",
+ "attention_size",
+ "hidden_size",
+ "att_fcn_layer_sizes",
+ ]
+ else:
+ required_parameters = []
# check required parameters
for param in required_parameters:
@@ -183,13 +248,6 @@ def check_nn_config(f_config):
f_config["data_format"]
)
)
- else:
- if f_config["data_format"] not in ["fm"]:
- raise ValueError(
- "The default data format should be fm, but your set is {0}".format(
- f_config["data_format"]
- )
- )
check_type(f_config)
@@ -206,9 +264,9 @@ def load_yaml(filename):
with open(filename, "r") as f:
config = yaml.load(f, yaml.SafeLoader)
return config
- except FileNotFoundError: # for file not found
+ except FileNotFoundError: # for file not found
raise
- except Exception as e: # for other exceptions
+ except Exception as e: # for other exceptions
raise IOError("load {0} error!".format(filename))
@@ -319,6 +377,42 @@ def create_hparams(flags):
save_epoch=flags["save_epoch"] if "save_epoch" in flags else 5,
metrics=flags["metrics"] if "metrics" in flags else None,
write_tfevents=flags["write_tfevents"] if "write_tfevents" in flags else False,
+ # sequential
+ item_embedding_dim=flags["item_embedding_dim"]
+ if "item_embedding_dim" in flags
+ else None,
+ cate_embedding_dim=flags["cate_embedding_dim"]
+ if "cate_embedding_dim" in flags
+ else None,
+ user_embedding_dim=flags["user_embedding_dim"]
+ if "user_embedding_dim" in flags
+ else None,
+ train_num_ngs=flags["train_num_ngs"] if "train_num_ngs" in flags else 4,
+ need_sample=flags["need_sample"] if "need_sample" in flags else True,
+ embedding_dropout=flags["embedding_dropout"]
+ if "embedding_dropout" in flags
+ else 0.3,
+ user_vocab=flags["user_vocab"] if "user_vocab" in flags else None,
+ item_vocab=flags["item_vocab"] if "item_vocab" in flags else None,
+ cate_vocab=flags["cate_vocab"] if "cate_vocab" in flags else None,
+ pairwise_metrics=flags["pairwise_metrics"]
+ if "pairwise_metrics" in flags
+ else None,
+ EARLY_STOP=flags["EARLY_STOP"] if "EARLY_STOP" in flags else 100,
+ # gru4rec
+ max_seq_length=flags["max_seq_length"] if "max_seq_length" in flags else None,
+ hidden_size=flags["hidden_size"] if "hidden_size" in flags else None,
+ # caser,
+ L=flags["L"] if "L" in flags else None,
+ T=flags["T"] if "T" in flags else None,
+ n_v=flags["n_v"] if "n_v" in flags else None,
+ n_h=flags["n_h"] if "n_h" in flags else None,
+ min_seq_length=flags["min_seq_length"] if "min_seq_length" in flags else 1,
+ # sli_rec
+ attention_size=flags["attention_size"] if "attention_size" in flags else None,
+ att_fcn_layer_sizes=flags["att_fcn_layer_sizes"]
+ if "att_fcn_layer_sizes" in flags
+ else None,
)
@@ -362,6 +456,55 @@ def download_deeprec_resources(azure_container_url, data_path, remote_resource_n
os.remove(os.path.join(data_path, remote_resource_name))
+def mrr_score(y_true, y_score):
+ """Computing mrr score metric.
+
+ Args:
+ y_true (numpy.ndarray): ground-truth labels.
+ y_score (numpy.ndarray): predicted scores.
+
+ Returns:
+ numpy.ndarray: mrr scores.
+ """
+ order = np.argsort(y_score, axis=1)[:, ::-1]
+ y_true = np.take(y_true, order)
+ rr_score = y_true / (np.arange(np.shape(y_true)[1]) + 1)
+ return np.sum(rr_score, axis=1) / np.sum(y_true, axis=1)
+
+
+def ndcg_score(y_true, y_score, k=10):
+ """Computing ndcg score metric at k.
+
+ Args:
+ y_true (numpy.ndarray): ground-truth labels.
+ y_score (numpy.ndarray): predicted scores.
+ k (int): cut-off rank of the metric.
+
+ Returns:
+ numpy.ndarray: ndcg scores.
+ """
+ best = dcg_score(y_true, y_true, k)
+ actual = dcg_score(y_true, y_score, k)
+ return actual / best
+
+
+def dcg_score(y_true, y_score, k=10):
+ """Computing dcg score metric at k.
+
+ Args:
+ y_true (numpy.ndarray): ground-truth labels.
+ y_score (numpy.ndarray): predicted scores.
+ k (int): cut-off rank of the metric.
+
+ Returns:
+ numpy.ndarray: dcg scores.
+ """
+ k = min(np.shape(y_true)[1], k)
+ order = np.argsort(y_score, axis=1)[:, ::-1]
+ y_true = np.take(y_true, order[:, :k])
+ gains = 2 ** y_true - 1
+ discounts = np.log2(np.arange(np.shape(y_true)[1]) + 2)
+ return np.sum(gains / discounts, axis=1)
+
+
def cal_metric(labels, preds, metrics):
"""Calculate metrics,such as auc, logloss.
@@ -393,6 +536,36 @@ def cal_metric(labels, preds, metrics):
pred[pred < 0.5] = 0
f1 = f1_score(np.asarray(labels), pred)
res["f1"] = round(f1, 4)
+ elif metric == "mean_mrr":
+ mean_mrr = np.mean(mrr_score(labels, preds))
+ res["mean_mrr"] = round(mean_mrr, 4)
+ elif metric == "ndcg":
+ ndcg_list = [2, 4, 6, 8, 10]
+ for r in ndcg_list:
+ ndcg_temp = np.mean(ndcg_score(labels, preds, r))
+ res["ndcg{0}".format(r)] = round(ndcg_temp, 4)
+ elif metric == "group_auc":
+ group_auc = np.mean(
+ [
+ roc_auc_score(each_labels, each_preds)
+ for each_labels, each_preds in zip(labels, preds)
+ ]
+ )
+ res["group_auc"] = round(group_auc, 4)
else:
raise ValueError("not define this metric {0}".format(metric))
return res
+
+
+def load_dict(filename):
+ """Load the vocabularies.
+
+ Args:
+ filename (str): Filename of user, item or category vocabulary.
+
+ Returns:
+ dict: A saved vocabulary.
+ """
+ with open(filename, "rb") as f:
+ f_pkl = pkl.load(f)
+ return f_pkl
diff --git a/reco_utils/recommender/deeprec/models/base_model.py b/reco_utils/recommender/deeprec/models/base_model.py
index fc7e924892..708c29ed9f 100644
--- a/reco_utils/recommender/deeprec/models/base_model.py
+++ b/reco_utils/recommender/deeprec/models/base_model.py
@@ -1,11 +1,12 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
+from os.path import join
import abc
import time
import numpy as np
import tensorflow as tf
-
+from tensorflow import keras
from reco_utils.recommender.deeprec.deeprec_utils import cal_metric
@@ -29,6 +30,9 @@ def __init__(self, hparams, iterator_creator, graph=None, seed=None):
self.graph = graph if graph is not None else tf.Graph()
self.iterator = iterator_creator(hparams, self.graph)
+ self.train_num_ngs = (
+ hparams.train_num_ngs if "train_num_ngs" in hparams else None
+ )
with self.graph.as_default():
self.hparams = hparams
@@ -40,6 +44,7 @@ def __init__(self, hparams, iterator_creator, graph=None, seed=None):
self.keep_prob_train = None
self.keep_prob_test = None
self.is_train_stage = tf.placeholder(tf.bool, shape=(), name="is_training")
+ self.group = tf.placeholder(tf.int32, shape=(), name="group")
self.initializer = self._get_initializer()
@@ -49,6 +54,7 @@ def __init__(self, hparams, iterator_creator, graph=None, seed=None):
self.loss = self._get_loss()
self.saver = tf.train.Saver(max_to_keep=self.hparams.epochs)
self.update = self._build_train_opt()
+ self.extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
self.init_op = tf.global_variables_initializer()
self.merged = self._add_summaries()
@@ -149,13 +155,17 @@ def _cross_l_loss(self):
def _get_initializer(self):
if self.hparams.init_method == "tnormal":
- return tf.truncated_normal_initializer(stddev=self.hparams.init_value, seed=self.seed)
+ return tf.truncated_normal_initializer(
+ stddev=self.hparams.init_value, seed=self.seed
+ )
elif self.hparams.init_method == "uniform":
return tf.random_uniform_initializer(
-self.hparams.init_value, self.hparams.init_value, seed=self.seed
)
elif self.hparams.init_method == "normal":
- return tf.random_normal_initializer(stddev=self.hparams.init_value, seed=self.seed)
+ return tf.random_normal_initializer(
+ stddev=self.hparams.init_value, seed=self.seed
+ )
elif self.hparams.init_method == "xavier_normal":
return tf.contrib.layers.xavier_initializer(uniform=False, seed=self.seed)
elif self.hparams.init_method == "xavier_uniform":
@@ -169,7 +179,9 @@ def _get_initializer(self):
factor=2.0, mode="FAN_IN", uniform=True, seed=self.seed
)
else:
- return tf.truncated_normal_initializer(stddev=self.hparams.init_value, seed=self.seed)
+ return tf.truncated_normal_initializer(
+ stddev=self.hparams.init_value, seed=self.seed
+ )
def _compute_data_loss(self):
if self.hparams.loss == "cross_entropy_loss":
@@ -195,6 +207,15 @@ def _compute_data_loss(self):
labels=tf.reshape(self.iterator.labels, [-1]),
)
)
+ elif self.hparams.loss == "softmax":
+ group = self.train_num_ngs + 1
+ logits = tf.reshape(self.logit, (-1, group))
+ labels = tf.reshape(self.iterator.labels, (-1, group))
+ softmax_pred = tf.nn.softmax(logits, axis=-1)
+ boolean_mask = tf.equal(labels, tf.ones_like(labels))
+ mask_paddings = tf.ones_like(softmax_pred)
+ pos_softmax = tf.where(boolean_mask, softmax_pred, mask_paddings)
+ data_loss = -group * tf.reduce_mean(tf.math.log(pos_softmax))
else:
raise ValueError("this loss not defined {0}".format(self.hparams.loss))
return data_loss
@@ -229,11 +250,13 @@ def _train_opt(self):
elif optimizer == "gd":
train_step = tf.train.GradientDescentOptimizer(lr)
elif optimizer == "padagrad":
- train_step = tf.train.ProximalAdagradOptimizer(lr) # .minimize(self.loss)
+ train_step = tf.train.ProximalAdagradOptimizer(lr)
elif optimizer == "pgd":
train_step = tf.train.ProximalGradientDescentOptimizer(lr)
elif optimizer == "rmsprop":
train_step = tf.train.RMSPropOptimizer(lr)
+ elif optimizer == "lazyadam":
+ train_step = tf.contrib.opt.LazyAdamOptimizer(lr)
else:
train_step = tf.train.GradientDescentOptimizer(lr)
return train_step
@@ -311,7 +334,14 @@ def train(self, sess, feed_dict):
feed_dict[self.layer_keeps] = self.keep_prob_train
feed_dict[self.is_train_stage] = True
return sess.run(
- [self.update, self.loss, self.data_loss, self.merged], feed_dict=feed_dict
+ [
+ self.update,
+ self.extra_update_ops,
+ self.loss,
+ self.data_loss,
+ self.merged,
+ ],
+ feed_dict=feed_dict,
)
def eval(self, sess, feed_dict):
@@ -327,10 +357,7 @@ def eval(self, sess, feed_dict):
"""
feed_dict[self.layer_keeps] = self.keep_prob_test
feed_dict[self.is_train_stage] = False
- return sess.run(
- [self.loss, self.data_loss, self.pred, self.iterator.labels],
- feed_dict=feed_dict,
- )
+ return sess.run([self.pred, self.iterator.labels], feed_dict=feed_dict)
def infer(self, sess, feed_dict):
"""Given feature data (in feed_dict), get predicted scores with current model.
@@ -389,7 +416,7 @@ def fit(self, train_file, valid_file, test_file=None):
train_start = time.time()
for batch_data_input in self.iterator.load_data_from_file(train_file):
step_result = self.train(train_sess, batch_data_input)
- (_, step_loss, step_data_loss, summary) = step_result
+ (_, _, step_loss, step_data_loss, summary) = step_result
if self.hparams.write_tfevents:
self.writer.add_summary(summary, step)
epoch_loss += step_loss
@@ -406,9 +433,10 @@ def fit(self, train_file, valid_file, test_file=None):
if self.hparams.save_model:
if epoch % self.hparams.save_epoch == 0:
+ save_path_str = join(self.hparams.MODEL_DIR, "epoch_" + str(epoch))
checkpoint_path = self.saver.save(
sess=train_sess,
- save_path=self.hparams.MODEL_DIR + "epoch_" + str(epoch),
+ save_path=save_path_str,
)
eval_start = time.time()
@@ -479,7 +507,7 @@ def run_eval(self, filename):
preds = []
labels = []
for batch_data_input in self.iterator.load_data_from_file(filename):
- _, _, step_pred, step_labels = self.eval(load_sess, batch_data_input)
+ step_pred, step_labels = self.eval(load_sess, batch_data_input)
preds.extend(np.reshape(step_pred, -1))
labels.extend(np.reshape(step_labels, -1))
res = cal_metric(labels, preds, self.hparams.metrics)
@@ -504,3 +532,114 @@ def predict(self, infile_name, outfile_name):
# line break after each batch.
wt.write("\n")
return self
+
+ def _attention(self, inputs, attention_size):
+ """Soft alignment attention implement.
+
+ Args:
+ inputs (obj): Sequences ready to apply attention.
+ attention_size (int): The dimension of attention operation.
+
+ Returns:
+ obj: Weighted sum after attention.
+ """
+ hidden_size = inputs.shape[2].value
+ if not attention_size:
+ attention_size = hidden_size
+
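+ # Project the sequence with attention_mat, score each step against a learned query
+ # vector, softmax the scores over time, and reweight the original inputs.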
+ attention_mat = tf.get_variable(
+ name="attention_mat",
+ shape=[inputs.shape[-1].value, hidden_size],
+ initializer=self.initializer,
+ )
+ att_inputs = tf.tensordot(inputs, attention_mat, [[2], [0]])
+
+ query = tf.get_variable(
+ name="query",
+ shape=[attention_size],
+ dtype=tf.float32,
+ initializer=self.initializer,
+ )
+ att_logits = tf.tensordot(att_inputs, query, axes=1, name="att_logits")
+ att_weights = tf.nn.softmax(att_logits, name="att_weights")
+ output = inputs * tf.expand_dims(att_weights, -1)
+ return output
+
+ def _fcn_net(self, model_output, layer_sizes, scope):
+ """Construct the MLP part for the model.
+
+ Args:
+ model_output (obj): The output of the upper layers, which is the input of the MLP part.
+ layer_sizes (list): The shape of each layer of the MLP part.
+ scope (obj): The scope of the MLP part.
+
+ Returns:
+ obj: Prediction logit after the fully connected layers.
+ """
+ hparams = self.hparams
+ with tf.variable_scope(scope):
+ last_layer_size = model_output.shape[-1]
+ layer_idx = 0
+ hidden_nn_layers = []
+ hidden_nn_layers.append(model_output)
+ with tf.variable_scope("nn_part", initializer=self.initializer) as scope:
+ for idx, layer_size in enumerate(layer_sizes):
+ curr_w_nn_layer = tf.get_variable(
+ name="w_nn_layer" + str(layer_idx),
+ shape=[last_layer_size, layer_size],
+ dtype=tf.float32,
+ )
+ curr_b_nn_layer = tf.get_variable(
+ name="b_nn_layer" + str(layer_idx),
+ shape=[layer_size],
+ dtype=tf.float32,
+ initializer=tf.zeros_initializer(),
+ )
+ tf.summary.histogram(
+ "nn_part/" + "w_nn_layer" + str(layer_idx), curr_w_nn_layer
+ )
+ tf.summary.histogram(
+ "nn_part/" + "b_nn_layer" + str(layer_idx), curr_b_nn_layer
+ )
+ curr_hidden_nn_layer = tf.tensordot(
+ hidden_nn_layers[layer_idx], curr_w_nn_layer, axes=1
+ ) + curr_b_nn_layer
+
+ scope = "nn_part" + str(idx)
+ activation = hparams.activation[idx]
+
+ if hparams.enable_BN is True:
+ curr_hidden_nn_layer = tf.layers.batch_normalization(
+ curr_hidden_nn_layer,
+ momentum=0.95,
+ epsilon=0.0001,
+ training=self.is_train_stage,
+ )
+
+ curr_hidden_nn_layer = self._active_layer(
+ logit=curr_hidden_nn_layer, activation=activation, layer_idx=idx
+ )
+ hidden_nn_layers.append(curr_hidden_nn_layer)
+ layer_idx += 1
+ last_layer_size = layer_size
+
+ w_nn_output = tf.get_variable(
+ name="w_nn_output", shape=[last_layer_size, 1], dtype=tf.float32
+ )
+ b_nn_output = tf.get_variable(
+ name="b_nn_output",
+ shape=[1],
+ dtype=tf.float32,
+ initializer=tf.zeros_initializer(),
+ )
+ tf.summary.histogram(
+ "nn_part/" + "w_nn_output" + str(layer_idx), w_nn_output
+ )
+ tf.summary.histogram(
+ "nn_part/" + "b_nn_output" + str(layer_idx), b_nn_output
+ )
+ nn_output = tf.tensordot(
+ hidden_nn_layers[-1], w_nn_output, axes=1
+ ) + b_nn_output
+ self.logit = nn_output
+ return nn_output
diff --git a/reco_utils/recommender/deeprec/models/dkn.py b/reco_utils/recommender/deeprec/models/dkn.py
index bad8c2d7c8..2651769b33 100644
--- a/reco_utils/recommender/deeprec/models/dkn.py
+++ b/reco_utils/recommender/deeprec/models/dkn.py
@@ -10,6 +10,12 @@
class DKN(BaseModel):
+ """DKN model (Deep Knowledge-Aware Network)
+
+ H. Wang, F. Zhang, X. Xie and M. Guo, "DKN: Deep Knowledge-Aware Network for News
+ Recommendation", in Proceedings of the 2018 World Wide Web Conference on World
+ Wide Web, 2018.
+ """
def __init__(self, hparams, iterator_creator):
"""Initialization steps for DKN.
Compared with the BaseModel, DKN requires two different pre-computed embeddings,
diff --git a/reco_utils/recommender/deeprec/models/sequential/asvd.py b/reco_utils/recommender/deeprec/models/sequential/asvd.py
new file mode 100644
index 0000000000..2d9cdc7b91
--- /dev/null
+++ b/reco_utils/recommender/deeprec/models/sequential/asvd.py
@@ -0,0 +1,44 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from reco_utils.recommender.deeprec.models.sequential.sequential_base_model import (
+ SequentialBaseModel,
+)
+
+__all__ = ["ASVDModel"]
+
+
+class ASVDModel(SequentialBaseModel):
+ """A2SVD Model
+
+ It extends ASVD with an attention module.
+
+ ASVD: Y. Koren, "Factorization Meets the Neighborhood: a Multifaceted Collaborative
+ Filtering Model", in Proceedings of the 14th ACM SIGKDD international conference on
+ Knowledge discovery and data mining, pages 426–434, ACM, 2008.
+
+ A2SVD: Z. Yu, J. Lian, A. Mahmoody, G. Liu and X. Xie, "Adaptive User Modeling with
+ Long and Short-Term Preferences for Personalized Recommendation", in Proceedings of
+ the 28th International Joint Conference on Artificial Intelligence, IJCAI’19,
+ Pages 4213-4219, AAAI Press, 2019.
+ """
+ def _build_seq_graph(self):
+ """The main function to create ASVD model.
+
+ Returns:
+ obj:the output of ASVD section.
+ """
+ hparams = self.hparams
+ with tf.variable_scope("asvd"):
+ hist_input = tf.concat(
+ [self.item_history_embedding, self.cate_history_embedding], 2
+ )
+ with tf.variable_scope("Attention_layer"):
+ att_outputs1 = self._attention(hist_input, hparams.attention_size)
+ asvd_output = tf.reduce_sum(att_outputs1, 1)
+ tf.summary.histogram("asvd_output", asvd_output)
+ model_output = tf.concat([asvd_output, self.target_item_embedding], 1)
+ self.model_output = model_output
+ tf.summary.histogram("model_output", model_output)
+ return model_output
diff --git a/reco_utils/recommender/deeprec/models/sequential/caser.py b/reco_utils/recommender/deeprec/models/sequential/caser.py
new file mode 100644
index 0000000000..5d35a6a03f
--- /dev/null
+++ b/reco_utils/recommender/deeprec/models/sequential/caser.py
@@ -0,0 +1,105 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from reco_utils.recommender.deeprec.models.sequential.sequential_base_model import (
+ SequentialBaseModel,
+)
+
+__all__ = ["CaserModel"]
+
+
+class CaserModel(SequentialBaseModel):
+ """Caser Model
+
+ J. Tang and K. Wang, "Personalized top-n sequential recommendation via convolutional
+ sequence embedding", in Proceedings of the Eleventh ACM International Conference on
+ Web Search and Data Mining, ACM, 2018.
+ """
+ def __init__(self, hparams, iterator_creator):
+ """Initialization of variables for caser
+
+ Args:
+ hparams (obj): A tf.contrib.training.HParams object, hold the entire set of hyperparameters.
+ iterator_creator (obj): An iterator to load the data.
+ """
+ self.hparams = hparams
+ self.L = hparams.L # maximum height of the horizontal convolution filters
+ self.T = hparams.T # number of target items to predict
+ self.n_v = hparams.n_v # number of vertical convolution filters
+ self.n_h = hparams.n_h # number of horizontal convolution filters
+ self.lengths = [
+ i + 1 for i in range(self.L)
+ ] # horizontal convolution filter heights (1..L)
+ super().__init__(hparams, iterator_creator)
+
+ def _build_seq_graph(self):
+ """The main function to create caser model.
+
+ Returns:
+ obj:the output of caser section.
+ """
+ with tf.variable_scope("caser"):
+ cnn_output = self._caser_cnn()
+ model_output = tf.concat([cnn_output, self.target_item_embedding], 1)
+ tf.summary.histogram("model_output", model_output)
+ return model_output
+
+ def _add_cnn(self, hist_matrix, vertical_dim, scope):
+ """The main function to use CNN at both vertical and horizonal aspects.
+
+ Args:
+ hist_matrix (obj): The output of history sequential embeddings
+ vertical_dim (int): The shape of embeddings of input
+ scope (obj): The scope of CNN input.
+
+ Returns:
+ obj:the output of CNN layers.
+ """
+ with tf.variable_scope(scope):
+ with tf.variable_scope("vertical"):
+ embedding_T = tf.transpose(hist_matrix, [0, 2, 1])
+ out_v = self._build_cnn(embedding_T, self.n_v, vertical_dim)
+ out_v = tf.layers.flatten(out_v)
+ with tf.variable_scope("horizonal"):
+ out_hs = []
+ for h in self.lengths:
+ conv_out = self._build_cnn(hist_matrix, self.n_h, h)
+ max_pool_out = tf.reduce_max(
+ conv_out, reduction_indices=[1], name="max_pool_{0}".format(h)
+ )
+ out_hs.append(max_pool_out)
+ out_h = tf.concat(out_hs, 1)
+ return tf.concat([out_v, out_h], 1)
+
+ def _caser_cnn(self):
+ """The main function to use CNN at both item and category aspects.
+
+ Returns:
+ obj:the concatenated output of two parts of item and catrgory.
+ """
+ item_out = self._add_cnn(
+ self.item_history_embedding, self.item_embedding_dim, "item"
+ )
+ tf.summary.histogram("item_out", item_out)
+ cate_out = self._add_cnn(
+ self.cate_history_embedding, self.cate_embedding_dim, "cate"
+ )
+ tf.summary.histogram("cate_out", cate_out)
+ cnn_output = tf.concat([item_out, cate_out], 1)
+ tf.summary.histogram("cnn_output", cnn_output)
+ return cnn_output
+
+ def _build_cnn(self, history_matrix, nums, shape):
+ """Call a CNN layer.
+
+ Returns:
+ obj:the output of cnn section.
+ """
+ return tf.layers.conv1d(
+ history_matrix,
+ nums,
+ shape,
+ activation=tf.nn.relu,
+ name="conv_" + str(shape),
+ )
diff --git a/reco_utils/recommender/deeprec/models/sequential/gru4rec.py b/reco_utils/recommender/deeprec/models/sequential/gru4rec.py
new file mode 100644
index 0000000000..562be65292
--- /dev/null
+++ b/reco_utils/recommender/deeprec/models/sequential/gru4rec.py
@@ -0,0 +1,75 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from reco_utils.recommender.deeprec.models.sequential.sequential_base_model import (
+ SequentialBaseModel,
+)
+from tensorflow.contrib.rnn import GRUCell, LSTMCell
+from tensorflow.nn import dynamic_rnn
+
+__all__ = ["GRU4RecModel"]
+
+
+class GRU4RecModel(SequentialBaseModel):
+ """GRU4Rec Model
+
+ B. Hidasi, A. Karatzoglou, L. Baltrunas, D. Tikk, "Session-based Recommendations
+ with Recurrent Neural Networks", ICLR (Poster), 2016.
+ """
+ def _build_seq_graph(self):
+ """The main function to create GRU4Rec model.
+
+ Returns:
+ obj:the output of GRU4Rec section.
+ """
+ with tf.variable_scope("gru4rec"):
+ # final_state = self._build_lstm()
+ final_state = self._build_gru()
+ model_output = tf.concat([final_state, self.target_item_embedding], 1)
+ tf.summary.histogram("model_output", model_output)
+ return model_output
+
+ def _build_lstm(self):
+ """Apply an LSTM for modeling.
+
+ Returns:
+ obj: The output of LSTM section.
+ """
+ with tf.name_scope("lstm"):
+ self.mask = self.iterator.mask
+ self.sequence_length = tf.reduce_sum(self.mask, 1)
+ self.history_embedding = tf.concat(
+ [self.item_history_embedding, self.cate_history_embedding], 2
+ )
+ rnn_outputs, final_state = dynamic_rnn(
+ LSTMCell(self.hidden_size),
+ inputs=self.history_embedding,
+ sequence_length=self.sequence_length,
+ dtype=tf.float32,
+ scope="lstm",
+ )
+ tf.summary.histogram("LSTM_outputs", rnn_outputs)
+ return final_state[1]
+
+ def _build_gru(self):
+ """Apply a GRU for modeling.
+
+ Returns:
+ obj: The output of GRU section.
+ """
+ with tf.name_scope("gru"):
+ self.mask = self.iterator.mask
+ self.sequence_length = tf.reduce_sum(self.mask, 1)
+ self.history_embedding = tf.concat(
+ [self.item_history_embedding, self.cate_history_embedding], 2
+ )
+ rnn_outputs, final_state = dynamic_rnn(
+ GRUCell(self.hidden_size),
+ inputs=self.history_embedding,
+ sequence_length=self.sequence_length,
+ dtype=tf.float32,
+ scope="gru",
+ )
+ tf.summary.histogram("GRU_outputs", rnn_outputs)
+ return final_state
diff --git a/reco_utils/recommender/deeprec/models/sequential/rnn_cell_implement.py b/reco_utils/recommender/deeprec/models/sequential/rnn_cell_implement.py
new file mode 100644
index 0000000000..f9c018d147
--- /dev/null
+++ b/reco_utils/recommender/deeprec/models/sequential/rnn_cell_implement.py
@@ -0,0 +1,640 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Module implementing RNN Cells.
+
+This module provides a number of basic commonly used RNN cells, such as LSTM
+(Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number of
+operators that allow adding dropouts, projections, or embeddings for inputs.
+Constructing multi-layer cells is supported by the class `MultiRNNCell`, or by
+calling the `rnn` ops several times.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope as vs
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+
+from tensorflow.python.ops.rnn_cell_impl import (
+ RNNCell,
+ LSTMStateTuple,
+ _BIAS_VARIABLE_NAME,
+ _WEIGHTS_VARIABLE_NAME,
+)
+
+
+class Time4LSTMCell(RNNCell):
+ def __init__(
+ self,
+ num_units,
+ use_peepholes=False,
+ cell_clip=None,
+ initializer=None,
+ num_proj=None,
+ proj_clip=None,
+ num_unit_shards=None,
+ num_proj_shards=None,
+ forget_bias=1.0,
+ state_is_tuple=True,
+ activation=None,
+ reuse=None,
+ ):
+
+ super(Time4LSTMCell, self).__init__(_reuse=reuse)
+ if not state_is_tuple:
+ logging.warn(
+ "%s: Using a concatenated state is slower and will soon be "
+ "deprecated. Use state_is_tuple=True.",
+ self,
+ )
+ if num_unit_shards is not None or num_proj_shards is not None:
+ logging.warn(
+ "%s: The num_unit_shards and proj_unit_shards parameters are "
+ "deprecated and will be removed in Jan 2017. "
+ "Use a variable scope with a partitioner instead.",
+ self,
+ )
+
+ self._num_units = num_units
+ self._use_peepholes = use_peepholes
+ self._cell_clip = cell_clip
+ self._initializer = initializer
+ self._num_proj = num_proj
+ self._proj_clip = proj_clip
+ self._num_unit_shards = num_unit_shards
+ self._num_proj_shards = num_proj_shards
+ self._forget_bias = forget_bias
+ self._state_is_tuple = state_is_tuple
+ self._activation = activation or math_ops.tanh
+
+ if num_proj:
+ self._state_size = (
+ LSTMStateTuple(num_units, num_proj)
+ if state_is_tuple
+ else num_units + num_proj
+ )
+ self._output_size = num_proj
+ else:
+ self._state_size = (
+ LSTMStateTuple(num_units, num_units)
+ if state_is_tuple
+ else 2 * num_units
+ )
+ self._output_size = num_units
+ self._linear1 = None
+ self._linear2 = None
+ self._time_input_w1 = None
+ self._time_input_w2 = None
+ self._time_kernel_w1 = None
+ self._time_kernel_t1 = None
+ self._time_bias1 = None
+ self._time_kernel_w2 = None
+ self._time_kernel_t2 = None
+ self._time_bias2 = None
+ self._o_kernel_t1 = None
+ self._o_kernel_t2 = None
+ if self._use_peepholes:
+ self._w_f_diag = None
+ self._w_i_diag = None
+ self._w_o_diag = None
+
+ @property
+ def state_size(self):
+ return self._state_size
+
+ @property
+ def output_size(self):
+ return self._output_size
+
+ def call(self, inputs, state):
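+ # The last two columns of inputs carry the time scores (time_now_score and
+ # time_last_score); they are split off here before the standard LSTM computation.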
+ time_now_score = tf.expand_dims(inputs[:, -1], -1)
+ time_last_score = tf.expand_dims(inputs[:, -2], -1)
+ inputs = inputs[:, :-2]
+ num_proj = self._num_units if self._num_proj is None else self._num_proj
+ sigmoid = math_ops.sigmoid
+
+ if self._state_is_tuple:
+ (c_prev, m_prev) = state
+ else:
+ c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
+ m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
+
+ dtype = inputs.dtype
+ input_size = inputs.get_shape().with_rank(2)[1]
+ if input_size.value is None:
+ raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+ if self._time_kernel_w1 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ with vs.variable_scope(unit_scope):
+ self._time_input_w1 = vs.get_variable(
+ "_time_input_w1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_bias1 = vs.get_variable(
+ "_time_input_bias1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_w2 = vs.get_variable(
+ "_time_input_w2", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_bias2 = vs.get_variable(
+ "_time_input_bias2", shape=[self._num_units], dtype=dtype
+ )
+ self._time_kernel_w1 = vs.get_variable(
+ "_time_kernel_w1",
+ shape=[input_size, self._num_units],
+ dtype=dtype,
+ )
+ self._time_kernel_t1 = vs.get_variable(
+ "_time_kernel_t1",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._time_bias1 = vs.get_variable(
+ "_time_bias1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_kernel_w2 = vs.get_variable(
+ "_time_kernel_w2",
+ shape=[input_size, self._num_units],
+ dtype=dtype,
+ )
+ self._time_kernel_t2 = vs.get_variable(
+ "_time_kernel_t2",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._time_bias2 = vs.get_variable(
+ "_time_bias2", shape=[self._num_units], dtype=dtype
+ )
+ self._o_kernel_t1 = vs.get_variable(
+ "_o_kernel_t1",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._o_kernel_t2 = vs.get_variable(
+ "_o_kernel_t2",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+
+ time_now_input = tf.nn.tanh(
+ time_now_score * self._time_input_w1 + self._time_input_bias1
+ )
+ time_last_input = tf.nn.tanh(
+ time_last_score * self._time_input_w2 + self._time_input_bias2
+ )
+
+ time_now_state = (
+ math_ops.matmul(inputs, self._time_kernel_w1)
+ + math_ops.matmul(time_now_input, self._time_kernel_t1)
+ + self._time_bias1
+ )
+ time_last_state = (
+ math_ops.matmul(inputs, self._time_kernel_w2)
+ + math_ops.matmul(time_last_input, self._time_kernel_t2)
+ + self._time_bias2
+ )
+
+ if self._linear1 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ if self._num_unit_shards is not None:
+ unit_scope.set_partitioner(
+ partitioned_variables.fixed_size_partitioner(
+ self._num_unit_shards
+ )
+ )
+ self._linear1 = _Linear([inputs, m_prev], 4 * self._num_units, True)
+
+ # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+ lstm_matrix = self._linear1([inputs, m_prev])
+ i, j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=4, axis=1)
+ o = (
+ o
+ + math_ops.matmul(time_now_input, self._o_kernel_t1)
+ + math_ops.matmul(time_last_input, self._o_kernel_t2)
+ )
+ # Diagonal connections
+ if self._use_peepholes and not self._w_f_diag:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ with vs.variable_scope(unit_scope):
+ self._w_f_diag = vs.get_variable(
+ "w_f_diag", shape=[self._num_units], dtype=dtype
+ )
+ self._w_i_diag = vs.get_variable(
+ "w_i_diag", shape=[self._num_units], dtype=dtype
+ )
+ self._w_o_diag = vs.get_variable(
+ "w_o_diag", shape=[self._num_units], dtype=dtype
+ )
+
+ if self._use_peepholes:
+ c = sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * sigmoid(
+ time_last_state
+ ) * c_prev + sigmoid(i + self._w_i_diag * c_prev) * sigmoid(
+ time_now_state
+ ) * self._activation(
+ j
+ )
+ else:
+ c = sigmoid(f + self._forget_bias) * sigmoid(
+ time_last_state
+ ) * c_prev + sigmoid(i) * sigmoid(time_now_state) * self._activation(j)
+
+ if self._cell_clip is not None:
+ # pylint: disable=invalid-unary-operand-type
+ c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
+ # pylint: enable=invalid-unary-operand-type
+ if self._use_peepholes:
+ m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
+ else:
+ m = sigmoid(o) * self._activation(c)
+
+ if self._num_proj is not None:
+ if self._linear2 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer):
+ with vs.variable_scope("projection") as proj_scope:
+ if self._num_proj_shards is not None:
+ proj_scope.set_partitioner(
+ partitioned_variables.fixed_size_partitioner(
+ self._num_proj_shards
+ )
+ )
+ self._linear2 = _Linear(m, self._num_proj, False)
+ m = self._linear2(m)
+
+ if self._proj_clip is not None:
+ # pylint: disable=invalid-unary-operand-type
+ m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
+ # pylint: enable=invalid-unary-operand-type
+
+ new_state = (
+ LSTMStateTuple(c, m)
+ if self._state_is_tuple
+ else array_ops.concat([c, m], 1)
+ )
+ return m, new_state
+
+
+class Time4ALSTMCell(RNNCell):
+ def __init__(
+ self,
+ num_units,
+ use_peepholes=False,
+ cell_clip=None,
+ initializer=None,
+ num_proj=None,
+ proj_clip=None,
+ num_unit_shards=None,
+ num_proj_shards=None,
+ forget_bias=1.0,
+ state_is_tuple=True,
+ activation=None,
+ reuse=None,
+ ):
+
+ super(Time4ALSTMCell, self).__init__(_reuse=reuse)
+ if not state_is_tuple:
+ logging.warn(
+ "%s: Using a concatenated state is slower and will soon be "
+ "deprecated. Use state_is_tuple=True.",
+ self,
+ )
+ if num_unit_shards is not None or num_proj_shards is not None:
+ logging.warn(
+ "%s: The num_unit_shards and proj_unit_shards parameters are "
+ "deprecated and will be removed in Jan 2017. "
+ "Use a variable scope with a partitioner instead.",
+ self,
+ )
+
+ self._num_units = num_units
+ self._use_peepholes = use_peepholes
+ self._cell_clip = cell_clip
+ self._initializer = initializer
+ self._num_proj = num_proj
+ self._proj_clip = proj_clip
+ self._num_unit_shards = num_unit_shards
+ self._num_proj_shards = num_proj_shards
+ self._forget_bias = forget_bias
+ self._state_is_tuple = state_is_tuple
+ self._activation = activation or math_ops.tanh
+
+ if num_proj:
+ self._state_size = (
+ LSTMStateTuple(num_units, num_proj)
+ if state_is_tuple
+ else num_units + num_proj
+ )
+ self._output_size = num_proj
+ else:
+ self._state_size = (
+ LSTMStateTuple(num_units, num_units)
+ if state_is_tuple
+ else 2 * num_units
+ )
+ self._output_size = num_units
+ self._linear1 = None
+ self._linear2 = None
+ self._time_input_w1 = None
+ self._time_input_w2 = None
+ self._time_kernel_w1 = None
+ self._time_kernel_t1 = None
+ self._time_bias1 = None
+ self._time_kernel_w2 = None
+ self._time_kernel_t2 = None
+ self._time_bias2 = None
+ self._o_kernel_t1 = None
+ self._o_kernel_t2 = None
+ if self._use_peepholes:
+ self._w_f_diag = None
+ self._w_i_diag = None
+ self._w_o_diag = None
+
+ @property
+ def state_size(self):
+ return self._state_size
+
+ @property
+ def output_size(self):
+ return self._output_size
+
+ def call(self, inputs, state):
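+ # The last three columns of inputs carry the attention score and the two time
+ # scores; they are split off here before the standard LSTM computation.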
+ att_score = tf.expand_dims(inputs[:, -1], -1)
+ time_now_score = tf.expand_dims(inputs[:, -2], -1)
+ time_last_score = tf.expand_dims(inputs[:, -3], -1)
+ inputs = inputs[:, :-3]
+ num_proj = self._num_units if self._num_proj is None else self._num_proj
+ sigmoid = math_ops.sigmoid
+
+ if self._state_is_tuple:
+ (c_prev, m_prev) = state
+ else:
+ c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
+ m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj])
+
+ dtype = inputs.dtype
+ input_size = inputs.get_shape().with_rank(2)[1]
+ if input_size.value is None:
+ raise ValueError("Could not infer input size from inputs.get_shape()[-1]")
+
+ if self._time_kernel_w1 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ with vs.variable_scope(unit_scope):
+ self._time_input_w1 = vs.get_variable(
+ "_time_input_w1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_bias1 = vs.get_variable(
+ "_time_input_bias1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_w2 = vs.get_variable(
+ "_time_input_w2", shape=[self._num_units], dtype=dtype
+ )
+ self._time_input_bias2 = vs.get_variable(
+ "_time_input_bias2", shape=[self._num_units], dtype=dtype
+ )
+ self._time_kernel_w1 = vs.get_variable(
+ "_time_kernel_w1",
+ shape=[input_size, self._num_units],
+ dtype=dtype,
+ )
+ self._time_kernel_t1 = vs.get_variable(
+ "_time_kernel_t1",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._time_bias1 = vs.get_variable(
+ "_time_bias1", shape=[self._num_units], dtype=dtype
+ )
+ self._time_kernel_w2 = vs.get_variable(
+ "_time_kernel_w2",
+ shape=[input_size, self._num_units],
+ dtype=dtype,
+ )
+ self._time_kernel_t2 = vs.get_variable(
+ "_time_kernel_t2",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._time_bias2 = vs.get_variable(
+ "_time_bias2", shape=[self._num_units], dtype=dtype
+ )
+ self._o_kernel_t1 = vs.get_variable(
+ "_o_kernel_t1",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+ self._o_kernel_t2 = vs.get_variable(
+ "_o_kernel_t2",
+ shape=[self._num_units, self._num_units],
+ dtype=dtype,
+ )
+
+ time_now_input = tf.nn.tanh(
+ time_now_score * self._time_input_w1 + self._time_input_bias1
+ )
+ time_last_input = tf.nn.tanh(
+ time_last_score * self._time_input_w2 + self._time_input_bias2
+ )
+
+ time_now_state = (
+ math_ops.matmul(inputs, self._time_kernel_w1)
+ + math_ops.matmul(time_now_input, self._time_kernel_t1)
+ + self._time_bias1
+ )
+ time_last_state = (
+ math_ops.matmul(inputs, self._time_kernel_w2)
+ + math_ops.matmul(time_last_input, self._time_kernel_t2)
+ + self._time_bias2
+ )
+
+ if self._linear1 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ if self._num_unit_shards is not None:
+ unit_scope.set_partitioner(
+ partitioned_variables.fixed_size_partitioner(
+ self._num_unit_shards
+ )
+ )
+ self._linear1 = _Linear([inputs, m_prev], 4 * self._num_units, True)
+
+ # i = input_gate, j = new_input, f = forget_gate, o = output_gate
+ lstm_matrix = self._linear1([inputs, m_prev])
+ i, j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=4, axis=1)
+ o = (
+ o
+ + math_ops.matmul(time_now_input, self._o_kernel_t1)
+ + math_ops.matmul(time_last_input, self._o_kernel_t2)
+ )
+ # Diagonal connections
+ if self._use_peepholes and not self._w_f_diag:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer) as unit_scope:
+ with vs.variable_scope(unit_scope):
+ self._w_f_diag = vs.get_variable(
+ "w_f_diag", shape=[self._num_units], dtype=dtype
+ )
+ self._w_i_diag = vs.get_variable(
+ "w_i_diag", shape=[self._num_units], dtype=dtype
+ )
+ self._w_o_diag = vs.get_variable(
+ "w_o_diag", shape=[self._num_units], dtype=dtype
+ )
+
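+        # Time-aware cell update: the forget path is additionally gated by a sigmoid
+        # of the time-since-last-event state, and the input path by a sigmoid of the
+        # time-to-now state.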
+ if self._use_peepholes:
+ c = sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) * sigmoid(
+ time_last_state
+ ) * c_prev + sigmoid(i + self._w_i_diag * c_prev) * sigmoid(
+ time_now_state
+ ) * self._activation(
+ j
+ )
+ else:
+ c = sigmoid(f + self._forget_bias) * sigmoid(
+ time_last_state
+ ) * c_prev + sigmoid(i) * sigmoid(time_now_state) * self._activation(j)
+
+ if self._cell_clip is not None:
+ # pylint: disable=invalid-unary-operand-type
+ c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
+ # pylint: enable=invalid-unary-operand-type
+ if self._use_peepholes:
+ m = sigmoid(o + self._w_o_diag * c) * self._activation(c)
+ else:
+ m = sigmoid(o) * self._activation(c)
+
+ if self._num_proj is not None:
+ if self._linear2 is None:
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope, initializer=self._initializer):
+ with vs.variable_scope("projection") as proj_scope:
+ if self._num_proj_shards is not None:
+ proj_scope.set_partitioner(
+ partitioned_variables.fixed_size_partitioner(
+ self._num_proj_shards
+ )
+ )
+ self._linear2 = _Linear(m, self._num_proj, False)
+ m = self._linear2(m)
+
+ if self._proj_clip is not None:
+ # pylint: disable=invalid-unary-operand-type
+ m = clip_ops.clip_by_value(m, -self._proj_clip, self._proj_clip)
+ # pylint: enable=invalid-unary-operand-type
+ c = att_score * c + (1.0 - att_score) * c
+ m = att_score * m + (1.0 - att_score) * m
+ new_state = (
+ LSTMStateTuple(c, m)
+ if self._state_is_tuple
+ else array_ops.concat([c, m], 1)
+ )
+ return m, new_state
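+
+    # Usage sketch (assumption, not taken from the original code): this cell expects
+    # the per-step time scores and the attention score to be packed as the last
+    # three input columns, in the order [..., time_last, time_now, att_score],
+    # matching the slicing at the top of `call`. A hypothetical caller, with
+    # `features`, `time_last`, `time_now`, `att_score`, `cell` and `seq_len` as
+    # placeholder names, might look like:
+    #
+    #   packed = tf.concat([features,
+    #                       tf.expand_dims(time_last, -1),
+    #                       tf.expand_dims(time_now, -1),
+    #                       tf.expand_dims(att_score, -1)], -1)
+    #   outputs, _ = tf.nn.dynamic_rnn(cell, packed,
+    #                                  sequence_length=seq_len, dtype=tf.float32)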
+
+
+class _Linear(object):
+ """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
+
+ Args:
+ args: a 2D Tensor or a list of 2D, batch x n, Tensors.
+ output_size: int, second dimension of weight variable.
+ build_bias: boolean, whether to build a bias variable.
+ bias_initializer: starting value to initialize the bias
+ (default is all zeros).
+ kernel_initializer: starting value to initialize the weight.
+
+ Raises:
+ ValueError: if inputs_shape is wrong.
+ """
+
+ def __init__(
+ self,
+ args,
+ output_size,
+ build_bias,
+ bias_initializer=None,
+ kernel_initializer=None,
+ ):
+ self._build_bias = build_bias
+
+ if args is None or (nest.is_sequence(args) and not args):
+ raise ValueError("`args` must be specified")
+ if not nest.is_sequence(args):
+ args = [args]
+ self._is_sequence = False
+ else:
+ self._is_sequence = True
+
+ # Calculate the total size of arguments on dimension 1.
+ total_arg_size = 0
+ shapes = [a.get_shape() for a in args]
+ for shape in shapes:
+ if shape.ndims != 2:
+ raise ValueError("linear is expecting 2D arguments: %s" % shapes)
+ if shape[1].value is None:
+ raise ValueError(
+ "linear expects shape[1] to be provided for shape %s, "
+ "but saw %s" % (shape, shape[1])
+ )
+ else:
+ total_arg_size += shape[1].value
+
+ dtype = [a.dtype for a in args][0]
+
+ scope = vs.get_variable_scope()
+ with vs.variable_scope(scope) as outer_scope:
+ self._weights = vs.get_variable(
+ _WEIGHTS_VARIABLE_NAME,
+ [total_arg_size, output_size],
+ dtype=dtype,
+ initializer=kernel_initializer,
+ )
+ if build_bias:
+ with vs.variable_scope(outer_scope) as inner_scope:
+ inner_scope.set_partitioner(None)
+ if bias_initializer is None:
+ bias_initializer = init_ops.constant_initializer(
+ 0.0, dtype=dtype
+ )
+ self._biases = vs.get_variable(
+ _BIAS_VARIABLE_NAME,
+ [output_size],
+ dtype=dtype,
+ initializer=bias_initializer,
+ )
+
+ def __call__(self, args):
+ if not self._is_sequence:
+ args = [args]
+
+ if len(args) == 1:
+ res = math_ops.matmul(args[0], self._weights)
+ else:
+ res = math_ops.matmul(array_ops.concat(args, 1), self._weights)
+ if self._build_bias:
+ res = nn_ops.bias_add(res, self._biases)
+ return res
diff --git a/reco_utils/recommender/deeprec/models/sequential/sequential_base_model.py b/reco_utils/recommender/deeprec/models/sequential/sequential_base_model.py
new file mode 100644
index 0000000000..83a3bbe0cd
--- /dev/null
+++ b/reco_utils/recommender/deeprec/models/sequential/sequential_base_model.py
@@ -0,0 +1,344 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import abc
+import time
+import numpy as np
+import tensorflow as tf
+from tensorflow import keras
+import os
+
+from reco_utils.recommender.deeprec.models.base_model import BaseModel
+from reco_utils.recommender.deeprec.deeprec_utils import cal_metric, load_dict
+
+__all__ = ["SequentialBaseModel"]
+
+
+class SequentialBaseModel(BaseModel):
+ def __init__(self, hparams, iterator_creator, graph=None, seed=None):
+        """Initializing the model. Create the common logic needed by all sequential
+        models, such as the loss function and parameter set.
+
+ Args:
+            hparams (obj): A tf.contrib.training.HParams object that holds the entire set of hyperparameters.
+ iterator_creator (obj): An iterator to load the data.
+ graph (obj): An optional graph.
+ seed (int): Random seed.
+ """
+ self.hparams = hparams
+
+ self.need_sample = hparams.need_sample
+ self.train_num_ngs = hparams.train_num_ngs
+ if self.train_num_ngs is None:
+ raise ValueError(
+ "Please confirm the number of negative samples for each positive instance."
+ )
+ self.min_seq_length = (
+ hparams.min_seq_length if "min_seq_length" in hparams else 1
+ )
+ self.hidden_size = hparams.hidden_size if "hidden_size" in hparams else None
+ self.graph = tf.Graph() if not graph else graph
+
+ with self.graph.as_default():
+ self.sequence_length = tf.placeholder(
+ tf.int32, [None], name="sequence_length"
+ )
+
+ super().__init__(hparams, iterator_creator, graph=self.graph, seed=seed)
+
+ @abc.abstractmethod
+ def _build_seq_graph(self):
+ """Subclass will implement this."""
+ pass
+
+ def _build_graph(self):
+ """The main function to create sequential models.
+
+ Returns:
+            obj: the prediction score made by the model.
+ """
+ hparams = self.hparams
+ self.keep_prob_train = 1 - np.array(hparams.dropout)
+ self.keep_prob_test = np.ones_like(hparams.dropout)
+
+ with tf.variable_scope("sequential") as self.sequential_scope:
+ self._build_embedding()
+ self._lookup_from_embedding()
+ model_output = self._build_seq_graph()
+ logit = self._fcn_net(model_output, hparams.layer_sizes, scope="logit_fcn")
+ self._add_norm()
+ return logit
+
+ def fit(
+ self, train_file, valid_file, valid_num_ngs, eval_metric="group_auc",
+ ):
+        """Fit the model with train_file. Evaluate the model on valid_file per epoch
+        to observe the training status.
+
+ Args:
+ train_file (str): training data set.
+ valid_file (str): validation set.
+            valid_num_ngs (int): the number of negative instances paired with each positive instance in the validation data.
+            eval_metric (str): the metric that controls early stopping, e.g. "auc" or "group_auc".
+
+ Returns:
+ obj: An instance of self.
+ """
+
+ # check bad input.
+ if not self.need_sample and self.train_num_ngs < 1:
+ raise ValueError(
+                "Please specify a positive number of negative instances for training when sampling is not needed."
+ )
+ if valid_num_ngs < 1:
+ raise ValueError(
+                "Please specify a positive number of negative instances for validation."
+ )
+
+ if self.need_sample and self.train_num_ngs < 1:
+ self.train_num_ngs = 1
+
+ if self.hparams.write_tfevents and self.hparams.SUMMARIES_DIR:
+ if not os.path.exists(self.hparams.SUMMARIES_DIR):
+ os.makedirs(self.hparams.SUMMARIES_DIR)
+
+ self.writer = tf.summary.FileWriter(
+ self.hparams.SUMMARIES_DIR, self.sess.graph
+ )
+
+ train_sess = self.sess
+ eval_info = list()
+
+ no_progress_round = 0
+ best_metric, self.best_epoch = 0, 0
+
+ for epoch in range(1, self.hparams.epochs + 1):
+ step = 0
+ self.hparams.current_epoch = epoch
+ epoch_loss = 0
+ file_iterator = self.iterator.load_data_from_file(
+ train_file,
+ min_seq_length=self.min_seq_length,
+ batch_num_ngs=self.train_num_ngs,
+ )
+
+ for batch_data_input in file_iterator:
+ if batch_data_input:
+ step_result = self.train(train_sess, batch_data_input)
+ (_, _, step_loss, step_data_loss, summary) = step_result
+ if self.hparams.write_tfevents and self.hparams.SUMMARIES_DIR:
+ self.writer.add_summary(summary, step)
+ epoch_loss += step_loss
+ step += 1
+ if step % self.hparams.show_step == 0:
+ print(
+ "step {0:d} , total_loss: {1:.4f}, data_loss: {2:.4f}".format(
+ step, step_loss, step_data_loss
+ )
+ )
+
+ valid_res = self.run_eval(valid_file, valid_num_ngs)
+ print(
+ "eval valid at epoch {0}: {1}".format(
+ epoch,
+ ",".join(
+ [
+                            str(key) + ":" + str(value)
+ for key, value in valid_res.items()
+ ]
+ ),
+ )
+ )
+ eval_info.append((epoch, valid_res))
+
+ if self.hparams.save_model and self.hparams.MODEL_DIR:
+ if not os.path.exists(self.hparams.MODEL_DIR):
+ os.makedirs(self.hparams.MODEL_DIR)
+
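+                # Save a checkpoint whenever the validation metric improves; stop
+                # early once no_progress_round reaches hparams.EARLY_STOP.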
+ if valid_res[eval_metric] > best_metric:
+ checkpoint_path = self.saver.save(
+ sess=train_sess,
+ save_path=self.hparams.MODEL_DIR + "epoch_" + str(epoch),
+ )
+ best_metric = valid_res[eval_metric]
+ self.best_epoch = epoch
+ no_progress_round = 0
+ else:
+ if no_progress_round >= self.hparams.EARLY_STOP:
+ break
+ else:
+ no_progress_round += 1
+
+ if self.hparams.write_tfevents:
+ self.writer.close()
+
+ print(eval_info)
+ print("best epoch: {0}".format(self.best_epoch))
+ return self
+
+ def run_eval(self, filename, num_ngs):
+ """Evaluate the given file and returns some evaluation metrics.
+
+ Args:
+ filename (str): A file name that will be evaluated.
+            num_ngs (int): The number of negative samples for each positive instance.
+
+ Returns:
+            dict: A dictionary that contains evaluation metrics.
+ """
+
+ load_sess = self.sess
+ preds = []
+ labels = []
+ group_preds = []
+ group_labels = []
+ group = num_ngs + 1
+
+ for batch_data_input in self.iterator.load_data_from_file(
+ filename, min_seq_length=self.min_seq_length, batch_num_ngs=0
+ ):
+ if batch_data_input:
+ step_pred, step_labels = self.eval(load_sess, batch_data_input)
+ preds.extend(np.reshape(step_pred, -1))
+ labels.extend(np.reshape(step_labels, -1))
+ group_preds.extend(np.reshape(step_pred, (-1, group)))
+ group_labels.extend(np.reshape(step_labels, (-1, group)))
+
+ res = cal_metric(labels, preds, self.hparams.metrics)
+ res_pairwise = cal_metric(
+ group_labels, group_preds, self.hparams.pairwise_metrics
+ )
+ res.update(res_pairwise)
+ return res
+
+ def predict(self, infile_name, outfile_name):
+ """Make predictions on the given data, and output predicted scores to a file.
+
+ Args:
+ infile_name (str): Input file name.
+ outfile_name (str): Output file name.
+
+ Returns:
+ obj: An instance of self.
+ """
+
+ load_sess = self.sess
+ with tf.gfile.GFile(outfile_name, "w") as wt:
+ for batch_data_input in self.iterator.load_data_from_file(
+ infile_name, batch_num_ngs=0
+ ):
+ if batch_data_input:
+ step_pred = self.infer(load_sess, batch_data_input)
+ step_pred = np.reshape(step_pred, -1)
+ wt.write("\n".join(map(str, step_pred)))
+ wt.write("\n")
+ return self
+
+ def _build_embedding(self):
+ """The field embedding layer. Initialization of embedding variables."""
+ hparams = self.hparams
+ self.user_vocab_length = len(load_dict(hparams.user_vocab))
+ self.item_vocab_length = len(load_dict(hparams.item_vocab))
+ self.cate_vocab_length = len(load_dict(hparams.cate_vocab))
+ self.user_embedding_dim = hparams.user_embedding_dim
+ self.item_embedding_dim = hparams.item_embedding_dim
+ self.cate_embedding_dim = hparams.cate_embedding_dim
+
+ with tf.variable_scope("embedding", initializer=self.initializer):
+ self.user_lookup = tf.get_variable(
+ name="user_embedding",
+ shape=[self.user_vocab_length, self.user_embedding_dim],
+ dtype=tf.float32,
+ )
+ self.item_lookup = tf.get_variable(
+ name="item_embedding",
+ shape=[self.item_vocab_length, self.item_embedding_dim],
+ dtype=tf.float32,
+ )
+ self.cate_lookup = tf.get_variable(
+ name="cate_embedding",
+ shape=[self.cate_vocab_length, self.cate_embedding_dim],
+ dtype=tf.float32,
+ )
+
+ def _lookup_from_embedding(self):
+ """Lookup from embedding variables. A dropout layer follows lookup operations.
+ """
+ self.user_embedding = tf.nn.embedding_lookup(
+ self.user_lookup, self.iterator.users
+ )
+ tf.summary.histogram("user_embedding_output", self.user_embedding)
+
+ self.item_embedding = tf.nn.embedding_lookup(
+ self.item_lookup, self.iterator.items
+ )
+ self.item_history_embedding = tf.nn.embedding_lookup(
+ self.item_lookup, self.iterator.item_history
+ )
+ tf.summary.histogram(
+ "item_history_embedding_output", self.item_history_embedding
+ )
+
+ self.cate_embedding = tf.nn.embedding_lookup(
+ self.cate_lookup, self.iterator.cates
+ )
+ self.cate_history_embedding = tf.nn.embedding_lookup(
+ self.cate_lookup, self.iterator.item_cate_history
+ )
+ tf.summary.histogram(
+ "cate_history_embedding_output", self.cate_history_embedding
+ )
+
+ involved_items = tf.concat(
+ [
+ tf.reshape(self.iterator.item_history, [-1]),
+ tf.reshape(self.iterator.items, [-1]),
+ ],
+ -1,
+ )
+ self.involved_items, _ = tf.unique(involved_items)
+ involved_item_embedding = tf.nn.embedding_lookup(
+ self.item_lookup, self.involved_items
+ )
+ self.embed_params.append(involved_item_embedding)
+
+ involved_cates = tf.concat(
+ [
+ tf.reshape(self.iterator.item_cate_history, [-1]),
+ tf.reshape(self.iterator.cates, [-1]),
+ ],
+ -1,
+ )
+ self.involved_cates, _ = tf.unique(involved_cates)
+ involved_cate_embedding = tf.nn.embedding_lookup(
+ self.cate_lookup, self.involved_cates
+ )
+ self.embed_params.append(involved_cate_embedding)
+
+ self.target_item_embedding = tf.concat(
+ [self.item_embedding, self.cate_embedding], 1
+ )
+ tf.summary.histogram("target_item_embedding_output", self.target_item_embedding)
+
+ # dropout after embedding
+ self.user_embedding = self._dropout(
+ self.user_embedding, keep_prob=1 - self.hparams.embedding_dropout
+ )
+ self.item_history_embedding = self._dropout(
+ self.item_history_embedding, keep_prob=1 - self.hparams.embedding_dropout
+ )
+ self.cate_history_embedding = self._dropout(
+ self.cate_history_embedding, keep_prob=1 - self.hparams.embedding_dropout
+ )
+ self.target_item_embedding = self._dropout(
+ self.target_item_embedding, keep_prob=1 - self.hparams.embedding_dropout
+ )
+
+ def _add_norm(self):
+ """Regularization for embedding variables and other variables."""
+ all_variables, embed_variables = (
+ tf.trainable_variables(),
+ tf.trainable_variables(self.sequential_scope._name + "/embedding"),
+ )
+ layer_params = list(set(all_variables) - set(embed_variables))
+ self.layer_params.extend(layer_params)
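+
+
+# Usage sketch (assumption, not taken from the original module): concrete models
+# subclass SequentialBaseModel and implement only `_build_seq_graph`, returning the
+# tensor that `_fcn_net` maps to the final logit. A hypothetical minimal subclass
+# could look like:
+#
+#   class MeanPoolModel(SequentialBaseModel):
+#       def _build_seq_graph(self):
+#           hist = tf.concat(
+#               [self.item_history_embedding, self.cate_history_embedding], 2
+#           )
+#           user_embed = tf.reduce_mean(hist, 1)
+#           return tf.concat([user_embed, self.target_item_embedding], 1)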
diff --git a/reco_utils/recommender/deeprec/models/sequential/sli_rec.py b/reco_utils/recommender/deeprec/models/sequential/sli_rec.py
new file mode 100644
index 0000000000..5826fc9cc7
--- /dev/null
+++ b/reco_utils/recommender/deeprec/models/sequential/sli_rec.py
@@ -0,0 +1,133 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import tensorflow as tf
+from reco_utils.recommender.deeprec.models.sequential.sequential_base_model import (
+ SequentialBaseModel,
+)
+from tensorflow.nn import dynamic_rnn
+from reco_utils.recommender.deeprec.models.sequential.rnn_cell_implement import (
+ Time4LSTMCell,
+)
+
+__all__ = ["SLI_RECModel"]
+
+
+class SLI_RECModel(SequentialBaseModel):
+ """SLI Rec model
+
+ Z. Yu, J. Lian, A. Mahmoody, G. Liu and X. Xie, "Adaptive User Modeling with
+    Long and Short-Term Preferences for Personalized Recommendation", in Proceedings of
+    the 28th International Joint Conference on Artificial Intelligence, IJCAI’19,
+ Pages 4213-4219, AAAI Press, 2019.
+ """
+ def _build_seq_graph(self):
+ """The main function to create sli_rec model.
+
+ Returns:
+            obj: the output of the sli_rec section.
+ """
+ hparams = self.hparams
+ with tf.variable_scope("sli_rec"):
+ hist_input = tf.concat(
+ [self.item_history_embedding, self.cate_history_embedding], 2
+ )
+ self.mask = self.iterator.mask
+ self.sequence_length = tf.reduce_sum(self.mask, 1)
+
+ with tf.variable_scope("long_term_asvd"):
+ att_outputs1 = self._attention(hist_input, hparams.attention_size)
+ att_fea1 = tf.reduce_sum(att_outputs1, 1)
+ tf.summary.histogram("att_fea1", att_fea1)
+
+ item_history_embedding_new = tf.concat(
+ [
+ self.item_history_embedding,
+ tf.expand_dims(self.iterator.time_from_first_action, -1),
+ ],
+ -1,
+ )
+ item_history_embedding_new = tf.concat(
+ [
+ item_history_embedding_new,
+ tf.expand_dims(self.iterator.time_to_now, -1),
+ ],
+ -1,
+ )
+ with tf.variable_scope("rnn"):
+ rnn_outputs, final_state = dynamic_rnn(
+ Time4LSTMCell(hparams.hidden_size),
+ inputs=item_history_embedding_new,
+ sequence_length=self.sequence_length,
+ dtype=tf.float32,
+ scope="time4lstm",
+ )
+ tf.summary.histogram("LSTM_outputs", rnn_outputs)
+
+ with tf.variable_scope("attention_fcn"):
+ att_outputs2 = self._attention_fcn(
+ self.target_item_embedding, rnn_outputs
+ )
+ att_fea2 = tf.reduce_sum(att_outputs2, 1)
+ tf.summary.histogram("att_fea2", att_fea2)
+
+ # ensemble
+ with tf.name_scope("alpha"):
+ concat_all = tf.concat(
+ [
+ self.target_item_embedding,
+ att_fea1,
+ att_fea2,
+ tf.expand_dims(self.iterator.time_to_now[:, -1], -1),
+ ],
+ 1,
+ )
+ last_hidden_nn_layer = concat_all
+ alpha_logit = self._fcn_net(
+ last_hidden_nn_layer, hparams.att_fcn_layer_sizes, scope="fcn_alpha"
+ )
+ alpha_output = tf.sigmoid(alpha_logit)
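+            # Fuse the long-term (ASVD attention, att_fea1) and short-term
+            # (time-aware RNN attention, att_fea2) user representations with the
+            # learned gate alpha.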
+ user_embed = att_fea1 * alpha_output + att_fea2 * (1.0 - alpha_output)
+ model_output = tf.concat([user_embed, self.target_item_embedding], 1)
+ tf.summary.histogram("model_output", model_output)
+ return model_output
+
+ def _attention_fcn(self, query, user_embedding):
+ """Apply attention by fully connected layers.
+
+ Args:
+            query (obj): The embedding of the target item, which is regarded as the query in attention operations.
+            user_embedding (obj): The output of the RNN layers, regarded as the user modeling.
+
+ Returns:
+            obj: The attention-weighted user modeling outputs.
+ """
+ hparams = self.hparams
+ with tf.variable_scope("attention_fcn"):
+ query_size = query.shape[1].value
+ boolean_mask = tf.equal(self.mask, tf.ones_like(self.mask))
+
+ attention_mat = tf.get_variable(
+ name="attention_mat",
+ shape=[user_embedding.shape.as_list()[-1], query_size],
+ initializer=self.initializer,
+ )
+            att_inputs = tf.tensordot(user_embedding, attention_mat, [[2], [0]])
+
+ queries = tf.reshape(
+ tf.tile(query, [1, att_inputs.shape[1].value]), tf.shape(att_inputs)
+ )
+ last_hidden_nn_layer = tf.concat(
+ [att_inputs, queries, att_inputs - queries, att_inputs * queries], -1
+ )
+ att_fnc_output = self._fcn_net(
+ last_hidden_nn_layer, hparams.att_fcn_layer_sizes, scope="att_fcn"
+ )
+ att_fnc_output = tf.squeeze(att_fnc_output, -1)
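+            # Replace scores at padded positions with a very large negative value so
+            # that, after the softmax below, their attention weights are effectively 0.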
+ mask_paddings = tf.ones_like(att_fnc_output) * (-(2 ** 32) + 1)
+ att_weights = tf.nn.softmax(
+ tf.where(boolean_mask, att_fnc_output, mask_paddings),
+ name="att_weights",
+ )
+ output = user_embedding * tf.expand_dims(att_weights, -1)
+ return output
diff --git a/reco_utils/recommender/deeprec/models/xDeepFM.py b/reco_utils/recommender/deeprec/models/xDeepFM.py
index 049dee1557..009c52d6f4 100644
--- a/reco_utils/recommender/deeprec/models/xDeepFM.py
+++ b/reco_utils/recommender/deeprec/models/xDeepFM.py
@@ -11,6 +11,13 @@
class XDeepFMModel(BaseModel):
+ """xDeepFM model
+
+ J. Lian, X. Zhou, F. Zhang, Z. Chen, X. Xie, G. Sun, "xDeepFM: Combining Explicit
+ and Implicit Feature Interactions for Recommender Systems", in Proceedings of the
+ 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining,
+ KDD 2018, London, 2018.
+ """
def _build_graph(self):
"""The main function to create xdeepfm's logic.
diff --git a/scripts/databricks_install.py b/scripts/databricks_install.py
index 55254788bb..a7b865f4d1 100644
--- a/scripts/databricks_install.py
+++ b/scripts/databricks_install.py
@@ -235,7 +235,7 @@ def prepare_for_operationalization(
PYPI_O16N_LIBS = [
"azure-cli==2.0.56",
- "azureml-sdk[databricks]==1.0.8",
+ "azureml-sdk[databricks]==1.0.69",
PIP_BASE["pydocumentdb"],
]
diff --git a/scripts/generate_conda_file.py b/scripts/generate_conda_file.py
index 1a37efb1ba..1132fde112 100644
--- a/scripts/generate_conda_file.py
+++ b/scripts/generate_conda_file.py
@@ -49,28 +49,32 @@
"pytest": "pytest>=3.6.4",
"pytorch": "pytorch-cpu>=1.0.0",
"seaborn": "seaborn>=0.8.1",
- "scikit-learn": "scikit-learn==0.19.1",
+ "scikit-learn": "scikit-learn>=0.19.1",
"scipy": "scipy>=1.0.0",
"scikit-surprise": "scikit-surprise>=1.0.6",
"swig": "swig==3.0.12",
"tensorflow": "tensorflow==1.12.0",
"lightgbm": "lightgbm==2.2.1",
+ "cmake": "cmake==3.14.0",
"cornac": "cornac>=1.1.2",
- "fastai": "fastai==1.0.46",
"papermill": "papermill==0.19.1",
+ "tqdm": "tqdm>=4.31.1",
}
CONDA_PYSPARK = {"pyarrow": "pyarrow>=0.8.0", "pyspark": "pyspark==2.3.1"}
CONDA_GPU = {
+ "fastai": "fastai==1.0.46",
"numba": "numba>=0.38.1",
"pytorch": "pytorch>=1.0.0",
"tensorflow": "tensorflow-gpu==1.12.0",
}
PIP_BASE = {
- "azureml-sdk[notebooks,tensorboard]": "azureml-sdk[notebooks,tensorboard]==1.0.18",
+ "azureml-sdk[notebooks,tensorboard]": "azureml-sdk[notebooks,tensorboard]==1.0.69",
"azure-storage": "azure-storage>=0.36.0",
+ "azure-cli-core": "azure-cli-core>=2.0.75",
+ "azure-mgmt-cosmosdb": "azure-mgmt-cosmosdb>=0.8.0",
"black": "black>=18.6b4",
"category_encoders": "category_encoders>=1.3.0",
"dataclasses": "dataclasses>=0.6",
@@ -81,18 +85,14 @@
"nbconvert": "nbconvert==5.5.0",
"pydocumentdb": "pydocumentdb>=2.3.3",
"pymanopt": "pymanopt==0.2.3",
- "tqdm": "tqdm==4.31.1",
+ "xlearn": "xlearn==0.40a1"
}
PIP_GPU = {"nvidia-ml-py3": "nvidia-ml-py3>=7.352.0"}
PIP_PYSPARK = {"databricks-cli": "databricks-cli==0.8.6"}
-PIP_DARWIN = {
- "nni": "nni==0.5.2.1.1",
-}
-PIP_LINUX = {
- "nni": "nni==0.5.2.1.1",
-}
+PIP_DARWIN = {"nni": "nni==0.5.2.1.1"}
+PIP_LINUX = {"nni": "nni==0.5.2.1.1"}
PIP_WIN32 = {}
@@ -156,14 +156,14 @@
pip_packages.update(PIP_GPU)
# check for os platform support
- if platform == 'darwin':
+ if platform == "darwin":
pip_packages.update(PIP_DARWIN)
- elif platform.startswith('linux'):
+ elif platform.startswith("linux"):
pip_packages.update(PIP_LINUX)
- elif platform == 'win32':
+ elif platform == "win32":
pip_packages.update(PIP_WIN32)
else:
- raise Exception('Unsupported platform, must be Windows, Linux, or macOS')
+ raise Exception("Unsupported platform, must be Windows, Linux, or macOS")
# write out yaml file
conda_file = "{}.yaml".format(conda_env)
diff --git a/tests/ci/README.md b/tests/ci/README.md
index 0ab70b0a64..d34d6a7835 100644
--- a/tests/ci/README.md
+++ b/tests/ci/README.md
@@ -24,3 +24,22 @@ Testing pipelines that run on either a Linux or Windows DSVM agent machine.
* Azure Machine Learning service testing pipeline
Testing pipelines that run within an Azure Machine Learning service workspace.
+
+## Azure DevOps Templates
+Azure DevOps templates are used to reduce duplicated code between repositories.
+For more information, see [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/templates?view=azure-devops).
+
+To use these templates, a GitHub service connection named "AI-GitHub" must also be created within each pipeline.
+For more information, see [here](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/demands?view=azure-devops&tabs=yaml).
+
+### [reco_config_conda_linux.yml@aitemplates](https://github.com/microsoft/AI/blob/master/.ci/steps/reco_config_conda_linux.yml)
+This template is used to install a new conda env on a Linux Virtual Machine. The name of the conda env must be provided.
+
+### [reco_conda_clean_linux.yml@aitemplates](https://github.com/microsoft/AI/blob/master/.ci/steps/reco_conda_clean_linux.yml)
+This template is used to clean up a Linux Virtual Machine after a conda environment has been used. It should be used for a self-hosted Linux agent.
+
+### [reco_config_conda_win.yml@aitemplates](https://github.com/microsoft/AI/blob/master/.ci/steps/reco_conda_config_win.yml)
+This template is used to install a new conda env on a Windows Virtual Machine. The name of the conda env must be provided.
+
+### [reco_conda_clean_win.yml@aitemplates](https://github.com/microsoft/AI/blob/master/.ci/steps/reco_conda_clean_win.yml)
+This template is used to clean up a Windows Virtual Machine after a conda environment has been used. It should be used for a self-hosted Windows agent.
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml
index 4da34bf1d9..16f755f9d5 100644
--- a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml
@@ -1,24 +1,34 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
+
schedules:
-- cron: "0 0 * * *"
- displayName: Daily midnight build
+- cron: "7 0 * * *"
+ displayName: Nightly build master
branches:
include:
- master
+ always: true
+- cron: "7 12 * * *"
+ displayName: Nightly build staging
+ branches:
+ include:
- staging
+ always: true
trigger: none
pr: none
+variables:
+- group: LinuxAgentPool
+
jobs:
- job: nightly
- displayName : 'Nightly tests Linux CPU'
+ displayName: 'Nightly tests Linux CPU'
timeoutInMinutes: 180 # how long to run the job before automatically cancelling
pool:
- name: recolinuxpool
+ name: $(Agent_Pool)
steps:
- bash: |
@@ -31,18 +41,16 @@ jobs:
python ./scripts/generate_conda_file.py --name nightly_reco_base && \
conda env create --quiet -f nightly_reco_base.yaml 2> log
displayName: 'Setup Conda Env'
- timeoutInMinutes: 10
- script: |
. /anaconda/etc/profile.d/conda.sh && \
conda activate nightly_reco_base && \
echo "Smoke tests" && \
- pytest tests/smoke -m "smoke and not spark and not gpu" --junitxml=reports/test-smoke.xml && \
+ pytest tests/smoke --durations 0 -m "smoke and not spark and not gpu" --junitxml=reports/test-smoke.xml && \
echo "Integration tests" && \
- pytest tests/integration -m "integration and not spark and not gpu" --junitxml=reports/test-integration.xml && \
+ pytest tests/integration --durations 0 -m "integration and not spark and not gpu" --junitxml=reports/test-integration.xml && \
conda deactivate
displayName: 'Run Tests'
- timeoutInMinutes: 180
- task: PublishTestResults@2
displayName: 'Publish Test Results '
@@ -56,6 +64,4 @@ jobs:
workingDirectory: tests
displayName: 'Conda remove'
continueOnError: true
- condition: succeededOrFailed()
- timeoutInMinutes: 10
-
+ condition: always() # this step will always run, even if the pipeline is canceled
diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml
index 7da94ce759..a7ae801c0b 100644
--- a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml
@@ -2,23 +2,30 @@
# Licensed under the MIT License.
schedules:
-- cron: "0 0 * * *"
- displayName: Daily midnight build
+- cron: "7 4 * * *"
+ displayName: Nightly build master
branches:
include:
- master
+- cron: "7 16 * * *"
+ displayName: Nightly build staging
+ branches:
+ include:
- staging
trigger: none
pr: none
+variables:
+- group: LinuxAgentPool
+
jobs:
- job: nightly
- displayName : 'Nightly tests Linux GPU'
+ displayName: 'Nightly tests Linux GPU'
timeoutInMinutes: 180 # how long to run the job before automatically cancelling
pool:
- name: recolinuxpool
+ name: $(Agent_Pool)
steps:
- bash: |
@@ -31,18 +38,16 @@ jobs:
python ./scripts/generate_conda_file.py --gpu --name nightly_reco_gpu && \
conda env create --quiet -f nightly_reco_gpu.yaml 2> log
displayName: 'Setup Conda Env'
- timeoutInMinutes: 10
- script: |
. /anaconda/etc/profile.d/conda.sh && \
conda activate nightly_reco_gpu && \
echo "Smoke tests" && \
- pytest tests/smoke -m "smoke and not spark and gpu" --junitxml=reports/test-smoke.xml && \
+ pytest tests/smoke --durations 0 -m "smoke and not spark and gpu" --junitxml=reports/test-smoke.xml && \
echo "Integration tests" && \
- pytest tests/integration -m "integration and not spark and gpu" --junitxml=reports/test-integration.xml && \
+ pytest tests/integration --durations 0 -m "integration and not spark and gpu" --junitxml=reports/test-integration.xml && \
conda deactivate
displayName: 'Run Tests'
- timeoutInMinutes: 180
- task: PublishTestResults@2
displayName: 'Publish Test Results '
@@ -56,6 +61,4 @@ jobs:
workingDirectory: tests
displayName: 'Conda remove'
continueOnError: true
- condition: succeededOrFailed()
- timeoutInMinutes: 10
-
+ condition: always() # this step will always run, even if the pipeline is canceled
diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml
index d109b041e5..8210d7d87d 100644
--- a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml
@@ -2,23 +2,30 @@
# Licensed under the MIT License.
schedules:
-- cron: "0 0 * * *"
- displayName: Daily midnight build
+- cron: "7 8 * * *"
+ displayName: Nightly build master
branches:
include:
- master
+- cron: "7 20 * * *"
+ displayName: Nightly build staging
+ branches:
+ include:
- staging
trigger: none
pr: none
+variables:
+- group: LinuxAgentPool
+
jobs:
- job: nightly
- displayName : 'Nightly tests Linux Spark'
+ displayName: 'Nightly tests Linux Spark'
timeoutInMinutes: 180 # how long to run the job before automatically cancelling
pool:
- name: recolinuxpool
+ name: $(Agent_Pool)
steps:
- bash: |
@@ -31,18 +38,16 @@ jobs:
python ./scripts/generate_conda_file.py --pyspark --name nightly_reco_pyspark && \
conda env create --quiet -f nightly_reco_pyspark.yaml 2> log
displayName: 'Setup Conda Env'
- timeoutInMinutes: 10
- script: |
. /anaconda/etc/profile.d/conda.sh && \
conda activate nightly_reco_pyspark && \
echo "Smoke tests" && \
- pytest tests/smoke -m "smoke and spark and not gpu" --junitxml=reports/test-smoke.xml && \
+ pytest tests/smoke --durations 0 -m "smoke and spark and not gpu" --junitxml=reports/test-smoke.xml && \
echo "Integration tests" && \
- pytest tests/integration -m "integration and spark and not gpu" --junitxml=reports/test-integration.xml && \
+ pytest tests/integration --durations 0 -m "integration and spark and not gpu" --junitxml=reports/test-integration.xml && \
conda deactivate
displayName: 'Run Tests'
- timeoutInMinutes: 180
- task: PublishTestResults@2
displayName: 'Publish Test Results '
@@ -56,5 +61,4 @@ jobs:
workingDirectory: tests
displayName: 'Conda remove'
continueOnError: true
- condition: succeededOrFailed()
- timeoutInMinutes: 10
\ No newline at end of file
+ condition: always() # this step will always run, even if the pipeline is canceled
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_win_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_win_cpu.yml
index 4cf5c8e602..1ddd0ea946 100644
--- a/tests/ci/azure_pipeline_test/dsvm_nightly_win_cpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_nightly_win_cpu.yml
@@ -2,61 +2,67 @@
# Licensed under the MIT License.
schedules:
-- cron: "0 18 * * *"
- displayName: Daily master cpu testing pipeline
+- cron: "7 0 * * *"
+ displayName: Nightly build master
branches:
include:
- master
+- cron: "7 12 * * *"
+ displayName: Nightly build staging
+ branches:
+ include:
+ - staging
trigger: none
pr: none
-pool:
- name: RecommendersAgentPoolWin
- timeoutInMinutes: 180
- cancelTimeoutInMinutes: 180
-
-steps:
-- script: |
- call conda env remove -n nightly_reco_base
- rmdir /s /q C:\Anaconda\envs\nightly_reco_base
- displayName: 'Remove Conda Env if it exists'
-
-- script: |
- python ./scripts/generate_conda_file.py --name nightly_reco_base
- call conda env create -f nightly_reco_base.yaml
- displayName: 'Setup Conda Env'
- timeoutInMinutes: 10
-
-- script: |
- call conda activate nightly_reco_base
- echo "Smoke tests"
- pytest tests/smoke -m "smoke and not spark and not gpu" --junitxml=reports/test-smoke.xml
- echo "Integration tests"
- pytest tests/integration -m "integration and not spark and not gpu" --junitxml=reports/test-integration.xml
- conda deactivate
- displayName: 'Run Tests'
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results '
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-- script: |
- call conda env remove -n nightly_reco_base -y
- rmdir /s /q C:\Anaconda\envs\nightly_reco_base
-
- workingDirectory: tests
- displayName: 'Conda remove'
- continueOnError: true
- condition: succeededOrFailed()
- timeoutInMinutes: 10
-
-- script: |
- del /q /S %LOCALAPPDATA%\Temp\*
- for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
- displayName: 'Remove Temp Files'
- condition: succeededOrFailed()
\ No newline at end of file
+variables:
+- group: WindowsAgentPool
+
+jobs:
+- job: nightly
+ displayName: 'Nightly tests Windows CPU'
+ timeoutInMinutes: 180 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ steps:
+ - script: |
+ call conda env remove -n nightly_reco_base -y
+ if exist C:\Anaconda\envs\nightly_reco_base rmdir /s /q C:\Anaconda\envs\nightly_reco_base
+ displayName: 'Remove Conda Env if it exists'
+
+ - script: |
+ python ./scripts/generate_conda_file.py --name nightly_reco_base
+ call conda env create -f nightly_reco_base.yaml
+ displayName: 'Setup Conda Env'
+
+ - script: |
+ call conda activate nightly_reco_base
+ echo "Smoke tests"
+ pytest tests/smoke --durations 0 -m "smoke and not spark and not gpu" --junitxml=reports/test-smoke.xml
+ echo "Integration tests"
+ pytest tests/integration --durations 0 -m "integration and not spark and not gpu" --junitxml=reports/test-integration.xml
+ displayName: 'Run Tests'
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results '
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ - script: |
+ call conda env remove -n nightly_reco_base -y
+ if exist C:\Anaconda\envs\nightly_reco_base rmdir /s /q C:\Anaconda\envs\nightly_reco_base
+ workingDirectory: tests
+ displayName: 'Conda remove'
+ continueOnError: true
+ condition: always() # this step will always run, even if the pipeline is canceled
+
+ - script: |
+ del /q /S %LOCALAPPDATA%\Temp\*
+ for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
+ displayName: 'Remove Temp Files'
+ condition: succeededOrFailed()
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_win_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_win_gpu.yml
index 59abb62644..5807a1dc11 100644
--- a/tests/ci/azure_pipeline_test/dsvm_nightly_win_gpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_nightly_win_gpu.yml
@@ -2,54 +2,61 @@
# Licensed under the MIT License.
schedules:
-- cron: "0 0 * * *"
- displayName: Daily master gpu test pipeline
+- cron: "7 4 * * *"
+ displayName: Nightly build master
branches:
include:
- master
+- cron: "7 16 * * *"
+ displayName: Nightly build staging
+ branches:
+ include:
+ - staging
trigger: none
pr: none
-pool:
- name: RecommendersAgentPoolWin
- timeoutInMinutes: 180
- cancelTimeoutInMinutes: 180
-
-steps:
-- script: |
- call conda env remove -n nightly_reco_gpu
- rmdir /s /q C:\Anaconda\envs\nightly_reco_gpu
- python ./scripts/generate_conda_file.py --gpu --name nightly_reco_gpu
- conda env create --quiet -f nightly_reco_gpu.yaml --verbose
-
- displayName: 'Setup Conda Env'
- timeoutInMinutes: 10
-
-- script: |
- call conda activate nightly_reco_gpu
- echo "Smoke tests"
- pytest tests/smoke -m "smoke and not spark and gpu" --junitxml=reports/test-smoke.xml
- echo "Integration tests"
- pytest tests/integration -m "integration and not spark and gpu" --junitxml=reports/test-integration.xml
- call conda deactivate
-
- displayName: 'Run python smoke and integration tests'
- timeoutInMinutes: 180
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-- script: |
- call conda env remove -n nightly_reco_gpu -y
- rmdir /s /q C:\Anaconda\envs\nightly_reco_gpu
-
- workingDirectory: tests
- displayName: 'Conda remove'
- continueOnError: true
- timeoutInMinutes: 10
\ No newline at end of file
+variables:
+- group: WindowsAgentPool
+
+jobs:
+- job: nightly
+ displayName: 'Nightly tests Windows GPU'
+ timeoutInMinutes: 180 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ steps:
+ - script: |
+ call conda env remove -n nightly_reco_gpu -y
+ if exist C:\Anaconda\envs\nightly_reco_gpu rmdir /s /q C:\Anaconda\envs\nightly_reco_gpu
+ displayName: 'Remove Conda Env if it exists'
+
+ - script: |
+ python ./scripts/generate_conda_file.py --gpu --name nightly_reco_gpu
+ conda env create --quiet -f nightly_reco_gpu.yaml --verbose
+ displayName: 'Setup Conda Env'
+
+ - script: |
+ call conda activate nightly_reco_gpu
+ echo "Smoke tests"
+ pytest tests/smoke --durations 0 -m "smoke and not spark and gpu" --junitxml=reports/test-smoke.xml
+ echo "Integration tests"
+ pytest tests/integration --durations 0 -m "integration and not spark and gpu" --junitxml=reports/test-integration.xml
+ displayName: 'Run python smoke and integration tests'
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ - script: |
+ call conda env remove -n nightly_reco_gpu -y
+ if exist C:\Anaconda\envs\nightly_reco_gpu rmdir /s /q C:\Anaconda\envs\nightly_reco_gpu
+ workingDirectory: tests
+ displayName: 'Conda remove'
+ continueOnError: true
+ condition: always() # this step will always run, even if the pipeline is canceled
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_win_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_win_pyspark.yml
index 89ae933e1f..9d19e3fd3e 100644
--- a/tests/ci/azure_pipeline_test/dsvm_nightly_win_pyspark.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_nightly_win_pyspark.yml
@@ -2,58 +2,65 @@
# Licensed under the MIT License.
schedules:
-- cron: "0 06 * * *"
- displayName: Daily master spark testing pipeline
+- cron: "7 8 * * *"
+ displayName: Nightly build master
branches:
include:
- master
+- cron: "7 20 * * *"
+ displayName: Nightly build staging
+ branches:
+ include:
+ - staging
trigger: none
pr: none
-pool:
- name: RecommendersAgentPoolWin
- timeoutInMinutes: 180
- cancelTimeoutInMinutes: 180
-
-steps:
-- script: |
- call conda env remove -n nightly_reco_pyspark
- rmdir /s /q C:\Anaconda\envs\nightly_reco_pyspark
- python ./scripts/generate_conda_file.py --pyspark --name nightly_reco_pyspark
- conda env create --quiet -f nightly_reco_pyspark.yaml --verbose
- displayName: 'Setup Conda Env'
- timeoutInMinutes: 10
-
-- script: |
- call conda activate nightly_reco_pyspark
- set SPARK_HOME=
- echo "Smoke tests"
- pytest tests/smoke -m "smoke and spark and not gpu" --junitxml=reports/test-smoke.xml
- echo "Integration tests"
- pytest tests/integration -m "integration and spark and not gpu" --junitxml=reports/test-integration.xml
- conda deactivate
- displayName: 'Run pyspark smoke and integration tests'
- timeoutInMinutes: 180
- env:
- PYSPARK_PYTHON: c:\anaconda\envs\reco_pyspark\python.exe
- PYSPARK_DRIVER_PYTHON: c:\anaconda\envs\reco_pyspark\python.exe
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results '
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-- script: |
- call conda env remove -n nightly_reco_pyspark -y
- rmdir /s /q C:\Anaconda\envs\nightly_reco_pyspark
-
- workingDirectory: tests
- displayName: 'Conda remove'
- continueOnError: true
- condition: succeededOrFailed()
- timeoutInMinutes: 10
+variables:
+- group: WindowsAgentPool
+
+jobs:
+- job: nightly
+ displayName: 'Nightly tests Windows Pyspark'
+ timeoutInMinutes: 180 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ steps:
+ - script: |
+ call conda env remove -n nightly_reco_pyspark -y
+ if exist C:\Anaconda\envs\nightly_reco_pyspark rmdir /s /q C:\Anaconda\envs\nightly_reco_pyspark
+ displayName: 'Remove Conda Env if it exists'
+
+ - script: |
+ python ./scripts/generate_conda_file.py --pyspark --name nightly_reco_pyspark
+ conda env create --quiet -f nightly_reco_pyspark.yaml --verbose
+ displayName: 'Setup Conda Env'
+
+ - script: |
+ call conda activate nightly_reco_pyspark
+ set SPARK_HOME=
+ echo "Smoke tests"
+ pytest tests/smoke --durations 0 -m "smoke and spark and not gpu" --junitxml=reports/test-smoke.xml
+ echo "Integration tests"
+ pytest tests/integration --durations 0 -m "integration and spark and not gpu" --junitxml=reports/test-integration.xml
+ displayName: 'Run pyspark smoke and integration tests'
+ env:
+ PYSPARK_PYTHON: c:\anaconda\envs\reco_pyspark\python.exe
+ PYSPARK_DRIVER_PYTHON: c:\anaconda\envs\reco_pyspark\python.exe
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results '
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+ - script: |
+ call conda env remove -n nightly_reco_pyspark -y
+ if exist C:\Anaconda\envs\nightly_reco_pyspark rmdir /s /q C:\Anaconda\envs\nightly_reco_pyspark
+ workingDirectory: tests
+ displayName: 'Conda remove'
+ continueOnError: true
+ condition: always() # this step will always run, even if the pipeline is canceled
diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml
index 4c87c4e1df..ae7083434d 100644
--- a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml
@@ -11,40 +11,47 @@ trigger:
- staging
- master
-pool:
- name: recolinuxpool
- timeoutInMinutes: 20
-
-# resources:
-# repositories:
-# - repository: common
-# type: github
-# name: microsoft/recommenders
-
-steps:
-- bash: |
- echo "##vso[task.prependpath]/data/anaconda/bin"
- conda env list
- displayName: Add Conda to PATH
-# Uncomment if needed
-# Conda creation can take around 10min
-- bash: |
- python scripts/generate_conda_file.py
- conda env update -n reco_base -f reco_base.yaml
- displayName: 'Creating Conda Environment with dependencies'
-
-- script: |
- . /anaconda/etc/profile.d/conda.sh && \
- conda activate reco_base && \
- pytest tests/unit -m "notebooks and not spark and not gpu" --junitxml=reports/test-unit.xml && \
- conda deactivate
- displayName: 'Run Tests'
- timeoutInMinutes: 20
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
+variables:
+- group: LinuxAgentPool
+
+jobs:
+- job: unit
+ displayName: 'Unit tests Linux CPU on notebooks'
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ steps:
+ - bash: |
+ echo "##vso[task.prependpath]/data/anaconda/bin"
+ conda env list
+ displayName: Add Conda to PATH
+
+ # Uncomment if needed
+ # Conda creation can take around 10min
+ # - bash: |
+ # python scripts/generate_conda_file.py
+ # conda env update -n reco_base -f reco_base.yaml
+ # displayName: 'Creating Conda Environment with dependencies'
+
+ - script: |
+ . /anaconda/etc/profile.d/conda.sh && \
+ conda activate reco_base && \
+ pytest tests/unit --durations 0 -m "notebooks and not spark and not gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ # Uncomment if needed
+ # - script: |
+ # conda env remove -n reco_cpu -y
+ # workingDirectory: tests
+ # displayName: 'Conda remove'
+ # continueOnError: true
+ # condition: succeededOrFailed()
+
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml
index 1ae6776318..a2e7d553c0 100644
--- a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml
@@ -11,47 +11,46 @@ trigger:
- staging
- master
-pool:
- name: recolinuxpool
- timeoutInMinutes: 20
-
-# resources:
-# repositories:
-# - repository: common
-# type: github
-# name: microsoft/recommenders
-
-steps:
-- bash: |
- echo "##vso[task.prependpath]/data/anaconda/bin"
- conda env list
- displayName: Add Conda to PATH
-# Uncomment if needed
-# Conda creation can take around 10min
-- bash: |
- python scripts/generate_conda_file.py --gpu
- conda env update -n reco_gpu -f reco_gpu.yaml
- displayName: 'Creating Conda Environment with dependencies'
-
-- script: |
- . /anaconda/etc/profile.d/conda.sh && \
- conda activate reco_gpu && \
- pytest tests/unit -m "notebooks and not spark and gpu" --junitxml=reports/test-unit.xml && \
- conda deactivate
- displayName: 'Run Tests'
- timeoutInMinutes: 20
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-# - script: |
-# conda env remove -n reco_gpu -y
-# workingDirectory: tests
-# displayName: 'Conda remove'
-# continueOnError: true
-# condition: succeededOrFailed()
-# timeoutInMinutes: 10
+variables:
+- group: LinuxAgentPool
+
+jobs:
+- job: unit
+ displayName: 'Unit tests Linux GPU on notebooks'
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ steps:
+ - bash: |
+ echo "##vso[task.prependpath]/data/anaconda/bin"
+ conda env list
+      displayName: Add Conda to PATH
+
+ # Uncomment if needed
+ # Conda creation can take around 10min
+ # - bash: |
+ # python scripts/generate_conda_file.py --gpu
+ # conda env update -n reco_gpu -f reco_gpu.yaml
+ # displayName: 'Creating Conda Environment with dependencies'
+
+ - script: |
+ . /anaconda/etc/profile.d/conda.sh && \
+ conda activate reco_gpu && \
+ pytest tests/unit --durations 0 -m "notebooks and not spark and gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ # Uncomment if needed
+ # - script: |
+ # conda env remove -n reco_gpu -y
+ # workingDirectory: tests
+ # displayName: 'Conda remove'
+ # continueOnError: true
+ # condition: succeededOrFailed()
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml
index 454393fce3..7b0241aab2 100644
--- a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml
@@ -11,46 +11,51 @@ trigger:
- staging
- master
-pool:
- name: recolinuxpool
- timeoutInMinutes: 20
-
-# resources:
-# repositories:
-# - repository: common
-# type: github
-# name: microsoft/recommenders
-
-steps:
-- bash: |
- echo "##vso[task.prependpath]/data/anaconda/bin"
- conda env list
- displayName: Add Conda to PATH
-# Uncomment if needed
-# Conda creation can take around 10min
-- bash: |
- python scripts/generate_conda_file.py --pyspark
- conda env update -n reco_pyspark -f reco_pyspark.yaml
- displayName: 'Creating Conda Environment with dependencies'
-
-- script: |
- . /anaconda/etc/profile.d/conda.sh && \
- conda activate reco_pyspark && \
- pytest tests/unit -m "notebooks and spark and not gpu" --junitxml=reports/test-unit.xml && \
- conda deactivate
- displayName: 'Run Tests'
- timeoutInMinutes: 20
- env:
- PYSPARK_PYTHON: /anaconda/envs/reco_pyspark/bin/python
- PYSPARK_DRIVER_PYTHON: /anaconda/envs/reco_pyspark/bin/python
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-
-
-
+variables:
+- group: LinuxAgentPool
+
+jobs:
+- job: unit
+ displayName: 'Unit tests Linux PySpark on notebooks'
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ steps:
+ - bash: |
+ echo "##vso[task.prependpath]/data/anaconda/bin"
+ conda env list
+ displayName: Add Conda to PATH
+
+ # Uncomment if needed
+ # Conda creation can take around 10min
+ # - bash: |
+ # python scripts/generate_conda_file.py --pyspark
+ # conda env update -n reco_pyspark -f reco_pyspark.yaml
+ # displayName: 'Creating Conda Environment with dependencies'
+
+ - script: |
+ . /anaconda/etc/profile.d/conda.sh && \
+ conda activate reco_pyspark && \
+ pytest tests/unit --durations 0 -m "notebooks and spark and not gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+ env:
+ PYSPARK_PYTHON: /anaconda/envs/reco_pyspark/bin/python
+ PYSPARK_DRIVER_PYTHON: /anaconda/envs/reco_pyspark/bin/python
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ # Uncomment if needed
+ # - script: |
+ # conda env remove -n reco_pyspark -y
+ # workingDirectory: tests
+ # displayName: 'Conda remove'
+ # continueOnError: true
+ # condition: succeededOrFailed()
+
+
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_win_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_win_cpu.yml
index bdcb6ebb89..d3cee8a5e2 100644
--- a/tests/ci/azure_pipeline_test/dsvm_notebook_win_cpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_notebook_win_cpu.yml
@@ -7,36 +7,40 @@ pr:
- master
# Any commit to this branch will trigger the build.
-trigger: none
-
-pool:
- name: RecommendersAgentPoolWin
- timeoutInMinutes: 20
-
-workspace:
- clean: all
-
-steps:
-- script: |
- python scripts/generate_conda_file.py
- call conda env update -n reco_base -f reco_base.yaml
- call conda activate reco_base
- pytest tests/unit -m "notebooks and not spark and not gpu" --junitxml=reports/test-unit.xml
- call conda deactivate
- displayName: 'Run Tests'
- timeoutInMinutes: 20
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-- script: |
- del /q /S %LOCALAPPDATA%\Temp\*
- for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
- displayName: 'Remove Temp Files'
- enabled: false
- condition: succeededOrFailed()
+trigger:
+- staging
+- master
+variables:
+- group: WindowsAgentPool
+
+jobs:
+- job: unit
+ displayName: 'Unit tests Windows CPU on notebooks'
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ # workspace:
+ # clean: all
+
+ steps:
+ - script: |
+ call conda activate reco_base
+ pytest tests/unit --durations 0 -m "notebooks and not spark and not gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ - script: |
+ del /q /S %LOCALAPPDATA%\Temp\*
+ for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
+ displayName: 'Remove Temp Files'
+ enabled: false
+ condition: succeededOrFailed()
+
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_win_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_win_gpu.yml
index 4c71c28cd5..b3d888ebea 100644
--- a/tests/ci/azure_pipeline_test/dsvm_notebook_win_gpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_notebook_win_gpu.yml
@@ -7,37 +7,41 @@ pr:
- master
# Any commit to this branch will trigger the build.
-trigger: none
-
-pool:
- name: RecommendersAgentPoolWin
- timeoutInMinutes: 20
-
-workspace:
- clean: all
-
-steps:
-- script: |
- python scripts/generate_conda_file.py --gpu
- call conda env create -f reco_gpu.yaml
- call conda activate reco_gpu
- pytest tests/unit -m "notebooks and not spark and gpu" --junitxml=reports/test-unit.xml
- call conda deactivate
- displayName: 'Run Tests'
- continueOnError: true
- timeoutInMinutes: 20
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-- script: |
- del /q /S %LOCALAPPDATA%\Temp\*
- for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"s to this script
- displayName: 'Remove Temp Files'
- enabled: false
- condition: succeededOrFailed()
+trigger:
+- staging
+- master
+variables:
+- group: WindowsAgentPool
+
+jobs:
+- job: unit
+ displayName: 'Unit tests Windows GPU on notebooks'
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ # workspace:
+ # clean: all
+
+ steps:
+ - script: |
+ call conda activate reco_gpu
+ pytest tests/unit --durations 0 -m "notebooks and not spark and gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+ continueOnError: true
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ - script: |
+ del /q /S %LOCALAPPDATA%\Temp\*
+      for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
+ displayName: 'Remove Temp Files'
+ enabled: false
+ condition: succeededOrFailed()
+
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_win_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_win_pyspark.yml
index 1f9ad8fb27..85709dbad4 100644
--- a/tests/ci/azure_pipeline_test/dsvm_notebook_win_pyspark.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_notebook_win_pyspark.yml
@@ -7,31 +7,31 @@ pr:
- master
# Any commit to this branch will trigger the build.
-trigger: none
+trigger:
+- staging
+- master
+
+variables:
+- group: WindowsAgentPool
jobs:
- job: notebook
- displayName : "Notebook tests windows pyspark"
- timeoutInMinutes: 180
-
+ displayName: "Notebook tests windows pyspark"
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
pool:
- name: RecommendersAgentPoolWin
+ name: $(Agent_Pool)
- workspace:
- clean: all
+ # workspace:
+ # clean: all
steps:
- script: |
- python scripts/generate_conda_file.py --pyspark
- call conda env create -f reco_pyspark.yaml
call conda activate reco_pyspark
- pytest tests/unit -m "notebooks and spark and not gpu" --junitxml=reports/test-unit.xml
- call conda deactivate
- displayName: 'Run Tests'
- env:
- PYSPARK_PYTHON: c:\anaconda\envs\reco_pyspark\python.exe
- PYSPARK_DRIVER_PYTHON: c:\anaconda\envs\reco_pyspark\python.exe
- enabled: false
+ pytest tests/unit --durations 0 -m "notebooks and spark and not gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+ env:
+ PYSPARK_PYTHON: c:\anaconda\envs\reco_pyspark\python.exe
+ PYSPARK_DRIVER_PYTHON: c:\anaconda\envs\reco_pyspark\python.exe
- task: PublishTestResults@2
displayName: 'Publish Test Results **/test-*.xml'
@@ -43,6 +43,6 @@ jobs:
- script: |
del /q /S %LOCALAPPDATA%\Temp\*
for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
- displayName: 'Remove Temp Files'
+ displayName: 'Remove Temp Files'
enabled: false
- condition: succeededOrFailed()
+ condition: succeededOrFailed()
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml
index f97db9e6c6..5ed62d3727 100644
--- a/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml
@@ -6,46 +6,51 @@ pr:
- master
- staging
-#Any commit to this branch will trigger the build.
-# Except for markdown files
+# Any commit to this branch will trigger the build.
trigger:
- staging
- master
-pool:
- name: recolinuxpool
- timeoutInMinutes: 20
-
-# resources:
-# repositories:
-# - repository: common
-# type: github
-# name: microsoft/recommenders
-
-steps:
-- bash: |
- echo "##vso[task.prependpath]/data/anaconda/bin"
- conda env list
- displayName: Add Conda to PATH
-# Uncomment if needed
-# # Conda creation can take around 10min
-- bash: |
- python scripts/generate_conda_file.py
- conda env update -n reco_base -f reco_base.yaml
- displayName: 'Creating Conda Environment with dependencies'
-
-- script: |
- . /anaconda/etc/profile.d/conda.sh && \
- conda activate reco_base && \
- pytest tests/unit -m "not notebooks and not spark and not gpu" --junitxml=reports/test-unit.xml && \
- conda deactivate
- displayName: 'Run Tests'
- timeoutInMinutes: 20
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
+variables:
+- group: LinuxAgentPool
+
+jobs:
+- job: unit
+ displayName: "Unit tests Linux CPU"
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ steps:
+ - bash: |
+ echo "##vso[task.prependpath]/data/anaconda/bin"
+ conda env list
+ displayName: Add Conda to PATH
+
+ # Uncomment if needed
+ # Conda creation can take around 10min
+ # - bash: |
+ # python scripts/generate_conda_file.py
+ # conda env update -n reco_base -f reco_base.yaml
+ # displayName: 'Creating Conda Environment with dependencies'
+
+ - script: |
+ . /anaconda/etc/profile.d/conda.sh && \
+ conda activate reco_base && \
+ pytest tests/unit --durations 0 -m "not notebooks and not spark and not gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ # Uncomment if needed
+ # - script: |
+ # conda env remove -n reco_cpu -y
+ # workingDirectory: tests
+ # displayName: 'Conda remove'
+ # continueOnError: true
+ # condition: succeededOrFailed()
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml
index 1b1b33611f..c246f16c8a 100644
--- a/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml
@@ -11,47 +11,46 @@ trigger:
- staging
- master
-pool:
- name: recolinuxpool
- timeoutInMinutes: 20
-
-# resources:
-# repositories:
-# - repository: common
-# type: github
-# name: microsoft/recommenders
-
-steps:
-- bash: |
- echo "##vso[task.prependpath]/data/anaconda/bin"
- conda env list
- displayName: Add Conda to PATH
-# Uncomment if needed
-# Conda creation can take around 10min
-- bash: |
- python scripts/generate_conda_file.py --gpu
- conda env update -n reco_gpu -f reco_gpu.yaml
- displayName: 'Creating Conda Environment with dependencies'
-
-- script: |
- . /anaconda/etc/profile.d/conda.sh && \
- conda activate reco_gpu && \
- pytest tests/unit -m "not notebooks and not spark and gpu" --junitxml=reports/test-unit.xml && \
- conda deactivate
- displayName: 'Run Tests'
- timeoutInMinutes: 20
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-- script: |
- conda env remove -n reco_gpu -y
- workingDirectory: tests
- displayName: 'Conda remove'
- continueOnError: true
- condition: succeededOrFailed()
- timeoutInMinutes: 10
+variables:
+- group: LinuxAgentPool
+
+jobs:
+- job: unit
+ displayName: "Unit tests Linux GPU"
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ steps:
+ - bash: |
+ echo "##vso[task.prependpath]/data/anaconda/bin"
+ conda env list
+ displayName: Add Conda to PATH
+
+ # Uncomment if needed
+ # Conda creation can take around 10min
+ # - bash: |
+ # python scripts/generate_conda_file.py --gpu
+ # conda env update -n reco_gpu -f reco_gpu.yaml
+ # displayName: 'Creating Conda Environment with dependencies'
+
+ - script: |
+ . /anaconda/etc/profile.d/conda.sh && \
+ conda activate reco_gpu && \
+ pytest tests/unit --durations 0 -m "not notebooks and not spark and gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ # Uncomment if needed
+ # - script: |
+ # conda env remove -n reco_gpu -y
+ # workingDirectory: tests
+ # displayName: 'Conda remove'
+ # continueOnError: true
+ # condition: succeededOrFailed()
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml
index a4961dcf4d..e7caa86c20 100644
--- a/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml
@@ -11,44 +11,49 @@ trigger:
- staging
- master
-pool:
- name: recolinuxpool
- timeoutInMinutes: 20
-
-# resources:
-# repositories:
-# - repository: common
-# type: github
-# name: microsoft/recommenders
-
-steps:
-- bash: |
- echo "##vso[task.prependpath]/data/anaconda/bin"
- conda env list
- displayName: Add Conda to PATH
-# Uncomment if needed
-# Conda creation can take around 10min
-- bash: |
- python scripts/generate_conda_file.py --pyspark
- conda env update -n reco_pyspark -f reco_pyspark.yaml
- displayName: 'Creating Conda Environment with dependencies'
-
-- script: |
- . /anaconda/etc/profile.d/conda.sh && \
- conda activate reco_pyspark && \
- pytest tests/unit -m "not notebooks and spark and not gpu" --junitxml=reports/test-unit.xml && \
- conda deactivate
- displayName: 'Run Tests'
- timeoutInMinutes: 20
- env:
- PYSPARK_PYTHON: /anaconda/envs/reco_pyspark/bin/python
- PYSPARK_DRIVER_PYTHON: /anaconda/envs/reco_pyspark/bin/python
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-
+variables:
+- group: LinuxAgentPool
+
+jobs:
+- job: unit
+ displayName: "Unit tests Linux PySpark"
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ steps:
+ - bash: |
+ echo "##vso[task.prependpath]/data/anaconda/bin"
+ conda env list
+ displayName: Add Conda to PATH
+
+ # Uncomment if needed
+ # Conda creation can take around 10min
+ # - bash: |
+ # python scripts/generate_conda_file.py --pyspark
+ # conda env update -n reco_pyspark -f reco_pyspark.yaml
+ # displayName: 'Creating Conda Environment with dependencies'
+
+ - script: |
+ . /anaconda/etc/profile.d/conda.sh && \
+ conda activate reco_pyspark && \
+ pytest tests/unit --durations 0 -m "not notebooks and spark and not gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+ env:
+ PYSPARK_PYTHON: /anaconda/envs/reco_pyspark/bin/python
+ PYSPARK_DRIVER_PYTHON: /anaconda/envs/reco_pyspark/bin/python
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ # Uncomment if needed
+ # - script: |
+ # conda env remove -n reco_pyspark -y
+ # workingDirectory: tests
+ # displayName: 'Conda remove'
+ # continueOnError: true
+ # condition: succeededOrFailed()
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_win_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_unit_win_cpu.yml
index 2f735d1046..cecade08a9 100644
--- a/tests/ci/azure_pipeline_test/dsvm_unit_win_cpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_unit_win_cpu.yml
@@ -7,36 +7,40 @@ pr:
- master
# Any commit to this branch will trigger the build.
-trigger: none
-
-pool:
- name: RecommendersAgentPoolWin
- timeoutInMinutes: 20
-
-workspace:
- clean: all
-
-steps:
-- script: |
- python scripts/generate_conda_file.py
- call conda env update -n reco_base -f reco_base.yaml
- call conda activate reco_base
- pytest tests/unit --durations 0 -m "not notebooks and not spark and not gpu" --junitxml=reports/test-unit.xml
- call conda deactivate
- displayName: 'Run Tests'
- timeoutInMinutes: 20
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-- script: |
- del /q /S %LOCALAPPDATA%\Temp\*
- for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
- displayName: 'Remove Temp Files'
- enabled: false
- condition: succeededOrFailed()
+trigger:
+- staging
+- master
+variables:
+- group: WindowsAgentPool
+
+jobs:
+- job: unit
+ displayName: 'Unit tests Windows CPU'
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ # workspace:
+ # clean: all
+
+ steps:
+ - script: |
+ call conda activate reco_base
+ pytest tests/unit --durations 0 -m "not notebooks and not spark and not gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ - script: |
+ del /q /S %LOCALAPPDATA%\Temp\*
+ for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
+ displayName: 'Remove Temp Files'
+ enabled: false
+ condition: succeededOrFailed()
+
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_win_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_unit_win_gpu.yml
index 7a4e0040d8..fb709eef24 100644
--- a/tests/ci/azure_pipeline_test/dsvm_unit_win_gpu.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_unit_win_gpu.yml
@@ -7,36 +7,40 @@ pr:
- master
# Any commit to this branch will trigger the build.
-trigger: none
-
-pool:
- name: RecommendersAgentPoolWin
- timeoutInMinutes: 20
-
-workspace:
- clean: all
-
-steps:
-- script: |
- python scripts/generate_conda_file.py --gpu
- call conda env update -n reco_gpu -f reco_gpu.yaml
- call conda activate reco_gpu
- pytest tests/unit -m "not notebooks and not spark and gpu" --junitxml=reports/test-unit.xml
- call conda deactivate
- displayName: 'Run Tests'
- timeoutInMinutes: 20
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results **/test-*.xml'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-- script: |
- del /q /S %LOCALAPPDATA%\Temp\*
- for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
- displayName: 'Remove Temp Files'
- enabled: false
- condition: succeededOrFailed()
+trigger:
+- staging
+- master
+variables:
+- group: WindowsAgentPool
+
+jobs:
+- job: unit
+ displayName: 'Unit tests Windows GPU'
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ # workspace:
+ # clean: all
+
+ steps:
+ - script: |
+ call conda activate reco_gpu
+ pytest tests/unit --durations 0 -m "not notebooks and not spark and gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results **/test-*.xml'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ - script: |
+ del /q /S %LOCALAPPDATA%\Temp\*
+ for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
+ displayName: 'Remove Temp Files'
+ enabled: false
+ condition: succeededOrFailed()
+
\ No newline at end of file
diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_win_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_unit_win_pyspark.yml
index e2b980a637..457dcd8fcc 100644
--- a/tests/ci/azure_pipeline_test/dsvm_unit_win_pyspark.yml
+++ b/tests/ci/azure_pipeline_test/dsvm_unit_win_pyspark.yml
@@ -7,38 +7,42 @@ pr:
- master
# Any commit to this branch will trigger the build.
-trigger: none
-
-pool:
- name: RecommendersAgentPoolWin
- timeoutInMinutes: 20
-
-workspace:
- clean: all
-
-steps:
-- script: |
- python scripts/generate_conda_file.py --pyspark
- call conda env update -n reco_pyspark -f reco_pyspark.yaml
- call conda activate reco_pyspark
- pytest tests/unit -m "not notebooks and spark and not gpu" --junitxml=reports/test-unit.xml
- call conda deactivate
- displayName: 'Run Tests'
- timeoutInMinutes: 20
- env:
- PYSPARK_PYTHON: /anaconda/envs/reco_pyspark/bin/python
- PYSPARK_DRIVER_PYTHON: /anaconda/envs/reco_pyspark/bin/python
-
-- task: PublishTestResults@2
- displayName: 'Publish Test Results'
- inputs:
- testResultsFiles: '**/test-*.xml'
- failTaskOnFailedTests: true
- condition: succeededOrFailed()
-
-- script: |
- del /q /S %LOCALAPPDATA%\Temp\*
- for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
- displayName: 'Remove Temp Files'
- enabled: false
- condition: succeededOrFailed()
+trigger:
+- staging
+- master
+
+variables:
+- group: WindowsAgentPool
+
+jobs:
+- job: unit
+  displayName: 'Unit tests Windows PySpark'
+ timeoutInMinutes: 20 # how long to run the job before automatically cancelling
+ pool:
+ name: $(Agent_Pool)
+
+ # workspace:
+ # clean: all
+
+ steps:
+ - script: |
+ call conda activate reco_pyspark
+ pytest tests/unit --durations 0 -m "not notebooks and spark and not gpu" --junitxml=reports/test-unit.xml
+ displayName: 'Run Tests'
+ env:
+ PYSPARK_PYTHON: /anaconda/envs/reco_pyspark/bin/python
+ PYSPARK_DRIVER_PYTHON: /anaconda/envs/reco_pyspark/bin/python
+
+ - task: PublishTestResults@2
+ displayName: 'Publish Test Results'
+ inputs:
+ testResultsFiles: '**/test-*.xml'
+ failTaskOnFailedTests: true
+ condition: succeededOrFailed()
+
+ - script: |
+ del /q /S %LOCALAPPDATA%\Temp\*
+ for /d %%i in (%LOCALAPPDATA%\Temp\*) do @rmdir /s /q "%%i"
+ displayName: 'Remove Temp Files'
+ enabled: false
+ condition: succeededOrFailed()
\ No newline at end of file
diff --git a/tests/conftest.py b/tests/conftest.py
index 821185089f..98b27f9bd7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -207,6 +207,9 @@ def notebooks():
"wide_deep": os.path.join(
folder_notebooks, "00_quick_start", "wide_deep_movielens.ipynb"
),
+ "slirec_quickstart": os.path.join(
+ folder_notebooks, "00_quick_start", "sequential_recsys_amazondataset.ipynb"
+ ),
"data_split": os.path.join(
folder_notebooks, "01_prepare_data", "data_split.ipynb"
),
@@ -234,6 +237,12 @@ def notebooks():
"mmlspark_lightgbm_criteo": os.path.join(
folder_notebooks, "02_model", "mmlspark_lightgbm_criteo.ipynb"
),
+ "cornac_bpr_deep_dive": os.path.join(
+ folder_notebooks, "02_model", "cornac_bpr_deep_dive.ipynb"
+ ),
+ "xlearn_fm_deep_dive": os.path.join(
+ folder_notebooks, "02_model", "fm_deep_dive.ipynb"
+ ),
"evaluation": os.path.join(folder_notebooks, "03_evaluate", "evaluation.ipynb"),
"spark_tuning": os.path.join(
folder_notebooks, "04_model_select_and_optimize", "tuning_spark_als.ipynb"
@@ -244,10 +253,5 @@ def notebooks():
"nni_tuning_svd": os.path.join(
folder_notebooks, "04_model_select_and_optimize", "nni_surprise_svd.ipynb"
),
- "cornac_bpr_deep_dive": os.path.join(
- folder_notebooks,
- "02_model",
- "cornac_bpr_deep_dive.ipynb"
- )
}
return paths
diff --git a/tests/integration/test_notebooks_gpu.py b/tests/integration/test_notebooks_gpu.py
index 79842de5c0..8b69829a46 100644
--- a/tests/integration/test_notebooks_gpu.py
+++ b/tests/integration/test_notebooks_gpu.py
@@ -5,6 +5,7 @@
import pytest
from reco_utils.common.gpu_utils import get_number_gpus
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
+import os
TOL = 0.5
ABS_TOL = 0.05
@@ -213,3 +214,42 @@ def test_wide_deep_integration(notebooks, size, steps, expected_values, seed, tm
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
+
+@pytest.mark.sequential
+@pytest.mark.integration
+@pytest.mark.gpu
+@pytest.mark.parametrize(
+ "yaml_file, data_path, epochs, batch_size, expected_values, seed",
+ [
+ (
+ "reco_utils/recommender/deeprec/config/sli_rec.yaml",
+ os.path.join("tests", "resources", "deeprec", "slirec"),
+ 10,
+ 400,
+ {
+ "res_syn": {
+ "auc": 0.7183,
+ "logloss": 0.6045,
+ },
+ },
+ 2019,
+ )
+ ],
+)
+def test_slirec_quickstart_integration(notebooks, yaml_file, data_path, epochs, batch_size, expected_values, seed):
+ notebook_path = notebooks["slirec_quickstart"]
+
+ params = {
+ "yaml_file": yaml_file,
+ "data_path": data_path,
+ "EPOCHS": epochs,
+ "BATCH_SIZE": batch_size,
+ "RANDOM_SEED": seed,
+ }
+ pm.execute_notebook(
+ notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
+ )
+ results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
+ for key, value in expected_values.items():
+ assert results[key]["auc"] == pytest.approx(value["auc"], rel=TOL, abs=ABS_TOL)
+ assert results[key]["logloss"] == pytest.approx(value["logloss"], rel=TOL, abs=ABS_TOL)
diff --git a/tests/integration/test_notebooks_python.py b/tests/integration/test_notebooks_python.py
index 735e930fa3..b9c9ec298e 100644
--- a/tests/integration/test_notebooks_python.py
+++ b/tests/integration/test_notebooks_python.py
@@ -190,15 +190,7 @@ def test_wikidata_integration(notebooks, tmp):
@pytest.mark.parametrize(
"size, expected_values",
[
- (
- "1m",
- dict(
- map=0.081390,
- ndcg=0.406627,
- precision=0.373228,
- recall=0.132444,
- ),
- ),
+ ("1m", dict(map=0.081390, ndcg=0.406627, precision=0.373228, recall=0.132444)),
# 10m works but takes too long
],
)
@@ -214,3 +206,17 @@ def test_cornac_bpr_integration(notebooks, size, expected_values):
for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
+
+
+def test_xlearn_fm_integration(notebooks):
+ notebook_path = notebooks["xlearn_fm_deep_dive"]
+ pm.execute_notebook(
+ notebook_path,
+ OUTPUT_NOTEBOOK,
+ kernel_name=KERNEL_NAME,
+ parameters=dict(LEARNING_RATE=0.2, EPOCH=10),
+ )
+ results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
+
+ assert results["auc_score"] == pytest.approx(0.75, rel=TOL, abs=ABS_TOL)
+
diff --git a/tests/smoke/test_deeprec_model.py b/tests/smoke/test_deeprec_model.py
index 3086c508ff..10e80711b2 100644
--- a/tests/smoke/test_deeprec_model.py
+++ b/tests/smoke/test_deeprec_model.py
@@ -13,6 +13,9 @@
from reco_utils.recommender.deeprec.models.dkn import DKN
from reco_utils.recommender.deeprec.IO.iterator import FFMTextIterator
from reco_utils.recommender.deeprec.IO.dkn_iterator import DKNTextIterator
+from reco_utils.recommender.deeprec.IO.sequential_iterator import SequentialIterator
+from reco_utils.recommender.deeprec.models.sequential.sli_rec import SLI_RECModel
+from reco_utils.dataset.amazon_reviews import download_and_extract, data_preprocessing
@pytest.fixture
@@ -78,3 +81,77 @@ def test_model_dkn(resource_path):
assert isinstance(model.fit(train_file, valid_file), BaseModel)
assert model.run_eval(valid_file) is not None
+
+@pytest.mark.smoke
+@pytest.mark.gpu
+@pytest.mark.deeprec
+@pytest.mark.sequential
+def test_model_slirec(resource_path):
+ data_path = os.path.join(resource_path, "..", "resources", "deeprec", "slirec")
+ yaml_file = os.path.join(
+ resource_path,
+ "..",
+ "..",
+ "reco_utils",
+ "recommender",
+ "deeprec",
+ "config",
+ "sli_rec.yaml",
+ )
+ train_file = os.path.join(data_path, r"train_data")
+ valid_file = os.path.join(data_path, r"valid_data")
+ test_file = os.path.join(data_path, r"test_data")
+ output_file = os.path.join(data_path, "output.txt")
+ train_num_ngs = (
+ 4 # number of negative instances with a positive instance for training
+ )
+ valid_num_ngs = (
+ 4 # number of negative instances with a positive instance for validation
+ )
+ test_num_ngs = (
+ 9 # number of negative instances with a positive instance for testing
+ )
+
+ if not os.path.exists(train_file):
+ user_vocab = os.path.join(data_path, r"user_vocab.pkl")
+ item_vocab = os.path.join(data_path, r"item_vocab.pkl")
+ cate_vocab = os.path.join(data_path, r"category_vocab.pkl")
+ reviews_name = "reviews_Movies_and_TV_5.json"
+ meta_name = "meta_Movies_and_TV.json"
+ reviews_file = os.path.join(data_path, reviews_name)
+ meta_file = os.path.join(data_path, meta_name)
+ sample_rate = (
+ 0.005 # sample a small item set for training and testing here for example
+ )
+
+ input_files = [
+ reviews_file,
+ meta_file,
+ train_file,
+ valid_file,
+ test_file,
+ user_vocab,
+ item_vocab,
+ cate_vocab,
+ ]
+ download_and_extract(reviews_name, reviews_file)
+ download_and_extract(meta_name, meta_file)
+ data_preprocessing(
+ *input_files,
+ sample_rate=sample_rate,
+ valid_num_ngs=valid_num_ngs,
+ test_num_ngs=test_num_ngs
+ )
+
+ hparams = prepare_hparams(
+ yaml_file, learning_rate=0.01, epochs=3, train_num_ngs=train_num_ngs
+ ) # confirm train_num_ngs before initializing a SLi_Rec model.
+ assert hparams is not None
+
+ input_creator = SequentialIterator
+ model = SLI_RECModel(hparams, input_creator)
+ assert model.run_eval(test_file, num_ngs=test_num_ngs) is not None
+ assert isinstance(
+ model.fit(train_file, valid_file, valid_num_ngs=valid_num_ngs), BaseModel
+ )
+ assert model.predict(test_file, output_file) is not None
diff --git a/tests/unit/test_deeprec_model.py b/tests/unit/test_deeprec_model.py
index 0c9911f127..4ccca82a76 100644
--- a/tests/unit/test_deeprec_model.py
+++ b/tests/unit/test_deeprec_model.py
@@ -3,11 +3,17 @@
import pytest
import os
-from reco_utils.recommender.deeprec.deeprec_utils import prepare_hparams, download_deeprec_resources
+from reco_utils.recommender.deeprec.deeprec_utils import (
+ prepare_hparams,
+ download_deeprec_resources,
+)
from reco_utils.recommender.deeprec.models.xDeepFM import XDeepFMModel
from reco_utils.recommender.deeprec.models.dkn import DKN
from reco_utils.recommender.deeprec.IO.iterator import FFMTextIterator
from reco_utils.recommender.deeprec.IO.dkn_iterator import DKNTextIterator
+from reco_utils.dataset.amazon_reviews import download_and_extract, data_preprocessing
+from reco_utils.recommender.deeprec.models.sequential.sli_rec import SLI_RECModel
+from reco_utils.recommender.deeprec.IO.sequential_iterator import SequentialIterator
@pytest.fixture
@@ -64,3 +70,71 @@ def test_dkn_component_definition(resource_path):
assert model.logit is not None
assert model.update is not None
assert model.iterator is not None
+
+
+@pytest.mark.gpu
+@pytest.mark.deeprec
+@pytest.mark.sequential
+def test_slirec_component_definition(resource_path):
+ data_path = os.path.join(resource_path, "..", "resources", "deeprec", "slirec")
+ yaml_file = os.path.join(
+ resource_path,
+ "..",
+ "..",
+ "reco_utils",
+ "recommender",
+ "deeprec",
+ "config",
+ "sli_rec.yaml",
+ )
+ train_file = os.path.join(data_path, r"train_data")
+
+ if not os.path.exists(train_file):
+ train_file = os.path.join(data_path, r"train_data")
+ valid_file = os.path.join(data_path, r"valid_data")
+ test_file = os.path.join(data_path, r"test_data")
+ user_vocab = os.path.join(data_path, r"user_vocab.pkl")
+ item_vocab = os.path.join(data_path, r"item_vocab.pkl")
+ cate_vocab = os.path.join(data_path, r"category_vocab.pkl")
+
+ reviews_name = "reviews_Movies_and_TV_5.json"
+ meta_name = "meta_Movies_and_TV.json"
+ reviews_file = os.path.join(data_path, reviews_name)
+ meta_file = os.path.join(data_path, meta_name)
+ valid_num_ngs = (
+ 4 # number of negative instances with a positive instance for validation
+ )
+ test_num_ngs = (
+ 9 # number of negative instances with a positive instance for testing
+ )
+ sample_rate = (
+ 0.01 # sample a small item set for training and testing here for example
+ )
+
+ input_files = [
+ reviews_file,
+ meta_file,
+ train_file,
+ valid_file,
+ test_file,
+ user_vocab,
+ item_vocab,
+ cate_vocab,
+ ]
+ download_and_extract(reviews_name, reviews_file)
+ download_and_extract(meta_name, meta_file)
+ data_preprocessing(
+ *input_files,
+ sample_rate=sample_rate,
+ valid_num_ngs=valid_num_ngs,
+ test_num_ngs=test_num_ngs
+ )
+
+ hparams = prepare_hparams(
+ yaml_file, train_num_ngs=4
+ ) # confirm the train_num_ngs when initializing a SLi_Rec model.
+ model = SLI_RECModel(hparams, SequentialIterator)
+
+ assert model.logit is not None
+ assert model.update is not None
+ assert model.iterator is not None
diff --git a/tests/unit/test_deeprec_utils.py b/tests/unit/test_deeprec_utils.py
index d7e2be101c..2db10b6358 100644
--- a/tests/unit/test_deeprec_utils.py
+++ b/tests/unit/test_deeprec_utils.py
@@ -11,6 +11,9 @@
)
from reco_utils.recommender.deeprec.IO.iterator import FFMTextIterator
from reco_utils.recommender.deeprec.IO.dkn_iterator import DKNTextIterator
+from reco_utils.recommender.deeprec.IO.sequential_iterator import SequentialIterator
+from reco_utils.recommender.deeprec.models.sequential.sli_rec import SLI_RECModel
+from reco_utils.dataset.amazon_reviews import download_and_extract, data_preprocessing
@pytest.fixture
@@ -92,3 +95,67 @@ def test_DKN_iterator(resource_path):
assert iterator is not None
for res in iterator.load_data_from_file(data_file):
assert isinstance(res, dict)
+
+
+@pytest.mark.gpu
+@pytest.mark.deeprec
+@pytest.mark.sequential
+def test_Sequential_Iterator(resource_path):
+ data_path = os.path.join(resource_path, "..", "resources", "deeprec", "slirec")
+ yaml_file = os.path.join(
+ resource_path,
+ "..",
+ "..",
+ "reco_utils",
+ "recommender",
+ "deeprec",
+ "config",
+ "sli_rec.yaml",
+ )
+ train_file = os.path.join(data_path, r"train_data")
+
+ if not os.path.exists(train_file):
+ valid_file = os.path.join(data_path, r"valid_data")
+ test_file = os.path.join(data_path, r"test_data")
+ user_vocab = os.path.join(data_path, r"user_vocab.pkl")
+ item_vocab = os.path.join(data_path, r"item_vocab.pkl")
+ cate_vocab = os.path.join(data_path, r"category_vocab.pkl")
+
+ reviews_name = "reviews_Movies_and_TV_5.json"
+ meta_name = "meta_Movies_and_TV.json"
+ reviews_file = os.path.join(data_path, reviews_name)
+ meta_file = os.path.join(data_path, meta_name)
+ valid_num_ngs = (
+ 4 # number of negative instances with a positive instance for validation
+ )
+ test_num_ngs = (
+ 9 # number of negative instances with a positive instance for testing
+ )
+ sample_rate = (
+ 0.01 # sample a small item set for training and testing here for example
+ )
+
+ input_files = [
+ reviews_file,
+ meta_file,
+ train_file,
+ valid_file,
+ test_file,
+ user_vocab,
+ item_vocab,
+ cate_vocab,
+ ]
+ download_and_extract(reviews_name, reviews_file)
+ download_and_extract(meta_name, meta_file)
+ data_preprocessing(
+ *input_files,
+ sample_rate=sample_rate,
+ valid_num_ngs=valid_num_ngs,
+ test_num_ngs=test_num_ngs
+ )
+
+ hparams = prepare_hparams(yaml_file)
+ iterator = SequentialIterator(hparams, tf.Graph())
+ assert iterator is not None
+ for res in iterator.load_data_from_file(train_file):
+ assert isinstance(res, dict)
diff --git a/tests/unit/test_notebooks_pyspark.py b/tests/unit/test_notebooks_pyspark.py
index 5c048e6826..6309386fb0 100644
--- a/tests/unit/test_notebooks_pyspark.py
+++ b/tests/unit/test_notebooks_pyspark.py
@@ -9,6 +9,9 @@
@pytest.mark.notebooks
@pytest.mark.spark
+@pytest.mark.skipif(
+ sys.platform == "win32", reason="Takes 1087.56s in Windows, while in Linux 52.51s"
+)
def test_als_pyspark_runs(notebooks):
notebook_path = notebooks["als_pyspark"]
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
@@ -23,6 +26,9 @@ def test_data_split_runs(notebooks):
@pytest.mark.notebooks
@pytest.mark.spark
+@pytest.mark.skipif(
+ sys.platform == "win32", reason="Takes 2764.50s in Windows, while in Linux 124.35s"
+)
def test_als_deep_dive_runs(notebooks):
notebook_path = notebooks["als_deep_dive"]
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
@@ -30,6 +36,9 @@ def test_als_deep_dive_runs(notebooks):
@pytest.mark.notebooks
@pytest.mark.spark
+@pytest.mark.skipif(
+ sys.platform == "win32", reason="Takes 583.75s in Windows, while in Linux 71.77s"
+)
def test_evaluation_runs(notebooks):
notebook_path = notebooks["evaluation"]
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
@@ -37,6 +46,9 @@ def test_evaluation_runs(notebooks):
@pytest.mark.notebooks
@pytest.mark.spark
+@pytest.mark.skipif(
+ sys.platform == "win32", reason="Takes 2409.69s in Windows, while in Linux 138.30s"
+)
def test_spark_tuning(notebooks):
notebook_path = notebooks["spark_tuning"]
pm.execute_notebook(
@@ -48,23 +60,19 @@ def test_spark_tuning(notebooks):
NUMBER_ITERATIONS=3,
SUBSET_RATIO=0.5,
RANK=[5, 5],
- REG=[0.1, 0.01]
- )
+ REG=[0.1, 0.01],
+ ),
)
@pytest.mark.notebooks
@pytest.mark.spark
-@pytest.mark.skipif(sys.platform == 'win32', reason="Not implemented on Windows")
+@pytest.mark.skipif(sys.platform == "win32", reason="Not implemented on Windows")
def test_mmlspark_lightgbm_criteo_runs(notebooks):
notebook_path = notebooks["mmlspark_lightgbm_criteo"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
- parameters=dict(
- DATA_SIZE="sample",
- NUM_ITERATIONS=10,
- EARLY_STOPPING_ROUND=2,
- )
+ parameters=dict(DATA_SIZE="sample", NUM_ITERATIONS=10, EARLY_STOPPING_ROUND=2),
)