Merge pull request #1289 from microsoft/bug/papermill_read_notebook
Bug with papermill read notebook
miguelgfierro authored Feb 1, 2021
2 parents b28a6ea + a3fe124 commit cec70b3
Showing 14 changed files with 233 additions and 174 deletions.
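
In short: papermill 2.x removed the notebook-reading half of its 1.x API (`pm.record`, `pm.read_notebook`), so this PR moves notebook output recording and retrieval to scrapbook, papermill's successor for that functionality. A minimal sketch of the pattern applied throughout the diff below (the notebook paths and the example payload are illustrative, not part of the PR):

```python
import papermill as pm
import scrapbook as sb

# --- inside the notebook under test -------------------------------------
# sb.glue() persists a named "scrap" in the notebook's output metadata;
# it replaces the removed pm.record().
utils_state = {"vert_num": 17, "subvert_num": 17}  # illustrative payload
sb.glue("utils_state", utils_state)

# --- in the test harness -------------------------------------------------
# Execute the notebook, then read the glued scraps back by name; this
# replaces pm.read_notebook(...).dataframe.set_index("name")["value"].
pm.execute_notebook("mind_utils.ipynb", "output.ipynb", kernel_name="python3")
results = sb.read_notebook("output.ipynb").scraps.dataframe.set_index("name")["data"]
assert results["utils_state"]["vert_num"] == 17
```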
21 changes: 12 additions & 9 deletions SETUP.md
@@ -120,18 +120,21 @@ You also need to find where Spark is installed and set `SPARK_HOME` variable, on

Then, create the file `$RECO_ENV/etc/conda/activate.d/env_vars.sh` and add:

#!/bin/sh
RECO_ENV=$(conda env list | grep reco_pyspark | awk '{print $NF}')
export PYSPARK_PYTHON=$RECO_ENV/bin/python
export PYSPARK_DRIVER_PYTHON=$RECO_ENV/bin/python
export SPARK_HOME=/dsvm/tools/spark/current
```bash
#!/bin/sh
RECO_ENV=$(conda env list | grep reco_pyspark | awk '{print $NF}')
export PYSPARK_PYTHON=$RECO_ENV/bin/python
export PYSPARK_DRIVER_PYTHON=$RECO_ENV/bin/python
export SPARK_HOME=/dsvm/tools/spark/current
```

This will export the variables every time we do `conda activate reco_pyspark`. To unset these variables when we deactivate the environment, create the file `$RECO_ENV/etc/conda/deactivate.d/env_vars.sh` and add:

#!/bin/sh
unset PYSPARK_PYTHON
unset PYSPARK_DRIVER_PYTHON

```bash
#!/bin/sh
unset PYSPARK_PYTHON
unset PYSPARK_DRIVER_PYTHON
```

</details>
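As a quick sanity check of the hooks above (a hypothetical shell session, assuming the `reco_pyspark` environment and both hook scripts are in place), the variables should appear on activation and disappear on deactivation:

```bash
conda activate reco_pyspark
echo $SPARK_HOME        # /dsvm/tools/spark/current
conda deactivate
echo $PYSPARK_PYTHON    # empty: unset by the deactivate hook
```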

12 changes: 4 additions & 8 deletions examples/00_quick_start/sar_movielens.ipynb
@@ -455,13 +455,9 @@
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python 3.6.12 64-bit ('sb_full': conda)",
"metadata": {
"interpreter": {
"hash": "f28711ae1fad89778b64817fc2d746effb845deda73edae96b2473c20b2d4f70"
}
},
"name": "python3"
"display_name": "Python (reco_base)",
"language": "python",
"name": "reco_base"
},
"language_info": {
"codemirror_mode": {
@@ -473,7 +469,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.12-final"
"version": "3.6.11"
}
},
"nbformat": 4,
40 changes: 7 additions & 33 deletions examples/01_prepare_data/mind_utils.ipynb
@@ -46,14 +46,12 @@
"import sys\n",
"sys.path.append(\"../../\")\n",
"import os\n",
"import papermill as pm\n",
"import pandas as pd\n",
"from collections import Counter\n",
"from tqdm import tqdm\n",
"import pickle\n",
"import numpy as np\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"import scrapbook as sb\n",
"\n",
"from tempfile import TemporaryDirectory\n",
"from reco_utils.dataset.mind import (download_mind,\n",
@@ -62,7 +60,9 @@
" load_glove_matrix,\n",
" word_tokenize\n",
" )\n",
"from reco_utils.dataset.download_utils import unzip_file"
"from reco_utils.dataset.download_utils import unzip_file\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n"
]
},
{
@@ -418,37 +418,11 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/apple/miniconda/envs/reco_base/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: Function record is deprecated and will be removed in verison 1.0.0 (current version 0.19.1). Please see `scrapbook.glue` (nteract-scrapbook) as a replacement for this functionality.\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"data": {
"application/papermill.record+json": {
"utils_state": {
"embedding_exist_num": 22408,
"embedding_exist_num_all": 37634,
"subvert_num": 17,
"uid2index": 5000,
"vert_num": 17,
"word_num": 23404,
"word_num_all": 41074
}
}
},
"metadata": {},
"output_type": "display_data"
}
],
"outputs": [],
"source": [
"pm.record(\"utils_state\", utils_state)"
"sb.glue(\"utils_state\", utils_state)"
]
}
],
8 changes: 4 additions & 4 deletions examples/template.ipynb
@@ -259,9 +259,9 @@
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python (recommender)",
"display_name": "Python (reco_base)",
"language": "python",
"name": "recommender"
"name": "reco_base"
},
"language_info": {
"codemirror_mode": {
@@ -273,9 +273,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
"version": "3.6.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
28 changes: 27 additions & 1 deletion tests/integration/test_mind.py
@@ -1,9 +1,13 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import pytest
import os
import pytest
import papermill as pm
import scrapbook as sb

from reco_utils.dataset.mind import download_mind, extract_mind
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME


@pytest.mark.integration
@@ -37,3 +41,25 @@ def test_extract_mind(tmp):
assert statinfo.st_size == 59055351
statinfo = os.stat(os.path.join(valid_path, "relation_embedding.vec"))
assert statinfo.st_size == 1044588


@pytest.mark.integration
def test_mind_utils_integration(notebooks, tmp):
notebook_path = notebooks["mind_utils"]
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(mind_type="small", word_embedding_dim=300),
)
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

assert results["utils_state"]["vert_num"] == 17
assert results["utils_state"]["subvert_num"] == 17
assert results["utils_state"]["word_num"] == 23404
assert results["utils_state"]["word_num_all"] == 41074
assert results["utils_state"]["embedding_exist_num"] == 22408
assert results["utils_state"]["embedding_exist_num_all"] == 37634
assert results["utils_state"]["uid2index"] == 5000
76 changes: 57 additions & 19 deletions tests/integration/test_notebooks_gpu.py
@@ -1,12 +1,13 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import papermill as pm
import os
import pytest
import papermill as pm
import scrapbook as sb

from reco_utils.common.gpu_utils import get_number_gpus
from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
import os


TOL = 0.5
@@ -48,7 +49,9 @@ def test_ncf_integration(notebooks, size, epochs, expected_values, seed):
TOP_K=10, MOVIELENS_DATA_SIZE=size, EPOCHS=epochs, BATCH_SIZE=512, SEED=seed
),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
@@ -93,7 +96,9 @@ def test_ncf_deep_dive_integration(
SEED=seed,
),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
@@ -123,14 +128,15 @@ def test_ncf_deep_dive_integration(
)
def test_fastai_integration(notebooks, size, epochs, expected_values):
notebook_path = notebooks["fastai"]
pm.execute_notebook(notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME)
pm.execute_notebook(
notebook_path,
OUTPUT_NOTEBOOK,
kernel_name=KERNEL_NAME,
parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=size, EPOCHS=epochs),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
@@ -168,7 +174,9 @@ def test_xdeepfm_integration(
RANDOM_SEED=seed,
),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["auc"] == pytest.approx(value["auc"], rel=TOL, abs=ABS_TOL)
@@ -215,7 +223,10 @@ def test_wide_deep_integration(notebooks, size, steps, expected_values, seed, tm
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)

@@ -250,7 +261,10 @@ def test_slirec_quickstart_integration(
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["auc"] == pytest.approx(value["auc"], rel=TOL, abs=ABS_TOL)
assert results[key]["logloss"] == pytest.approx(
@@ -278,14 +292,19 @@ def test_slirec_quickstart_integration(
)
],
)
def test_nrms_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values):
def test_nrms_quickstart_integration(
notebooks, epochs, seed, MIND_type, expected_values
):
notebook_path = notebooks["nrms_quickstart"]

params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type}
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["group_auc"] == pytest.approx(
value["group_auc"], rel=TOL, abs=ABS_TOL
@@ -321,14 +340,19 @@ def test_nrms_quickstart_integration(notebooks, epochs, seed, MIND_type, expecte
)
],
)
def test_naml_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values):
def test_naml_quickstart_integration(
notebooks, epochs, seed, MIND_type, expected_values
):
notebook_path = notebooks["naml_quickstart"]

params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type}
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["group_auc"] == pytest.approx(
value["group_auc"], rel=TOL, abs=ABS_TOL
@@ -364,14 +388,19 @@ def test_naml_quickstart_integration(notebooks, epochs, seed, MIND_type, expecte
)
],
)
def test_lstur_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values):
def test_lstur_quickstart_integration(
notebooks, epochs, seed, MIND_type, expected_values
):
notebook_path = notebooks["lstur_quickstart"]

params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type}
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["group_auc"] == pytest.approx(
value["group_auc"], rel=TOL, abs=ABS_TOL
@@ -407,14 +436,19 @@ def test_lstur_quickstart_integration(notebooks, epochs, seed, MIND_type, expect
)
],
)
def test_npa_quickstart_integration(notebooks, epochs, seed, MIND_type, expected_values):
def test_npa_quickstart_integration(
notebooks, epochs, seed, MIND_type, expected_values
):
notebook_path = notebooks["npa_quickstart"]

params = {"epochs": epochs, "seed": seed, "MIND_type": MIND_type}
pm.execute_notebook(
notebook_path, OUTPUT_NOTEBOOK, kernel_name=KERNEL_NAME, parameters=params
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key]["group_auc"] == pytest.approx(
value["group_auc"], rel=TOL, abs=ABS_TOL
@@ -470,7 +504,9 @@ def test_lightgcn_deep_dive_integration(
item_file=os.path.join(data_path, r"item_embeddings"),
),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

for key, value in expected_values.items():
assert results[key] == pytest.approx(value, rel=TOL, abs=ABS_TOL)
@@ -486,7 +522,9 @@ def test_dkn_quickstart_integration(notebooks):
kernel_name=KERNEL_NAME,
parameters=dict(epochs=5, batch_size=500),
)
results = pm.read_notebook(OUTPUT_NOTEBOOK).dataframe.set_index("name")["value"]
results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.dataframe.set_index("name")[
"data"
]

assert results["res"]["auc"] == pytest.approx(0.5651, rel=TOL, abs=ABS_TOL)
assert results["res"]["mean_mrr"] == pytest.approx(0.1639, rel=TOL, abs=ABS_TOL)