From c61dc6c5d07b2870c91cd37535833556b38e1180 Mon Sep 17 00:00:00 2001
From: Jun Ki Min <42475935+loomlike@users.noreply.github.com>
Date: Wed, 3 Apr 2019 00:16:18 -0400
Subject: [PATCH 1/3] pytest tmp_dir fixture

Refactor tests to use the fixture
Fix movielens bug
---
 reco_utils/dataset/movielens.py         |  50 ++---
 tests/conftest.py                       |  13 +-
 tests/integration/test_movielens.py     | 235 ++++++++++++-----------
 tests/integration/test_notebooks_gpu.py |   3 +-
 tests/smoke/test_movielens.py           | 237 ++++++++++++------------
 tests/smoke/test_notebooks_gpu.py       |   3 +-
 tests/unit/test_notebooks_gpu.py        |  23 ++-
 7 files changed, 279 insertions(+), 285 deletions(-)

diff --git a/reco_utils/dataset/movielens.py b/reco_utils/dataset/movielens.py
index b8423b3850..4467555199 100644
--- a/reco_utils/dataset/movielens.py
+++ b/reco_utils/dataset/movielens.py
@@ -136,12 +136,12 @@ def item_has_header(self):
 
 # Warning and error messages
 WARNING_MOVIE_LENS_HEADER = """MovieLens rating dataset has four columns
-    (user id, movie id, rating, and timestamp), but more than four column headers are provided.
-    Will only use the first four column headers."""
+    (user id, movie id, rating, and timestamp), but more than four column names are provided.
+    Will only use the first four column names."""
 WARNING_HAVE_SCHEMA_AND_HEADER = """Both schema and header are provided.
     The header argument will be ignored."""
 ERROR_MOVIE_LENS_SIZE = "Invalid data size. Should be one of {100k, 1m, 10m, or 20m}"
-ERROR_NO_HEADER = "No header (schema) information"
+ERROR_NO_HEADER = "No header (schema) information. At least user and movie column names should be provided"
 
 
 def load_pandas_df(
@@ -187,13 +187,14 @@ def load_pandas_df(
     size = size.lower()
     if size not in DATA_FORMAT:
         raise ValueError(ERROR_MOVIE_LENS_SIZE)
-    if header is None or len(header) == 0:
-        raise ValueError(ERROR_NO_HEADER)
-    if len(header) > 4:
+    if header is None or len(header) < 2:
+        raise ValueError(ERROR_NO_HEADER)
+    elif len(header) > 4:
         warnings.warn(WARNING_MOVIE_LENS_HEADER)
         header = header[:4]
 
-    movie_col = DEFAULT_ITEM_COL if len(header) < 2 else header[1]
+
+    movie_col = header[1]
 
     with download_path(local_cache_path) as path:
         filepath = os.path.join(path, "ml-{}.zip".format(size))
@@ -205,10 +206,6 @@ def load_pandas_df(
         )
 
         # Load rating data
-        if len(header) == 1 and item_df is not None:
-            # MovieID should be loaded to merge rating df w/ item_df
-            header = [header[0], movie_col]
-
         df = pd.read_csv(
             datapath,
             sep=DATA_FORMAT[size].separator,
@@ -268,11 +265,11 @@ def load_item_df(
 
 def _load_item_df(size, item_datapath, movie_col, title_col, genres_col, year_col):
     """Loads Movie info"""
-    item_header = []
-    usecols = []
-    if movie_col is not None:
-        item_header.append(movie_col)
-        usecols.append(0)
+    if title_col is None and genres_col is None and year_col is None:
+        return None
+
+    item_header = [movie_col]
+    usecols = [0]
 
     # Year is parsed from title
     if title_col is not None or year_col is not None:
@@ -291,9 +288,6 @@ def _load_item_df(size, item_datapath, movie_col, title_col, genres_col, year_co
         item_header.append(genres_col)
         usecols.append(2)  # genres column
 
-    if len(item_header) == 0:
-        return None
-
     item_df = pd.read_csv(
         item_datapath,
         sep=DATA_FORMAT[size].item_separator,
@@ -390,17 +384,17 @@ def load_spark_df(
         ...
         )
 
     On DataBricks, pass the dbutils argument as follows:
-        >>> spark_df = load_spark_df(spark, ..., dbutils=dbutils)
+        >>> spark_df = load_spark_df(spark, dbutils=dbutils)
     """
     size = size.lower()
     if size not in DATA_FORMAT:
         raise ValueError(ERROR_MOVIE_LENS_SIZE)
 
     schema = _get_schema(header, schema)
-    if schema is None:
+    if schema is None or len(schema) < 2:
         raise ValueError(ERROR_NO_HEADER)
 
-    movie_col = DEFAULT_ITEM_COL if len(schema) < 2 else schema[1].name
+    movie_col = schema[1].name
 
     with download_path(local_cache_path) as path:
         filepath = os.path.join(path, "ml-{}.zip".format(size))
@@ -410,11 +404,8 @@ def load_spark_df(
         # Load movie features such as title, genres, and release year.
         # Since the file size is small, we directly load as pd.DataFrame from the driver node
        # and then convert into spark.DataFrame
-        item_df = spark.createDataFrame(
-            _load_item_df(
-                size, item_datapath, movie_col, title_col, genres_col, year_col
-            )
-        )
+        item_pd_df = _load_item_df(size, item_datapath, movie_col, title_col, genres_col, year_col)
+        item_df = spark.createDataFrame(item_pd_df) if item_pd_df is not None else None
 
         if is_databricks():
             if dbutils is None:
@@ -430,11 +421,6 @@ def load_spark_df(
             dbutils.fs.mv(spark_datapath, dbfs_datapath)
             spark_datapath = dbfs_datapath
 
-        # Load rating data
-        if len(schema) == 1 and item_df is not None:
-            # MovieID should be loaded to merge rating df w/ item_df
-            schema.add(StructField(movie_col, IntegerType()))
-
         # pySpark's read csv currently doesn't support multi-character delimiter, thus we manually handle that
         separator = DATA_FORMAT[size].separator
         if len(separator) > 1:
diff --git a/tests/conftest.py b/tests/conftest.py
index 3ccd16629f..480cef1f84 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,16 +4,16 @@
 # NOTE: This file is used by pytest to inject fixtures automatically. As it is explained in the documentation
 # https://docs.pytest.org/en/latest/fixture.html:
 # "If during implementing your tests you realize that you want to use a fixture function from multiple test files
-# you can move it to a conftest.py file. You don’t need to import the fixture you want to use in a test, it
+# you can move it to a conftest.py file. You don't need to import the fixture you want to use in a test, it
 # automatically gets discovered by pytest."
 
 import calendar
 import datetime
 import os
-import numpy as np
 import pandas as pd
 import pytest
 from sklearn.model_selection import train_test_split
+from tempfile import TemporaryDirectory
 
 from tests.notebooks_common import path_notebooks
 from reco_utils.common.general_utils import get_number_processors, get_physical_memory
@@ -23,6 +23,15 @@
     pass  # so the environment without spark doesn't break
 
 
+@pytest.fixture
+def tmp_dir(tmp_path_factory):
+    td = TemporaryDirectory(dir=tmp_path_factory.getbasetemp())
+    try:
+        yield td.name
+    finally:
+        td.cleanup()
+
+
 @pytest.fixture(scope="session")
 def spark(app_name="Sample", url="local[*]"):
     """Start Spark if not started.
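The tmp_dir fixture added in the conftest.py hunk above is what the rest of this series builds on: pytest injects it into any test that names it as an argument, the test receives the path of a TemporaryDirectory rooted under pytest's session base temp directory, and cleanup runs once the test returns. A minimal sketch of the consumption pattern (this sample test is illustrative only and is not part of the patch):

    import os

    def test_uses_tmp_dir(tmp_dir):
        # pytest resolves the tmp_dir argument to the fixture in conftest.py;
        # td.name yielded above is the directory path as a str
        cache_file = os.path.join(tmp_dir, "cache.dat")
        with open(cache_file, "w") as f:
            f.write("downloaded data")
        assert os.path.exists(cache_file)
        # after the test returns, td.cleanup() removes the directory and its contents

The refactored movielens tests in the following diffs use exactly this pattern, passing tmp_dir straight into the loaders as local_cache_path instead of opening their own TemporaryDirectory blocks.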
diff --git a/tests/integration/test_movielens.py b/tests/integration/test_movielens.py
index 88941dda4a..6c21836a5d 100644
--- a/tests/integration/test_movielens.py
+++ b/tests/integration/test_movielens.py
@@ -3,7 +3,6 @@
 
 import os
 import pytest
-from tempfile import TemporaryDirectory
 from reco_utils.dataset.movielens import (
     load_pandas_df,
     load_spark_df,
@@ -68,53 +67,50 @@ def test_load_pandas_df(
     title_example,
     genres_example,
     year_example,
+    tmp_dir,
 ):
-    """Test MovieLens dataset load into pd.DataFrame
+    """Test MovieLens dataset load as pd.DataFrame
     """
-    # Test if correct data are loaded and local_cache_path works
-    with TemporaryDirectory() as tmp_dir:
-        # Test if can handle different size of header columns
-        header = ["a"]
-        df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
-        assert len(df) == num_samples
-        assert len(df.columns) == max(
-            len(header), 2
-        )  # Should load at least 2 columns, user and item
-
-        # Test title, genres, and released year load
-        header = ["a", "b", "c", "d", "e"]
-        with pytest.warns(Warning):
-            df = load_pandas_df(
-                size=size,
-                local_cache_path=tmp_dir,
-                header=header,
-                title_col="Title",
-                genres_col="Genres",
-                year_col="Year",
-            )
-        assert len(df) == num_samples
-        assert (
-            len(df.columns) == 7
-        )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
-        assert "e" not in df.columns  # only the first 4 header columns are used
-        # Get two records of the same items and check if the item-features are the same.
-        head = df.loc[df["b"] == movie_example][:2]
-        title = head["Title"].values
-        assert title[0] == title[1]
-        assert title[0] == title_example
-        genres = head["Genres"].values
-        assert genres[0] == genres[1]
-        assert genres[0] == genres_example
-        year = head["Year"].values
-        assert year[0] == year[1]
-        assert year[0] == year_example
+    # Test if correct data are loaded
+    header = ["a", "b", "c"]
+    df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
+    assert len(df) == num_samples
+    assert len(df.columns) == len(header)
+    # Test if raw-zip file, rating file, and item file are cached
+    assert len(os.listdir(tmp_dir)) == 3
 
-        # Test if raw-zip file, rating file, and item file are cached
-        assert len(os.listdir(tmp_dir)) == 3
+    # Test title, genres, and released year load
+    header = ["a", "b", "c", "d", "e"]
+    with pytest.warns(Warning):
+        df = load_pandas_df(
+            size=size,
+            header=header,
+            local_cache_path=tmp_dir,
+            title_col="Title",
+            genres_col="Genres",
+            year_col="Year",
+        )
+    assert len(df) == num_samples
+    assert (
+        len(df.columns) == 7
+    )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
+    assert "e" not in df.columns  # only the first 4 header columns are used
+    # Get two records of the same items and check if the item-features are the same.
+    head = df.loc[df["b"] == movie_example][:2]
+    title = head["Title"].values
+    assert title[0] == title[1]
+    assert title[0] == title_example
+    genres = head["Genres"].values
+    assert genres[0] == genres[1]
+    assert genres[0] == genres_example
+    year = head["Year"].values
+    assert year[0] == year[1]
+    assert year[0] == year_example
 
     # Test default arguments
     df = load_pandas_df(size)
     assert len(df) == num_samples
+    # user, item, rating and timestamp
     assert len(df.columns) == 4
@@ -142,32 +138,31 @@ def test_load_pandas_df(
     ],
 )
 def test_load_item_df(
-    size, num_movies, movie_example, title_example, genres_example, year_example
+    size,
+    num_movies,
+    movie_example,
+    title_example,
+    genres_example,
+    year_example,
+    tmp_dir,
 ):
     """Test movielens item data load (not rating data)
     """
-    with TemporaryDirectory() as tmp_dir:
-        df = load_item_df(
-            size, local_cache_path=tmp_dir, movie_col=None, title_col="title"
-        )
-        assert len(df) == num_movies
-        assert len(df.columns) == 1  # Only title column should be loaded
-        assert df["title"][0] == title_example
+    df = load_item_df(size, local_cache_path=tmp_dir, title_col="title")
+    assert len(df) == num_movies
+    # movie_col and title_col should be loaded
+    assert len(df.columns) == 2
+    assert df["title"][0] == title_example
 
-        # Test title and genres
-        df = load_item_df(
-            size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres"
-        )
-        assert len(df) == num_movies
-        assert len(df.columns) == 2  # movile_col and genres_col
-        assert df["item"][0] == movie_example
-        assert df["genres"][0] == genres_example
+    # Test title and genres
+    df = load_item_df(size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres", year_col="year")
+    assert len(df) == num_movies
+    # movie_col, genres_col and year_col
+    assert len(df.columns) == 3
 
-        # Test release year
-        df = load_item_df(size, local_cache_path=tmp_dir, year_col="year")
-        assert len(df) == num_movies
-        assert len(df.columns) == 2  # movile_col (default) and year_col
-        assert df["year"][0] == year_example
+    assert df["item"][0] == movie_example
+    assert df["genres"][0] == genres_example
+    assert df["year"][0] == year_example
 
 
 @pytest.mark.integration
@@ -212,78 +207,82 @@ def test_load_spark_df(
     title_example,
     genres_example,
     year_example,
+    tmp_dir,
 ):
     """Test MovieLens dataset load into pySpark.DataFrame
     """
     spark = start_or_get_spark("MovieLensLoaderTesting")
 
-    # Test if correct data are loaded and local_cache_path works
-    with TemporaryDirectory() as tmp_dir:
-        # Test if can handle different size of header columns
-        header = ["1", "2"]
-        schema = StructType([StructField("u", IntegerType())])
-        with pytest.warns(Warning):
-            # Test if schema is used when both schema and header are provided
-            df = load_spark_df(
-                spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
-            )
-        assert df.count() == num_samples
-        assert len(df.columns) == len(schema)
-
-        # Test title, genres, and released year load
-        header = ["a", "b", "c", "d", "e"]
-        with pytest.warns(Warning):
-            df = load_spark_df(
-                spark,
-                size=size,
-                local_cache_path=tmp_dir,
-                header=header,
-                title_col="Title",
-                genres_col="Genres",
-                year_col="Year",
-            )
-        assert df.count() == num_samples
-        assert (
-            len(df.columns) == 7
-        )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
-        assert "e" not in df.columns  # only the first 4 header columns are used
-        # Get two records of the same items and check if the item-features are the same.
-        head = df.filter(col("b") == movie_example).limit(2)
-        title = head.select("Title").collect()
-        assert title[0][0] == title[1][0]
-        assert title[0][0] == title_example
-        genres = head.select("Genres").collect()
-        assert genres[0][0] == genres[1][0]
-        assert genres[0][0] == genres_example
-        year = head.select("Year").collect()
-        assert year[0][0] == year[1][0]
-        assert year[0][0] == year_example
-
+    # Test if correct data are loaded
+    header = ["1", "2", "3"]
+    schema = StructType(
+        [
+            StructField("u", IntegerType()),
+            StructField("m", IntegerType()),
+        ]
+    )
+    with pytest.warns(Warning):
+        df = load_spark_df(
+            spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
+        )
+    assert df.count() == num_samples
+    # Test if schema is used when both schema and header are provided
+    assert len(df.columns) == len(schema)
     # Test if raw-zip file, rating file, and item file are cached
     assert len(os.listdir(tmp_dir)) == 3
 
+    # Test title, genres, and released year load
+    header = ["a", "b", "c", "d", "e"]
+    with pytest.warns(Warning):
+        df = load_spark_df(
+            spark,
+            size=size,
+            local_cache_path=tmp_dir,
+            header=header,
+            title_col="Title",
+            genres_col="Genres",
+            year_col="Year",
+        )
+    assert df.count() == num_samples
+    assert (
+        len(df.columns) == 7
+    )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
+    assert "e" not in df.columns  # only the first 4 header columns are used
+    # Get two records of the same items and check if the item-features are the same.
+    head = df.filter(col("b") == movie_example).limit(2)
+    title = head.select("Title").collect()
+    assert title[0][0] == title[1][0]
+    assert title[0][0] == title_example
+    genres = head.select("Genres").collect()
+    assert genres[0][0] == genres[1][0]
+    assert genres[0][0] == genres_example
+    year = head.select("Year").collect()
+    assert year[0][0] == year[1][0]
+    assert year[0][0] == year_example
+
     # Test default arguments
     df = load_spark_df(spark, size)
     assert df.count() == num_samples
+    # user, item, rating and timestamp
     assert len(df.columns) == 4
 
 
 @pytest.mark.integration
 @pytest.mark.parametrize("size", ["1m", "10m", "20m"])
-def test_download_and_extract_movielens(size):
+def test_download_and_extract_movielens(size, tmp_dir):
     """Test movielens data download and extract
     """
-    with TemporaryDirectory() as tmp_dir:
-        zip_path = os.path.join(tmp_dir, "ml.zip")
-        download_movielens(size, dest_path=zip_path)
-        assert len(os.listdir(tmp_dir)) == 1
-        assert os.path.exists(zip_path)
+    zip_path = os.path.join(tmp_dir, "ml.zip")
+    download_movielens(size, dest_path=zip_path)
+    assert len(os.listdir(tmp_dir)) == 1
+    assert os.path.exists(zip_path)
 
-        rating_path = os.path.join(tmp_dir, "rating.dat")
-        item_path = os.path.join(tmp_dir, "item.dat")
-        extract_movielens(
-            size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
-        )
-        assert len(os.listdir(tmp_dir)) == 3
-        assert os.path.exists(rating_path)
-        assert os.path.exists(item_path)
+    rating_path = os.path.join(tmp_dir, "rating.dat")
+    item_path = os.path.join(tmp_dir, "item.dat")
+    extract_movielens(
+        size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
+    )
+    # Test if raw-zip file, rating file, and item file are cached
+    assert len(os.listdir(tmp_dir)) == 3
+    assert os.path.exists(rating_path)
+    assert os.path.exists(item_path)
diff --git a/tests/integration/test_notebooks_gpu.py b/tests/integration/test_notebooks_gpu.py
index 005775a7d7..220941cf54 100644
--- a/tests/integration/test_notebooks_gpu.py
+++ b/tests/integration/test_notebooks_gpu.py
@@ -148,10 +148,9 @@ def test_fastai_integration(notebooks, size, epochs, expected_values):
         )
     ],
 )
-def test_wide_deep(notebooks, size, epochs, expected_values, tmpdir):
+def test_wide_deep(notebooks, size, epochs, expected_values, tmp_dir):
     notebook_path = notebooks["wide_deep"]
 
-    tmp_dir = str(tmpdir.mkdir("wide_deep"))
     params = {
         "MOVIELENS_DATA_SIZE": size,
         "EPOCHS": epochs,
diff --git a/tests/smoke/test_movielens.py b/tests/smoke/test_movielens.py
index 0816b5f61c..ae0cc6a67d 100644
--- a/tests/smoke/test_movielens.py
+++ b/tests/smoke/test_movielens.py
@@ -3,7 +3,6 @@
 
 import os
 import pytest
-from tempfile import TemporaryDirectory
 from reco_utils.dataset.movielens import (
     load_pandas_df,
     load_spark_df,
@@ -50,53 +49,50 @@ def test_load_pandas_df(
     title_example,
     genres_example,
     year_example,
+    tmp_dir,
 ):
-    """Test MovieLens dataset load into pd.DataFrame
+    """Test MovieLens dataset load as pd.DataFrame
     """
-    # Test if correct data are loaded and local_cache_path works
-    with TemporaryDirectory() as tmp_dir:
-        # Test if can handle different size of header columns
-        header = ["a"]
-        df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
+    # Test if correct data are loaded
+    header = ["a", "b", "c"]
+    df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
+    assert len(df) == num_samples
+    assert len(df.columns) == len(header)
+    # Test if raw-zip file, rating file, and item file are cached
+    assert len(os.listdir(tmp_dir)) == 3
+
+    # Test title, genres, and released year load
+    header = ["a", "b", "c", "d", "e"]
+    with pytest.warns(Warning):
+        df = load_pandas_df(
+            size=size,
+            header=header,
+            local_cache_path=tmp_dir,
+            title_col="Title",
+            genres_col="Genres",
+            year_col="Year",
+        )
     assert len(df) == num_samples
-    assert len(df.columns) == max(
-        len(header), 2
-    )  # Should load at least 2 columns, user and item
-
-    # Test title, genres, and released year load
-    header = ["a", "b", "c", "d", "e"]
-    with pytest.warns(Warning):
-        df = load_pandas_df(
-            size=size,
-            local_cache_path=tmp_dir,
-            header=header,
-            title_col="Title",
-            genres_col="Genres",
-            year_col="Year",
-        )
-    assert len(df) == num_samples
-    assert (
-        len(df.columns) == 7
-    )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
-    assert "e" not in df.columns  # only the first 4 header columns are used
-    # Get two records of the same items and check if the item-features are the same.
-    head = df.loc[df["b"] == movie_example][:2]
-    title = head["Title"].values
-    assert title[0] == title[1]
-    assert title[0] == title_example
-    genres = head["Genres"].values
-    assert genres[0] == genres[1]
-    assert genres[0] == genres_example
-    year = head["Year"].values
-    assert year[0] == year[1]
-    assert year[0] == year_example
-
-    # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert (
+        len(df.columns) == 7
+    )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
+    assert "e" not in df.columns  # only the first 4 header columns are used
+    # Get two records of the same items and check if the item-features are the same.
+    head = df.loc[df["b"] == movie_example][:2]
+    title = head["Title"].values
+    assert title[0] == title[1]
+    assert title[0] == title_example
+    genres = head["Genres"].values
+    assert genres[0] == genres[1]
+    assert genres[0] == genres_example
+    year = head["Year"].values
+    assert year[0] == year[1]
+    assert year[0] == year_example
 
     # Test default arguments
     df = load_pandas_df(size)
     assert len(df) == num_samples
+    # user, item, rating and timestamp
     assert len(df.columns) == 4
@@ -106,32 +102,31 @@ def test_load_pandas_df(
     [("100k", 1682, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995")],
 )
 def test_load_item_df(
-    size, num_movies, movie_example, title_example, genres_example, year_example
+    size,
+    num_movies,
+    movie_example,
+    title_example,
+    genres_example,
+    year_example,
+    tmp_dir,
 ):
     """Test movielens item data load (not rating data)
     """
-    with TemporaryDirectory() as tmp_dir:
-        df = load_item_df(
-            size, local_cache_path=tmp_dir, movie_col=None, title_col="title"
-        )
-        assert len(df) == num_movies
-        assert len(df.columns) == 1  # Only title column should be loaded
-        assert df["title"][0] == title_example
+    df = load_item_df(size, local_cache_path=tmp_dir, title_col="title")
+    assert len(df) == num_movies
+    # movie_col and title_col should be loaded
+    assert len(df.columns) == 2
+    assert df["title"][0] == title_example
 
-        # Test title and genres
-        df = load_item_df(
-            size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres"
-        )
-        assert len(df) == num_movies
-        assert len(df.columns) == 2  # movile_col and genres_col
-        assert df["item"][0] == movie_example
-        assert df["genres"][0] == genres_example
+    # Test title and genres
+    df = load_item_df(size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres", year_col="year")
+    assert len(df) == num_movies
+    # movie_col, genres_col and year_col
+    assert len(df.columns) == 3
 
-        # Test release year
-        df = load_item_df(size, local_cache_path=tmp_dir, year_col="year")
-        assert len(df) == num_movies
-        assert len(df.columns) == 2  # movile_col (default) and year_col
-        assert df["year"][0] == year_example
+    assert df["item"][0] == movie_example
+    assert df["genres"][0] == genres_example
+    assert df["year"][0] == year_example
 
 
 @pytest.mark.smoke
@@ -158,78 +153,82 @@ def test_load_spark_df(
     title_example,
     genres_example,
     year_example,
+    tmp_dir,
 ):
     """Test MovieLens dataset load into pySpark.DataFrame
     """
     spark = start_or_get_spark("MovieLensLoaderTesting")
 
-    # Test if correct data are loaded and local_cache_path works
-    with TemporaryDirectory() as tmp_dir:
-        # Test if can handle different size of header columns
-        header = ["1", "2"]
-        schema = StructType([StructField("u", IntegerType())])
-        with pytest.warns(Warning):
-            # Test if schema is used when both schema and header are provided
-            df = load_spark_df(
-                spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
-            )
-        assert df.count() == num_samples
-        assert len(df.columns) == len(schema)
-
-        # Test title, genres, and released year load
-        header = ["a", "b", "c", "d", "e"]
-        with pytest.warns(Warning):
-            df = load_spark_df(
-                spark,
-                size=size,
-                local_cache_path=tmp_dir,
-                header=header,
-                title_col="Title",
-                genres_col="Genres",
-                year_col="Year",
-            )
-        assert df.count() == num_samples
-        assert (
-            len(df.columns) == 7
-        )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
-        assert "e" not in df.columns  # only the first 4 header columns are used
-        # Get two records of the same items and check if the item-features are the same.
-        head = df.filter(col("b") == movie_example).limit(2)
-        title = head.select("Title").collect()
-        assert title[0][0] == title[1][0]
-        assert title[0][0] == title_example
-        genres = head.select("Genres").collect()
-        assert genres[0][0] == genres[1][0]
-        assert genres[0][0] == genres_example
-        year = head.select("Year").collect()
-        assert year[0][0] == year[1][0]
-        assert year[0][0] == year_example
-
+    # Test if correct data are loaded
+    header = ["1", "2", "3"]
+    schema = StructType(
+        [
+            StructField("u", IntegerType()),
+            StructField("m", IntegerType()),
+        ]
+    )
+    with pytest.warns(Warning):
+        df = load_spark_df(
+            spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
+        )
+    assert df.count() == num_samples
+    # Test if schema is used when both schema and header are provided
+    assert len(df.columns) == len(schema)
     # Test if raw-zip file, rating file, and item file are cached
     assert len(os.listdir(tmp_dir)) == 3
 
+    # Test title, genres, and released year load
+    header = ["a", "b", "c", "d", "e"]
+    with pytest.warns(Warning):
+        df = load_spark_df(
+            spark,
+            size=size,
+            local_cache_path=tmp_dir,
+            header=header,
+            title_col="Title",
+            genres_col="Genres",
+            year_col="Year",
+        )
+    assert df.count() == num_samples
+    assert (
+        len(df.columns) == 7
+    )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
+    assert "e" not in df.columns  # only the first 4 header columns are used
+    # Get two records of the same items and check if the item-features are the same.
+    head = df.filter(col("b") == movie_example).limit(2)
+    title = head.select("Title").collect()
+    assert title[0][0] == title[1][0]
+    assert title[0][0] == title_example
+    genres = head.select("Genres").collect()
+    assert genres[0][0] == genres[1][0]
+    assert genres[0][0] == genres_example
+    year = head.select("Year").collect()
+    assert year[0][0] == year[1][0]
+    assert year[0][0] == year_example
+
     # Test default arguments
     df = load_spark_df(spark, size)
     assert df.count() == num_samples
+    # user, item, rating and timestamp
     assert len(df.columns) == 4
 
 
 @pytest.mark.smoke
 @pytest.mark.parametrize("size", ["100k"])
-def test_download_and_extract_movielens(size):
+def test_download_and_extract_movielens(size, tmp_dir):
     """Test movielens data download and extract
     """
-    with TemporaryDirectory() as tmp_dir:
-        zip_path = os.path.join(tmp_dir, "ml.zip")
-        download_movielens(size, dest_path=zip_path)
-        assert len(os.listdir(tmp_dir)) == 1
-        assert os.path.exists(zip_path)
-
-        rating_path = os.path.join(tmp_dir, "rating.dat")
-        item_path = os.path.join(tmp_dir, "item.dat")
-        extract_movielens(
-            size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
-        )
-        assert len(os.listdir(tmp_dir)) == 3
-        assert os.path.exists(rating_path)
-        assert os.path.exists(item_path)
+    zip_path = os.path.join(tmp_dir, "ml.zip")
+    download_movielens(size, dest_path=zip_path)
+    assert len(os.listdir(tmp_dir)) == 1
+    assert os.path.exists(zip_path)
+
+    rating_path = os.path.join(tmp_dir, "rating.dat")
+    item_path = os.path.join(tmp_dir, "item.dat")
+    extract_movielens(
+        size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
+    )
+    # Test if raw-zip file, rating file, and item file are cached
+    assert len(os.listdir(tmp_dir)) == 3
+    assert os.path.exists(rating_path)
+    assert os.path.exists(item_path)
diff --git a/tests/smoke/test_notebooks_gpu.py b/tests/smoke/test_notebooks_gpu.py
index 55faf584e2..2ab1a0f6dc 100644
--- a/tests/smoke/test_notebooks_gpu.py
+++ b/tests/smoke/test_notebooks_gpu.py
@@ -127,10 +127,9 @@ def test_notebook_dkn(notebooks):
 
 @pytest.mark.smoke
 @pytest.mark.gpu
-def test_wide_deep(notebooks, tmpdir):
+def test_wide_deep(notebooks, tmp_dir):
     notebook_path = notebooks["wide_deep"]
 
-    tmp_dir = str(tmpdir.mkdir("wide_deep_0"))
     params = {
         "MOVIELENS_DATA_SIZE": "100k",
         "EPOCHS": 1,
diff --git a/tests/unit/test_notebooks_gpu.py b/tests/unit/test_notebooks_gpu.py
index d7d17bca06..c71fbe8af9 100644
--- a/tests/unit/test_notebooks_gpu.py
+++ b/tests/unit/test_notebooks_gpu.py
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 
+import os
 import pytest
 from reco_utils.common.gpu_utils import get_number_gpus
 from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
@@ -55,18 +56,19 @@ def test_ncf_deep_dive(notebooks):
 
 @pytest.mark.notebooks
 @pytest.mark.gpu
-def test_wide_deep(notebooks, tmpdir):
+def test_wide_deep(notebooks, tmp_dir):
     notebook_path = notebooks["wide_deep"]
-    tmp_dir = str(tmpdir.mkdir("wide_deep_0"))
+    model_dir = os.path.join(tmp_dir, "wide_deep_0")
+    os.mkdir(model_dir)
     params = {
         'MOVIELENS_DATA_SIZE': '100k',
         'EPOCHS': 0,
         'EVALUATE_WHILE_TRAINING': False,
-        'MODEL_DIR': tmp_dir,
-        'EXPORT_DIR_BASE': tmp_dir,
-        'RATING_METRICS': ['rmse', 'mae'],
-        'RANKING_METRICS': ['ndcg_at_k', 'precision_at_k'],
+        'MODEL_DIR': model_dir,
+        'EXPORT_DIR_BASE': model_dir,
+        'RATING_METRICS': ['rmse'],
+        'RANKING_METRICS': ['ndcg_at_k'],
     }
     pm.execute_notebook(
         notebook_path,
@@ -75,15 +77,16 @@ def test_wide_deep(notebooks, tmp_dir):
         parameters=params,
     )
 
-    # Test w/ different settings
-    tmp_dir = str(tmpdir.mkdir("wide_deep_1"))
+    # Test w/o item features
+    model_dir = os.path.join(tmp_dir, "wide_deep_1")
+    os.mkdir(model_dir)
     params = {
         'MOVIELENS_DATA_SIZE': '100k',
         'EPOCHS': 0,
         'ITEM_FEAT_COL': None,
         'EVALUATE_WHILE_TRAINING': True,
-        'MODEL_DIR': tmp_dir,
-        'EXPORT_DIR_BASE': tmp_dir,
+        'MODEL_DIR': model_dir,
+        'EXPORT_DIR_BASE': model_dir,
         'RATING_METRICS': ['rsquared'],
         'RANKING_METRICS': ['map_at_k'],
     }

From 94c83a31973d99da36f87a56261045d902af3c25 Mon Sep 17 00:00:00 2001
From: Jun Ki Min <42475935+loomlike@users.noreply.github.com>
Date: Wed, 3 Apr 2019 20:45:46 -0400
Subject: [PATCH 2/3] tmp fixture to use context manager

Change to use context manager
Change fixture name from tmp_dir to tmp
---
 tests/conftest.py                       |  9 +++----
 tests/integration/test_movielens.py     | 34 ++++++++++++-------------
 tests/integration/test_notebooks_gpu.py |  6 ++---
 tests/smoke/test_movielens.py           | 34 ++++++++++++-------------
 tests/smoke/test_notebooks_gpu.py       |  6 ++---
 tests/unit/test_notebooks_gpu.py        |  6 ++---
 6 files changed, 46 insertions(+), 49 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 480cef1f84..3901c1a026 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -24,12 +24,9 @@
 
 @pytest.fixture
-def tmp_dir(tmp_path_factory):
-    td = TemporaryDirectory(dir=tmp_path_factory.getbasetemp())
-    try:
-        yield td.name
-    finally:
-        td.cleanup()
+def tmp(tmp_path_factory):
+    with TemporaryDirectory(dir=tmp_path_factory.getbasetemp()) as td:
+        yield td
 
 
 @pytest.fixture(scope="session")
diff --git a/tests/integration/test_movielens.py b/tests/integration/test_movielens.py
index 6c21836a5d..aad20b46c3 100644
--- a/tests/integration/test_movielens.py
+++ b/tests/integration/test_movielens.py
@@ -67,17 +67,17 @@ def test_load_pandas_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test MovieLens dataset load as pd.DataFrame
     """
     # Test if correct data are loaded
     header = ["a", "b", "c"]
-    df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
+    df = load_pandas_df(size=size, local_cache_path=tmp, header=header)
     assert len(df) == num_samples
     assert len(df.columns) == len(header)
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
 
     # Test title, genres, and released year load
     header = ["a", "b", "c", "d", "e"]
@@ -85,7 +85,7 @@ def test_load_pandas_df(
         df = load_pandas_df(
             size=size,
             header=header,
-            local_cache_path=tmp_dir,
+            local_cache_path=tmp,
             title_col="Title",
             genres_col="Genres",
             year_col="Year",
@@ -144,18 +144,18 @@ def test_load_item_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test movielens item data load (not rating data)
     """
-    df = load_item_df(size, local_cache_path=tmp_dir, title_col="title")
+    df = load_item_df(size, local_cache_path=tmp, title_col="title")
     assert len(df) == num_movies
     # movie_col and title_col should be loaded
     assert len(df.columns) == 2
     assert df["title"][0] == title_example
 
     # Test title and genres
-    df = load_item_df(size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres", year_col="year")
+    df = load_item_df(size, local_cache_path=tmp, movie_col="item", genres_col="genres", year_col="year")
     assert len(df) == num_movies
     # movie_col, genres_col and year_col
     assert len(df.columns) == 3
@@ -207,7 +207,7 @@ def test_load_spark_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test MovieLens dataset load into pySpark.DataFrame
     """
@@ -223,13 +223,13 @@ def test_load_spark_df(
     )
     with pytest.warns(Warning):
         df = load_spark_df(
-            spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
+            spark, size=size, local_cache_path=tmp, header=header, schema=schema
         )
     assert df.count() == num_samples
     # Test if schema is used when both schema and header are provided
     assert len(df.columns) == len(schema)
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
 
     # Test title, genres, and released year load
     header = ["a", "b", "c", "d", "e"]
@@ -237,7 +237,7 @@ def test_load_spark_df(
         df = load_spark_df(
             spark,
             size=size,
-            local_cache_path=tmp_dir,
+            local_cache_path=tmp,
             header=header,
             title_col="Title",
             genres_col="Genres",
@@ -269,20 +269,20 @@ def test_load_spark_df(
 
 @pytest.mark.integration
 @pytest.mark.parametrize("size", ["1m", "10m", "20m"])
-def test_download_and_extract_movielens(size, tmp_dir):
+def test_download_and_extract_movielens(size, tmp):
     """Test movielens data download and extract
     """
-    zip_path = os.path.join(tmp_dir, "ml.zip")
+    zip_path = os.path.join(tmp, "ml.zip")
     download_movielens(size, dest_path=zip_path)
-    assert len(os.listdir(tmp_dir)) == 1
+    assert len(os.listdir(tmp)) == 1
     assert os.path.exists(zip_path)
 
-    rating_path = os.path.join(tmp_dir, "rating.dat")
-    item_path = os.path.join(tmp_dir, "item.dat")
+    rating_path = os.path.join(tmp, "rating.dat")
+    item_path = os.path.join(tmp, "item.dat")
     extract_movielens(
         size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
     )
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
     assert os.path.exists(rating_path)
     assert os.path.exists(item_path)
diff --git a/tests/integration/test_notebooks_gpu.py b/tests/integration/test_notebooks_gpu.py
index 220941cf54..aaad2bd34b 100644
--- a/tests/integration/test_notebooks_gpu.py
+++ b/tests/integration/test_notebooks_gpu.py
@@ -148,15 +148,15 @@ def test_fastai_integration(notebooks, size, epochs, expected_values):
         )
     ],
 )
-def test_wide_deep(notebooks, size, epochs, expected_values, tmp_dir):
+def test_wide_deep(notebooks, size, epochs, expected_values, tmp):
     notebook_path = notebooks["wide_deep"]
 
     params = {
         "MOVIELENS_DATA_SIZE": size,
         "EPOCHS": epochs,
         "EVALUATE_WHILE_TRAINING": False,
-        "MODEL_DIR": tmp_dir,
-        "EXPORT_DIR_BASE": tmp_dir,
+        "MODEL_DIR": tmp,
+        "EXPORT_DIR_BASE": tmp,
         "RATING_METRICS": ["rmse", "mae", "rsquared", "exp_var"],
         "RANKING_METRICS": ["ndcg_at_k", "map_at_k", "precision_at_k", "recall_at_k"],
     }
diff --git a/tests/smoke/test_movielens.py b/tests/smoke/test_movielens.py
index ae0cc6a67d..30c9ec1c83 100644
--- a/tests/smoke/test_movielens.py
+++ b/tests/smoke/test_movielens.py
@@ -49,17 +49,17 @@ def test_load_pandas_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test MovieLens dataset load as pd.DataFrame
     """
     # Test if correct data are loaded
     header = ["a", "b", "c"]
-    df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
+    df = load_pandas_df(size=size, local_cache_path=tmp, header=header)
     assert len(df) == num_samples
     assert len(df.columns) == len(header)
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
 
     # Test title, genres, and released year load
     header = ["a", "b", "c", "d", "e"]
@@ -67,7 +67,7 @@ def test_load_pandas_df(
         df = load_pandas_df(
             size=size,
             header=header,
-            local_cache_path=tmp_dir,
+            local_cache_path=tmp,
             title_col="Title",
             genres_col="Genres",
             year_col="Year",
@@ -108,18 +108,18 @@ def test_load_item_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test movielens item data load (not rating data)
     """
-    df = load_item_df(size, local_cache_path=tmp_dir, title_col="title")
+    df = load_item_df(size, local_cache_path=tmp, title_col="title")
     assert len(df) == num_movies
     # movie_col and title_col should be loaded
     assert len(df.columns) == 2
     assert df["title"][0] == title_example
 
     # Test title and genres
-    df = load_item_df(size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres", year_col="year")
+    df = load_item_df(size, local_cache_path=tmp, movie_col="item", genres_col="genres", year_col="year")
     assert len(df) == num_movies
     # movie_col, genres_col and year_col
     assert len(df.columns) == 3
@@ -153,7 +153,7 @@ def test_load_spark_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test MovieLens dataset load into pySpark.DataFrame
     """
@@ -169,13 +169,13 @@ def test_load_spark_df(
     )
     with pytest.warns(Warning):
         df = load_spark_df(
-            spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
+            spark, size=size, local_cache_path=tmp, header=header, schema=schema
         )
     assert df.count() == num_samples
     # Test if schema is used when both schema and header are provided
     assert len(df.columns) == len(schema)
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
 
     # Test title, genres, and released year load
     header = ["a", "b", "c", "d", "e"]
@@ -183,7 +183,7 @@ def test_load_spark_df(
         df = load_spark_df(
             spark,
             size=size,
-            local_cache_path=tmp_dir,
+            local_cache_path=tmp,
             header=header,
             title_col="Title",
             genres_col="Genres",
@@ -215,20 +215,20 @@ def test_load_spark_df(
 
 @pytest.mark.smoke
 @pytest.mark.parametrize("size", ["100k"])
-def test_download_and_extract_movielens(size, tmp_dir):
+def test_download_and_extract_movielens(size, tmp):
     """Test movielens data download and extract
     """
-    zip_path = os.path.join(tmp_dir, "ml.zip")
+    zip_path = os.path.join(tmp, "ml.zip")
     download_movielens(size, dest_path=zip_path)
-    assert len(os.listdir(tmp_dir)) == 1
+    assert len(os.listdir(tmp)) == 1
     assert os.path.exists(zip_path)
 
-    rating_path = os.path.join(tmp_dir, "rating.dat")
-    item_path = os.path.join(tmp_dir, "item.dat")
+    rating_path = os.path.join(tmp, "rating.dat")
+    item_path = os.path.join(tmp, "item.dat")
     extract_movielens(
         size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
     )
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
     assert os.path.exists(rating_path)
     assert os.path.exists(item_path)
diff --git a/tests/smoke/test_notebooks_gpu.py b/tests/smoke/test_notebooks_gpu.py
index 2ab1a0f6dc..95d6d70961 100644
--- a/tests/smoke/test_notebooks_gpu.py
+++ b/tests/smoke/test_notebooks_gpu.py
@@ -127,15 +127,15 @@ def test_notebook_dkn(notebooks):
 
 @pytest.mark.smoke
 @pytest.mark.gpu
-def test_wide_deep(notebooks, tmp_dir):
+def test_wide_deep(notebooks, tmp):
     notebook_path = notebooks["wide_deep"]
 
     params = {
         "MOVIELENS_DATA_SIZE": "100k",
         "EPOCHS": 1,
         "EVALUATE_WHILE_TRAINING": False,
-        "MODEL_DIR": tmp_dir,
-        "EXPORT_DIR_BASE": tmp_dir,
+        "MODEL_DIR": tmp,
+        "EXPORT_DIR_BASE": tmp,
         "RATING_METRICS": ["rmse", "mae"],
         "RANKING_METRICS": ["ndcg_at_k", "precision_at_k"],
     }
diff --git a/tests/unit/test_notebooks_gpu.py b/tests/unit/test_notebooks_gpu.py
index c71fbe8af9..7e6f48c175 100644
--- a/tests/unit/test_notebooks_gpu.py
+++ b/tests/unit/test_notebooks_gpu.py
@@ -56,10 +56,10 @@ def test_ncf_deep_dive(notebooks):
 
 @pytest.mark.notebooks
 @pytest.mark.gpu
-def test_wide_deep(notebooks, tmp_dir):
+def test_wide_deep(notebooks, tmp):
     notebook_path = notebooks["wide_deep"]
-    model_dir = os.path.join(tmp_dir, "wide_deep_0")
+    model_dir = os.path.join(tmp, "wide_deep_0")
     os.mkdir(model_dir)
     params = {
         'MOVIELENS_DATA_SIZE': '100k',
@@ -78,7 +78,7 @@ def test_wide_deep(notebooks, tmp_dir):
     )
 
     # Test w/o item features
-    model_dir = os.path.join(tmp_dir, "wide_deep_1")
+    model_dir = os.path.join(tmp, "wide_deep_1")
     os.mkdir(model_dir)
     params = {
         'MOVIELENS_DATA_SIZE': '100k',

From 570500e3b212b11558543558b6184ea6a88fbf73 Mon Sep 17 00:00:00 2001
From: Jun Ki Min <42475935+loomlike@users.noreply.github.com>
Date: Thu, 4 Apr 2019 11:03:11 -0400
Subject: [PATCH 3/3] Update conftest description

---
 tests/conftest.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 3901c1a026..81778cce8d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,8 +4,9 @@
 # NOTE: This file is used by pytest to inject fixtures automatically. As it is explained in the documentation
 # https://docs.pytest.org/en/latest/fixture.html:
 # "If during implementing your tests you realize that you want to use a fixture function from multiple test files
-# you can move it to a conftest.py file. You don't need to import the fixture you want to use in a test, it
-# automatically gets discovered by pytest."
+# you can move it to a conftest.py file. You don't need to import the module in which you defined your fixtures
+# in order to use them in a test; they are discovered automatically by pytest, so you can simply receive fixture
+# objects by naming them as input arguments in the test."
 
 import calendar
 import datetime
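After all three patches, tests/conftest.py is left with the context-manager form of the fixture, renamed to tmp. A consolidated sketch of that final state (restating the hunks above, not new behavior):

    import pytest
    from tempfile import TemporaryDirectory


    @pytest.fixture
    def tmp(tmp_path_factory):
        # Entering TemporaryDirectory as a context manager returns the directory
        # path as a str; exiting it, after the test resumes this generator,
        # deletes the directory, replacing the earlier explicit try/finally
        # around td.cleanup().
        with TemporaryDirectory(dir=tmp_path_factory.getbasetemp()) as td:
            yield td

Rooting the directory under tmp_path_factory.getbasetemp() keeps each test's temporary directory inside pytest's session-scoped base temp directory, so test artifacts never land outside the test run's workspace even if a test fails mid-way.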