From c61dc6c5d07b2870c91cd37535833556b38e1180 Mon Sep 17 00:00:00 2001
From: Jun Ki Min <42475935+loomlike@users.noreply.github.com>
Date: Wed, 3 Apr 2019 00:16:18 -0400
Subject: [PATCH 1/3] pytest tmp_dir fixture

Refactor tests to use the fixture
Fix movielens bug
---
 reco_utils/dataset/movielens.py         |  50 ++---
 tests/conftest.py                       |  13 +-
 tests/integration/test_movielens.py     | 235 ++++++++++++-----------
 tests/integration/test_notebooks_gpu.py |   3 +-
 tests/smoke/test_movielens.py           | 237 ++++++++++++------------
 tests/smoke/test_notebooks_gpu.py       |   3 +-
 tests/unit/test_notebooks_gpu.py        |  23 ++-
 7 files changed, 279 insertions(+), 285 deletions(-)

diff --git a/reco_utils/dataset/movielens.py b/reco_utils/dataset/movielens.py
index b8423b3850..4467555199 100644
--- a/reco_utils/dataset/movielens.py
+++ b/reco_utils/dataset/movielens.py
@@ -136,12 +136,12 @@ def item_has_header(self):
 
 # Warning and error messages
 WARNING_MOVIE_LENS_HEADER = """MovieLens rating dataset has four columns
-    (user id, movie id, rating, and timestamp), but more than four column headers are provided.
-    Will only use the first four column headers."""
+    (user id, movie id, rating, and timestamp), but more than four column names are provided.
+    Will only use the first four column names."""
 WARNING_HAVE_SCHEMA_AND_HEADER = """Both schema and header are provided.
     The header argument will be ignored."""
 ERROR_MOVIE_LENS_SIZE = "Invalid data size. Should be one of {100k, 1m, 10m, or 20m}"
-ERROR_NO_HEADER = "No header (schema) information"
+ERROR_NO_HEADER = "No header (schema) information. At least user and movie column names should be provided"
 
 
 def load_pandas_df(
@@ -187,13 +187,14 @@ def load_pandas_df(
     size = size.lower()
     if size not in DATA_FORMAT:
         raise ValueError(ERROR_MOVIE_LENS_SIZE)
-    if header is None or len(header) == 0:
-        raise ValueError(ERROR_NO_HEADER)
-    if len(header) > 4:
+    if header is None or len(header) < 2:
+        raise ValueError(ERROR_NO_HEADER)
+    elif len(header) > 4:
         warnings.warn(WARNING_MOVIE_LENS_HEADER)
         header = header[:4]
 
-    movie_col = DEFAULT_ITEM_COL if len(header) < 2 else header[1]
+
+    movie_col = header[1]
 
     with download_path(local_cache_path) as path:
         filepath = os.path.join(path, "ml-{}.zip".format(size))
@@ -205,10 +206,6 @@ def load_pandas_df(
         )
 
         # Load rating data
-        if len(header) == 1 and item_df is not None:
-            # MovieID should be loaded to merge rating df w/ item_df
-            header = [header[0], movie_col]
-
         df = pd.read_csv(
             datapath,
             sep=DATA_FORMAT[size].separator,
@@ -268,11 +265,11 @@ def load_item_df(
 
 def _load_item_df(size, item_datapath, movie_col, title_col, genres_col, year_col):
     """Loads Movie info"""
-    item_header = []
-    usecols = []
-    if movie_col is not None:
-        item_header.append(movie_col)
-        usecols.append(0)
+    if title_col is None and genres_col is None and year_col is None:
+        return None
+
+    item_header = [movie_col]
+    usecols = [0]
 
     # Year is parsed from title
     if title_col is not None or year_col is not None:
@@ -291,9 +288,6 @@ def _load_item_df(size, item_datapath, movie_col, title_col, genres_col, year_co
         item_header.append(genres_col)
         usecols.append(2)  # genres column
 
-    if len(item_header) == 0:
-        return None
-
     item_df = pd.read_csv(
         item_datapath,
         sep=DATA_FORMAT[size].item_separator,
@@ -390,17 +384,17 @@ def load_spark_df(
         ...
         )
 
     On DataBricks, pass the dbutils argument as follows:
-        >>> spark_df = load_spark_df(spark, ..., dbutils=dbutils)
+        >>> spark_df = load_spark_df(spark, dbutils=dbutils)
     """
     size = size.lower()
     if size not in DATA_FORMAT:
         raise ValueError(ERROR_MOVIE_LENS_SIZE)
 
     schema = _get_schema(header, schema)
-    if schema is None:
+    if schema is None or len(schema) < 2:
         raise ValueError(ERROR_NO_HEADER)
 
-    movie_col = DEFAULT_ITEM_COL if len(schema) < 2 else schema[1].name
+    movie_col = schema[1].name
 
     with download_path(local_cache_path) as path:
         filepath = os.path.join(path, "ml-{}.zip".format(size))
@@ -410,11 +404,8 @@ def load_spark_df(
         # Load movie features such as title, genres, and release year.
         # Since the file size is small, we directly load as pd.DataFrame from the driver node
        # and then convert into spark.DataFrame
-        item_df = spark.createDataFrame(
-            _load_item_df(
-                size, item_datapath, movie_col, title_col, genres_col, year_col
-            )
-        )
+        item_pd_df = _load_item_df(size, item_datapath, movie_col, title_col, genres_col, year_col)
+        item_df = spark.createDataFrame(item_pd_df) if item_pd_df is not None else None
 
         if is_databricks():
             if dbutils is None:
@@ -430,11 +421,6 @@ def load_spark_df(
             dbutils.fs.mv(spark_datapath, dbfs_datapath)
             spark_datapath = dbfs_datapath
 
-        # Load rating data
-        if len(schema) == 1 and item_df is not None:
-            # MovieID should be loaded to merge rating df w/ item_df
-            schema.add(StructField(movie_col, IntegerType()))
-
         # pySpark's read csv currently doesn't support multi-character delimiter, thus we manually handle that
         separator = DATA_FORMAT[size].separator
         if len(separator) > 1:
diff --git a/tests/conftest.py b/tests/conftest.py
index 3ccd16629f..480cef1f84 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,16 +4,16 @@
 # NOTE: This file is used by pytest to inject fixtures automatically. As it is explained in the documentation
 # https://docs.pytest.org/en/latest/fixture.html:
 # "If during implementing your tests you realize that you want to use a fixture function from multiple test files
-# you can move it to a conftest.py file. You don’t need to import the fixture you want to use in a test, it
+# you can move it to a conftest.py file. You don't need to import the fixture you want to use in a test, it
 # automatically gets discovered by pytest."
 
 import calendar
 import datetime
 import os
-import numpy as np
 import pandas as pd
 import pytest
 from sklearn.model_selection import train_test_split
+from tempfile import TemporaryDirectory
 
 from tests.notebooks_common import path_notebooks
 from reco_utils.common.general_utils import get_number_processors, get_physical_memory
@@ -23,6 +23,15 @@
     pass  # so the environment without spark doesn't break
 
 
+@pytest.fixture
+def tmp_dir(tmp_path_factory):
+    td = TemporaryDirectory(dir=tmp_path_factory.getbasetemp())
+    try:
+        yield td.name
+    finally:
+        td.cleanup()
+
+
 @pytest.fixture(scope="session")
 def spark(app_name="Sample", url="local[*]"):
     """Start Spark if not started.
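The tmp_dir fixture added in the conftest.py hunk above is what the rest of this series builds on: pytest injects it into any test that names it as an argument, the test receives the path of a TemporaryDirectory rooted under pytest's session base temp directory, and cleanup runs once the test returns. A minimal sketch of the consumption pattern (this sample test is illustrative only and is not part of the patch):

    import os

    def test_uses_tmp_dir(tmp_dir):
        # pytest resolves the tmp_dir argument to the fixture in conftest.py;
        # td.name yielded above is the directory path as a str
        cache_file = os.path.join(tmp_dir, "cache.dat")
        with open(cache_file, "w") as f:
            f.write("downloaded data")
        assert os.path.exists(cache_file)
        # after the test returns, td.cleanup() removes the directory and its contents

The refactored movielens tests in the following diffs use exactly this pattern, passing tmp_dir straight into the loaders as local_cache_path instead of opening their own TemporaryDirectory blocks.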
diff --git a/tests/integration/test_movielens.py b/tests/integration/test_movielens.py
index 88941dda4a..6c21836a5d 100644
--- a/tests/integration/test_movielens.py
+++ b/tests/integration/test_movielens.py
@@ -3,7 +3,6 @@
 
 import os
 import pytest
-from tempfile import TemporaryDirectory
 from reco_utils.dataset.movielens import (
     load_pandas_df,
     load_spark_df,
@@ -68,53 +67,50 @@ def test_load_pandas_df(
     title_example,
     genres_example,
     year_example,
+    tmp_dir,
 ):
-    """Test MovieLens dataset load into pd.DataFrame
+    """Test MovieLens dataset load as pd.DataFrame
     """
-    # Test if correct data are loaded and local_cache_path works
-    with TemporaryDirectory() as tmp_dir:
-        # Test if can handle different size of header columns
-        header = ["a"]
-        df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
-        assert len(df) == num_samples
-        assert len(df.columns) == max(
-            len(header), 2
-        )  # Should load at least 2 columns, user and item
-
-        # Test title, genres, and released year load
-        header = ["a", "b", "c", "d", "e"]
-        with pytest.warns(Warning):
-            df = load_pandas_df(
-                size=size,
-                local_cache_path=tmp_dir,
-                header=header,
-                title_col="Title",
-                genres_col="Genres",
-                year_col="Year",
-            )
-        assert len(df) == num_samples
-        assert (
-            len(df.columns) == 7
-        )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
-        assert "e" not in df.columns  # only the first 4 header columns are used
-        # Get two records of the same items and check if the item-features are the same.
-        head = df.loc[df["b"] == movie_example][:2]
-        title = head["Title"].values
-        assert title[0] == title[1]
-        assert title[0] == title_example
-        genres = head["Genres"].values
-        assert genres[0] == genres[1]
-        assert genres[0] == genres_example
-        year = head["Year"].values
-        assert year[0] == year[1]
-        assert year[0] == year_example
+    # Test if correct data are loaded
+    header = ["a", "b", "c"]
+    df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
+    assert len(df) == num_samples
+    assert len(df.columns) == len(header)
+    # Test if raw-zip file, rating file, and item file are cached
+    assert len(os.listdir(tmp_dir)) == 3
 
-        # Test if raw-zip file, rating file, and item file are cached
-        assert len(os.listdir(tmp_dir)) == 3
+    # Test title, genres, and released year load
+    header = ["a", "b", "c", "d", "e"]
+    with pytest.warns(Warning):
+        df = load_pandas_df(
+            size=size,
+            header=header,
+            local_cache_path=tmp_dir,
+            title_col="Title",
+            genres_col="Genres",
+            year_col="Year",
+        )
+    assert len(df) == num_samples
+    assert (
+        len(df.columns) == 7
+    )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
+    assert "e" not in df.columns  # only the first 4 header columns are used
+    # Get two records of the same items and check if the item-features are the same.
+    head = df.loc[df["b"] == movie_example][:2]
+    title = head["Title"].values
+    assert title[0] == title[1]
+    assert title[0] == title_example
+    genres = head["Genres"].values
+    assert genres[0] == genres[1]
+    assert genres[0] == genres_example
+    year = head["Year"].values
+    assert year[0] == year[1]
+    assert year[0] == year_example
 
     # Test default arguments
     df = load_pandas_df(size)
     assert len(df) == num_samples
+    # user, item, rating and timestamp
     assert len(df.columns) == 4
@@ -142,32 +138,31 @@ def test_load_pandas_df(
     ],
 )
 def test_load_item_df(
-    size, num_movies, movie_example, title_example, genres_example, year_example
+    size,
+    num_movies,
+    movie_example,
+    title_example,
+    genres_example,
+    year_example,
+    tmp_dir,
 ):
     """Test movielens item data load (not rating data)
     """
-    with TemporaryDirectory() as tmp_dir:
-        df = load_item_df(
-            size, local_cache_path=tmp_dir, movie_col=None, title_col="title"
-        )
-        assert len(df) == num_movies
-        assert len(df.columns) == 1  # Only title column should be loaded
-        assert df["title"][0] == title_example
+    df = load_item_df(size, local_cache_path=tmp_dir, title_col="title")
+    assert len(df) == num_movies
+    # movie_col and title_col should be loaded
+    assert len(df.columns) == 2
+    assert df["title"][0] == title_example
 
-        # Test title and genres
-        df = load_item_df(
-            size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres"
-        )
-        assert len(df) == num_movies
-        assert len(df.columns) == 2  # movile_col and genres_col
-        assert df["item"][0] == movie_example
-        assert df["genres"][0] == genres_example
+    # Test title and genres
+    df = load_item_df(size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres", year_col="year")
+    assert len(df) == num_movies
+    # movie_col, genres_col and year_col
+    assert len(df.columns) == 3
 
-        # Test release year
-        df = load_item_df(size, local_cache_path=tmp_dir, year_col="year")
-        assert len(df) == num_movies
-        assert len(df.columns) == 2  # movile_col (default) and year_col
-        assert df["year"][0] == year_example
+    assert df["item"][0] == movie_example
+    assert df["genres"][0] == genres_example
+    assert df["year"][0] == year_example
 
 
 @pytest.mark.integration
@@ -212,78 +207,82 @@ def test_load_spark_df(
     title_example,
     genres_example,
     year_example,
+    tmp_dir,
 ):
     """Test MovieLens dataset load into pySpark.DataFrame
     """
     spark = start_or_get_spark("MovieLensLoaderTesting")
 
-    # Test if correct data are loaded and local_cache_path works
-    with TemporaryDirectory() as tmp_dir:
-        # Test if can handle different size of header columns
-        header = ["1", "2"]
-        schema = StructType([StructField("u", IntegerType())])
-        with pytest.warns(Warning):
-            # Test if schema is used when both schema and header are provided
-            df = load_spark_df(
-                spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
-            )
-        assert df.count() == num_samples
-        assert len(df.columns) == len(schema)
-
-        # Test title, genres, and released year load
-        header = ["a", "b", "c", "d", "e"]
-        with pytest.warns(Warning):
-            df = load_spark_df(
-                spark,
-                size=size,
-                local_cache_path=tmp_dir,
-                header=header,
-                title_col="Title",
-                genres_col="Genres",
-                year_col="Year",
-            )
-        assert df.count() == num_samples
-        assert (
-            len(df.columns) == 7
-        )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
-        assert "e" not in df.columns  # only the first 4 header columns are used
-        # Get two records of the same items and check if the item-features are the same.
-        head = df.filter(col("b") == movie_example).limit(2)
-        title = head.select("Title").collect()
-        assert title[0][0] == title[1][0]
-        assert title[0][0] == title_example
-        genres = head.select("Genres").collect()
-        assert genres[0][0] == genres[1][0]
-        assert genres[0][0] == genres_example
-        year = head.select("Year").collect()
-        assert year[0][0] == year[1][0]
-        assert year[0][0] == year_example
-
+    # Test if correct data are loaded
+    header = ["1", "2", "3"]
+    schema = StructType(
+        [
+            StructField("u", IntegerType()),
+            StructField("m", IntegerType()),
+        ]
+    )
+    with pytest.warns(Warning):
+        df = load_spark_df(
+            spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
+        )
+    assert df.count() == num_samples
+    # Test if schema is used when both schema and header are provided
+    assert len(df.columns) == len(schema)
     # Test if raw-zip file, rating file, and item file are cached
     assert len(os.listdir(tmp_dir)) == 3
 
+    # Test title, genres, and released year load
+    header = ["a", "b", "c", "d", "e"]
+    with pytest.warns(Warning):
+        df = load_spark_df(
+            spark,
+            size=size,
+            local_cache_path=tmp_dir,
+            header=header,
+            title_col="Title",
+            genres_col="Genres",
+            year_col="Year",
+        )
+    assert df.count() == num_samples
+    assert (
+        len(df.columns) == 7
+    )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
+    assert "e" not in df.columns  # only the first 4 header columns are used
+    # Get two records of the same items and check if the item-features are the same.
+    head = df.filter(col("b") == movie_example).limit(2)
+    title = head.select("Title").collect()
+    assert title[0][0] == title[1][0]
+    assert title[0][0] == title_example
+    genres = head.select("Genres").collect()
+    assert genres[0][0] == genres[1][0]
+    assert genres[0][0] == genres_example
+    year = head.select("Year").collect()
+    assert year[0][0] == year[1][0]
+    assert year[0][0] == year_example
+
     # Test default arguments
     df = load_spark_df(spark, size)
     assert df.count() == num_samples
+    # user, item, rating and timestamp
     assert len(df.columns) == 4
 
 
 @pytest.mark.integration
 @pytest.mark.parametrize("size", ["1m", "10m", "20m"])
-def test_download_and_extract_movielens(size):
+def test_download_and_extract_movielens(size, tmp_dir):
     """Test movielens data download and extract
     """
-    with TemporaryDirectory() as tmp_dir:
-        zip_path = os.path.join(tmp_dir, "ml.zip")
-        download_movielens(size, dest_path=zip_path)
-        assert len(os.listdir(tmp_dir)) == 1
-        assert os.path.exists(zip_path)
+    zip_path = os.path.join(tmp_dir, "ml.zip")
+    download_movielens(size, dest_path=zip_path)
+    assert len(os.listdir(tmp_dir)) == 1
+    assert os.path.exists(zip_path)
 
-        rating_path = os.path.join(tmp_dir, "rating.dat")
-        item_path = os.path.join(tmp_dir, "item.dat")
-        extract_movielens(
-            size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
-        )
-        assert len(os.listdir(tmp_dir)) == 3
-        assert os.path.exists(rating_path)
-        assert os.path.exists(item_path)
+    rating_path = os.path.join(tmp_dir, "rating.dat")
+    item_path = os.path.join(tmp_dir, "item.dat")
+    extract_movielens(
+        size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
+    )
+    # Test if raw-zip file, rating file, and item file are cached
+    assert len(os.listdir(tmp_dir)) == 3
+    assert os.path.exists(rating_path)
+    assert os.path.exists(item_path)
diff --git a/tests/integration/test_notebooks_gpu.py b/tests/integration/test_notebooks_gpu.py
index 005775a7d7..220941cf54 100644
--- a/tests/integration/test_notebooks_gpu.py
+++ b/tests/integration/test_notebooks_gpu.py
@@ -148,10 +148,9 @@ def test_fastai_integration(notebooks, size, epochs, expected_values):
         )
     ],
 )
-def test_wide_deep(notebooks, size, epochs, expected_values, tmpdir):
+def test_wide_deep(notebooks, size, epochs, expected_values, tmp_dir):
     notebook_path = notebooks["wide_deep"]
 
-    tmp_dir = str(tmpdir.mkdir("wide_deep"))
     params = {
         "MOVIELENS_DATA_SIZE": size,
         "EPOCHS": epochs,
diff --git a/tests/smoke/test_movielens.py b/tests/smoke/test_movielens.py
index 0816b5f61c..ae0cc6a67d 100644
--- a/tests/smoke/test_movielens.py
+++ b/tests/smoke/test_movielens.py
@@ -3,7 +3,6 @@
 
 import os
 import pytest
-from tempfile import TemporaryDirectory
 from reco_utils.dataset.movielens import (
     load_pandas_df,
     load_spark_df,
@@ -50,53 +49,50 @@ def test_load_pandas_df(
     title_example,
     genres_example,
     year_example,
+    tmp_dir,
 ):
-    """Test MovieLens dataset load into pd.DataFrame
+    """Test MovieLens dataset load as pd.DataFrame
     """
-    # Test if correct data are loaded and local_cache_path works
-    with TemporaryDirectory() as tmp_dir:
-        # Test if can handle different size of header columns
-        header = ["a"]
-        df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
+    # Test if correct data are loaded
+    header = ["a", "b", "c"]
+    df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
+    assert len(df) == num_samples
+    assert len(df.columns) == len(header)
+    # Test if raw-zip file, rating file, and item file are cached
+    assert len(os.listdir(tmp_dir)) == 3
+
+    # Test title, genres, and released year load
+    header = ["a", "b", "c", "d", "e"]
+    with pytest.warns(Warning):
+        df = load_pandas_df(
+            size=size,
+            header=header,
+            local_cache_path=tmp_dir,
+            title_col="Title",
+            genres_col="Genres",
+            year_col="Year",
+        )
     assert len(df) == num_samples
-    assert len(df.columns) == max(
-        len(header), 2
-    )  # Should load at least 2 columns, user and item
-
-    # Test title, genres, and released year load
-    header = ["a", "b", "c", "d", "e"]
-    with pytest.warns(Warning):
-        df = load_pandas_df(
-            size=size,
-            local_cache_path=tmp_dir,
-            header=header,
-            title_col="Title",
-            genres_col="Genres",
-            year_col="Year",
-        )
-    assert len(df) == num_samples
-    assert (
-        len(df.columns) == 7
-    )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
-    assert "e" not in df.columns  # only the first 4 header columns are used
-    # Get two records of the same items and check if the item-features are the same.
-    head = df.loc[df["b"] == movie_example][:2]
-    title = head["Title"].values
-    assert title[0] == title[1]
-    assert title[0] == title_example
-    genres = head["Genres"].values
-    assert genres[0] == genres[1]
-    assert genres[0] == genres_example
-    year = head["Year"].values
-    assert year[0] == year[1]
-    assert year[0] == year_example
-
-    # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert (
+        len(df.columns) == 7
+    )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
+    assert "e" not in df.columns  # only the first 4 header columns are used
+    # Get two records of the same items and check if the item-features are the same.
+    head = df.loc[df["b"] == movie_example][:2]
+    title = head["Title"].values
+    assert title[0] == title[1]
+    assert title[0] == title_example
+    genres = head["Genres"].values
+    assert genres[0] == genres[1]
+    assert genres[0] == genres_example
+    year = head["Year"].values
+    assert year[0] == year[1]
+    assert year[0] == year_example
 
     # Test default arguments
     df = load_pandas_df(size)
     assert len(df) == num_samples
+    # user, item, rating and timestamp
     assert len(df.columns) == 4
@@ -106,32 +102,31 @@ def test_load_pandas_df(
     [("100k", 1682, 1, "Toy Story (1995)", "Animation|Children's|Comedy", "1995")],
 )
 def test_load_item_df(
-    size, num_movies, movie_example, title_example, genres_example, year_example
+    size,
+    num_movies,
+    movie_example,
+    title_example,
+    genres_example,
+    year_example,
+    tmp_dir,
 ):
     """Test movielens item data load (not rating data)
     """
-    with TemporaryDirectory() as tmp_dir:
-        df = load_item_df(
-            size, local_cache_path=tmp_dir, movie_col=None, title_col="title"
-        )
-        assert len(df) == num_movies
-        assert len(df.columns) == 1  # Only title column should be loaded
-        assert df["title"][0] == title_example
+    df = load_item_df(size, local_cache_path=tmp_dir, title_col="title")
+    assert len(df) == num_movies
+    # movie_col and title_col should be loaded
+    assert len(df.columns) == 2
+    assert df["title"][0] == title_example
 
-        # Test title and genres
-        df = load_item_df(
-            size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres"
-        )
-        assert len(df) == num_movies
-        assert len(df.columns) == 2  # movile_col and genres_col
-        assert df["item"][0] == movie_example
-        assert df["genres"][0] == genres_example
+    # Test title and genres
+    df = load_item_df(size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres", year_col="year")
+    assert len(df) == num_movies
+    # movie_col, genres_col and year_col
+    assert len(df.columns) == 3
 
-        # Test release year
-        df = load_item_df(size, local_cache_path=tmp_dir, year_col="year")
-        assert len(df) == num_movies
-        assert len(df.columns) == 2  # movile_col (default) and year_col
-        assert df["year"][0] == year_example
+    assert df["item"][0] == movie_example
+    assert df["genres"][0] == genres_example
+    assert df["year"][0] == year_example
 
 
 @pytest.mark.smoke
@@ -158,78 +153,82 @@ def test_load_spark_df(
     title_example,
     genres_example,
     year_example,
+    tmp_dir,
 ):
     """Test MovieLens dataset load into pySpark.DataFrame
     """
     spark = start_or_get_spark("MovieLensLoaderTesting")
 
-    # Test if correct data are loaded and local_cache_path works
-    with TemporaryDirectory() as tmp_dir:
-        # Test if can handle different size of header columns
-        header = ["1", "2"]
-        schema = StructType([StructField("u", IntegerType())])
-        with pytest.warns(Warning):
-            # Test if schema is used when both schema and header are provided
-            df = load_spark_df(
-                spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
-            )
-        assert df.count() == num_samples
-        assert len(df.columns) == len(schema)
-
-        # Test title, genres, and released year load
-        header = ["a", "b", "c", "d", "e"]
-        with pytest.warns(Warning):
-            df = load_spark_df(
-                spark,
-                size=size,
-                local_cache_path=tmp_dir,
-                header=header,
-                title_col="Title",
-                genres_col="Genres",
-                year_col="Year",
-            )
-        assert df.count() == num_samples
-        assert (
-            len(df.columns) == 7
-        )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
-        assert "e" not in df.columns  # only the first 4 header columns are used
-        # Get two records of the same items and check if the item-features are the same.
-        head = df.filter(col("b") == movie_example).limit(2)
-        title = head.select("Title").collect()
-        assert title[0][0] == title[1][0]
-        assert title[0][0] == title_example
-        genres = head.select("Genres").collect()
-        assert genres[0][0] == genres[1][0]
-        assert genres[0][0] == genres_example
-        year = head.select("Year").collect()
-        assert year[0][0] == year[1][0]
-        assert year[0][0] == year_example
-
+    # Test if correct data are loaded
+    header = ["1", "2", "3"]
+    schema = StructType(
+        [
+            StructField("u", IntegerType()),
+            StructField("m", IntegerType()),
+        ]
+    )
+    with pytest.warns(Warning):
+        df = load_spark_df(
+            spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
+        )
+    assert df.count() == num_samples
+    # Test if schema is used when both schema and header are provided
+    assert len(df.columns) == len(schema)
     # Test if raw-zip file, rating file, and item file are cached
     assert len(os.listdir(tmp_dir)) == 3
 
+    # Test title, genres, and released year load
+    header = ["a", "b", "c", "d", "e"]
+    with pytest.warns(Warning):
+        df = load_spark_df(
+            spark,
+            size=size,
+            local_cache_path=tmp_dir,
+            header=header,
+            title_col="Title",
+            genres_col="Genres",
+            year_col="Year",
+        )
+    assert df.count() == num_samples
+    assert (
+        len(df.columns) == 7
+    )  # 4 header columns (user, item, rating, timestamp) and 3 feature columns
+    assert "e" not in df.columns  # only the first 4 header columns are used
+    # Get two records of the same items and check if the item-features are the same.
+    head = df.filter(col("b") == movie_example).limit(2)
+    title = head.select("Title").collect()
+    assert title[0][0] == title[1][0]
+    assert title[0][0] == title_example
+    genres = head.select("Genres").collect()
+    assert genres[0][0] == genres[1][0]
+    assert genres[0][0] == genres_example
+    year = head.select("Year").collect()
+    assert year[0][0] == year[1][0]
+    assert year[0][0] == year_example
+
     # Test default arguments
     df = load_spark_df(spark, size)
     assert df.count() == num_samples
+    # user, item, rating and timestamp
     assert len(df.columns) == 4
 
 
 @pytest.mark.smoke
 @pytest.mark.parametrize("size", ["100k"])
-def test_download_and_extract_movielens(size):
+def test_download_and_extract_movielens(size, tmp_dir):
     """Test movielens data download and extract
     """
-    with TemporaryDirectory() as tmp_dir:
-        zip_path = os.path.join(tmp_dir, "ml.zip")
-        download_movielens(size, dest_path=zip_path)
-        assert len(os.listdir(tmp_dir)) == 1
-        assert os.path.exists(zip_path)
-
-        rating_path = os.path.join(tmp_dir, "rating.dat")
-        item_path = os.path.join(tmp_dir, "item.dat")
-        extract_movielens(
-            size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
-        )
-        assert len(os.listdir(tmp_dir)) == 3
-        assert os.path.exists(rating_path)
-        assert os.path.exists(item_path)
+    zip_path = os.path.join(tmp_dir, "ml.zip")
+    download_movielens(size, dest_path=zip_path)
+    assert len(os.listdir(tmp_dir)) == 1
+    assert os.path.exists(zip_path)
+
+    rating_path = os.path.join(tmp_dir, "rating.dat")
+    item_path = os.path.join(tmp_dir, "item.dat")
+    extract_movielens(
+        size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
+    )
+    # Test if raw-zip file, rating file, and item file are cached
+    assert len(os.listdir(tmp_dir)) == 3
+    assert os.path.exists(rating_path)
+    assert os.path.exists(item_path)
diff --git a/tests/smoke/test_notebooks_gpu.py b/tests/smoke/test_notebooks_gpu.py
index 55faf584e2..2ab1a0f6dc 100644
--- a/tests/smoke/test_notebooks_gpu.py
+++ b/tests/smoke/test_notebooks_gpu.py
@@ -127,10 +127,9 @@ def test_notebook_dkn(notebooks):
 
 @pytest.mark.smoke
 @pytest.mark.gpu
-def test_wide_deep(notebooks, tmpdir):
+def test_wide_deep(notebooks, tmp_dir):
     notebook_path = notebooks["wide_deep"]
 
-    tmp_dir = str(tmpdir.mkdir("wide_deep_0"))
     params = {
         "MOVIELENS_DATA_SIZE": "100k",
         "EPOCHS": 1,
diff --git a/tests/unit/test_notebooks_gpu.py b/tests/unit/test_notebooks_gpu.py
index d7d17bca06..c71fbe8af9 100644
--- a/tests/unit/test_notebooks_gpu.py
+++ b/tests/unit/test_notebooks_gpu.py
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 
+import os
 import pytest
 from reco_utils.common.gpu_utils import get_number_gpus
 from tests.notebooks_common import OUTPUT_NOTEBOOK, KERNEL_NAME
@@ -55,18 +56,19 @@ def test_ncf_deep_dive(notebooks):
 
 @pytest.mark.notebooks
 @pytest.mark.gpu
-def test_wide_deep(notebooks, tmpdir):
+def test_wide_deep(notebooks, tmp_dir):
     notebook_path = notebooks["wide_deep"]
-    tmp_dir = str(tmpdir.mkdir("wide_deep_0"))
+    model_dir = os.path.join(tmp_dir, "wide_deep_0")
+    os.mkdir(model_dir)
     params = {
         'MOVIELENS_DATA_SIZE': '100k',
         'EPOCHS': 0,
         'EVALUATE_WHILE_TRAINING': False,
-        'MODEL_DIR': tmp_dir,
-        'EXPORT_DIR_BASE': tmp_dir,
-        'RATING_METRICS': ['rmse', 'mae'],
-        'RANKING_METRICS': ['ndcg_at_k', 'precision_at_k'],
+        'MODEL_DIR': model_dir,
+        'EXPORT_DIR_BASE': model_dir,
+        'RATING_METRICS': ['rmse'],
+        'RANKING_METRICS': ['ndcg_at_k'],
     }
     pm.execute_notebook(
         notebook_path,
@@ -75,15 +77,16 @@ def test_wide_deep(notebooks, tmp_dir):
         parameters=params,
     )
 
-    # Test w/ different settings
-    tmp_dir = str(tmpdir.mkdir("wide_deep_1"))
+    # Test w/o item features
+    model_dir = os.path.join(tmp_dir, "wide_deep_1")
+    os.mkdir(model_dir)
     params = {
         'MOVIELENS_DATA_SIZE': '100k',
         'EPOCHS': 0,
         'ITEM_FEAT_COL': None,
         'EVALUATE_WHILE_TRAINING': True,
-        'MODEL_DIR': tmp_dir,
-        'EXPORT_DIR_BASE': tmp_dir,
+        'MODEL_DIR': model_dir,
+        'EXPORT_DIR_BASE': model_dir,
         'RATING_METRICS': ['rsquared'],
         'RANKING_METRICS': ['map_at_k'],
     }

From 94c83a31973d99da36f87a56261045d902af3c25 Mon Sep 17 00:00:00 2001
From: Jun Ki Min <42475935+loomlike@users.noreply.github.com>
Date: Wed, 3 Apr 2019 20:45:46 -0400
Subject: [PATCH 2/3] tmp fixture to use context manager

Change to use context manager
Change fixture name from tmp_dir to tmp
---
 tests/conftest.py                       |  9 +++----
 tests/integration/test_movielens.py     | 34 ++++++++++++-------------
 tests/integration/test_notebooks_gpu.py |  6 ++---
 tests/smoke/test_movielens.py           | 34 ++++++++++++-------------
 tests/smoke/test_notebooks_gpu.py       |  6 ++---
 tests/unit/test_notebooks_gpu.py        |  6 ++---
 6 files changed, 46 insertions(+), 49 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 480cef1f84..3901c1a026 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -24,12 +24,9 @@
 
 @pytest.fixture
-def tmp_dir(tmp_path_factory):
-    td = TemporaryDirectory(dir=tmp_path_factory.getbasetemp())
-    try:
-        yield td.name
-    finally:
-        td.cleanup()
+def tmp(tmp_path_factory):
+    with TemporaryDirectory(dir=tmp_path_factory.getbasetemp()) as td:
+        yield td
 
 
 @pytest.fixture(scope="session")
diff --git a/tests/integration/test_movielens.py b/tests/integration/test_movielens.py
index 6c21836a5d..aad20b46c3 100644
--- a/tests/integration/test_movielens.py
+++ b/tests/integration/test_movielens.py
@@ -67,17 +67,17 @@ def test_load_pandas_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test MovieLens dataset load as pd.DataFrame
     """
     # Test if correct data are loaded
     header = ["a", "b", "c"]
-    df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
+    df = load_pandas_df(size=size, local_cache_path=tmp, header=header)
     assert len(df) == num_samples
     assert len(df.columns) == len(header)
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
 
     # Test title, genres, and released year load
     header = ["a", "b", "c", "d", "e"]
@@ -85,7 +85,7 @@ def test_load_pandas_df(
         df = load_pandas_df(
             size=size,
             header=header,
-            local_cache_path=tmp_dir,
+            local_cache_path=tmp,
             title_col="Title",
             genres_col="Genres",
             year_col="Year",
@@ -144,18 +144,18 @@ def test_load_item_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test movielens item data load (not rating data)
     """
-    df = load_item_df(size, local_cache_path=tmp_dir, title_col="title")
+    df = load_item_df(size, local_cache_path=tmp, title_col="title")
     assert len(df) == num_movies
     # movie_col and title_col should be loaded
     assert len(df.columns) == 2
     assert df["title"][0] == title_example
 
     # Test title and genres
-    df = load_item_df(size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres", year_col="year")
+    df = load_item_df(size, local_cache_path=tmp, movie_col="item", genres_col="genres", year_col="year")
     assert len(df) == num_movies
     # movie_col, genres_col and year_col
     assert len(df.columns) == 3
@@ -207,7 +207,7 @@ def test_load_spark_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test MovieLens dataset load into pySpark.DataFrame
     """
@@ -223,13 +223,13 @@ def test_load_spark_df(
     )
     with pytest.warns(Warning):
         df = load_spark_df(
-            spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
+            spark, size=size, local_cache_path=tmp, header=header, schema=schema
         )
     assert df.count() == num_samples
     # Test if schema is used when both schema and header are provided
     assert len(df.columns) == len(schema)
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
 
     # Test title, genres, and released year load
     header = ["a", "b", "c", "d", "e"]
@@ -237,7 +237,7 @@ def test_load_spark_df(
         df = load_spark_df(
             spark,
             size=size,
-            local_cache_path=tmp_dir,
+            local_cache_path=tmp,
             header=header,
             title_col="Title",
             genres_col="Genres",
@@ -269,20 +269,20 @@ def test_load_spark_df(
 
 @pytest.mark.integration
 @pytest.mark.parametrize("size", ["1m", "10m", "20m"])
-def test_download_and_extract_movielens(size, tmp_dir):
+def test_download_and_extract_movielens(size, tmp):
     """Test movielens data download and extract
     """
-    zip_path = os.path.join(tmp_dir, "ml.zip")
+    zip_path = os.path.join(tmp, "ml.zip")
     download_movielens(size, dest_path=zip_path)
-    assert len(os.listdir(tmp_dir)) == 1
+    assert len(os.listdir(tmp)) == 1
     assert os.path.exists(zip_path)
 
-    rating_path = os.path.join(tmp_dir, "rating.dat")
-    item_path = os.path.join(tmp_dir, "item.dat")
+    rating_path = os.path.join(tmp, "rating.dat")
+    item_path = os.path.join(tmp, "item.dat")
     extract_movielens(
         size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
     )
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
     assert os.path.exists(rating_path)
     assert os.path.exists(item_path)
diff --git a/tests/integration/test_notebooks_gpu.py b/tests/integration/test_notebooks_gpu.py
index 220941cf54..aaad2bd34b 100644
--- a/tests/integration/test_notebooks_gpu.py
+++ b/tests/integration/test_notebooks_gpu.py
@@ -148,15 +148,15 @@ def test_fastai_integration(notebooks, size, epochs, expected_values):
         )
     ],
 )
-def test_wide_deep(notebooks, size, epochs, expected_values, tmp_dir):
+def test_wide_deep(notebooks, size, epochs, expected_values, tmp):
     notebook_path = notebooks["wide_deep"]
 
     params = {
         "MOVIELENS_DATA_SIZE": size,
         "EPOCHS": epochs,
         "EVALUATE_WHILE_TRAINING": False,
-        "MODEL_DIR": tmp_dir,
-        "EXPORT_DIR_BASE": tmp_dir,
+        "MODEL_DIR": tmp,
+        "EXPORT_DIR_BASE": tmp,
         "RATING_METRICS": ["rmse", "mae", "rsquared", "exp_var"],
         "RANKING_METRICS": ["ndcg_at_k", "map_at_k", "precision_at_k", "recall_at_k"],
     }
diff --git a/tests/smoke/test_movielens.py b/tests/smoke/test_movielens.py
index ae0cc6a67d..30c9ec1c83 100644
--- a/tests/smoke/test_movielens.py
+++ b/tests/smoke/test_movielens.py
@@ -49,17 +49,17 @@ def test_load_pandas_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test MovieLens dataset load as pd.DataFrame
     """
     # Test if correct data are loaded
     header = ["a", "b", "c"]
-    df = load_pandas_df(size=size, local_cache_path=tmp_dir, header=header)
+    df = load_pandas_df(size=size, local_cache_path=tmp, header=header)
     assert len(df) == num_samples
     assert len(df.columns) == len(header)
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
 
     # Test title, genres, and released year load
     header = ["a", "b", "c", "d", "e"]
@@ -67,7 +67,7 @@ def test_load_pandas_df(
         df = load_pandas_df(
             size=size,
             header=header,
-            local_cache_path=tmp_dir,
+            local_cache_path=tmp,
             title_col="Title",
             genres_col="Genres",
             year_col="Year",
@@ -108,18 +108,18 @@ def test_load_item_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test movielens item data load (not rating data)
     """
-    df = load_item_df(size, local_cache_path=tmp_dir, title_col="title")
+    df = load_item_df(size, local_cache_path=tmp, title_col="title")
     assert len(df) == num_movies
     # movie_col and title_col should be loaded
     assert len(df.columns) == 2
     assert df["title"][0] == title_example
 
     # Test title and genres
-    df = load_item_df(size, local_cache_path=tmp_dir, movie_col="item", genres_col="genres", year_col="year")
+    df = load_item_df(size, local_cache_path=tmp, movie_col="item", genres_col="genres", year_col="year")
     assert len(df) == num_movies
     # movie_col, genres_col and year_col
     assert len(df.columns) == 3
@@ -153,7 +153,7 @@ def test_load_spark_df(
     title_example,
     genres_example,
     year_example,
-    tmp_dir,
+    tmp,
 ):
     """Test MovieLens dataset load into pySpark.DataFrame
     """
@@ -169,13 +169,13 @@ def test_load_spark_df(
     )
     with pytest.warns(Warning):
         df = load_spark_df(
-            spark, size=size, local_cache_path=tmp_dir, header=header, schema=schema
+            spark, size=size, local_cache_path=tmp, header=header, schema=schema
         )
     assert df.count() == num_samples
     # Test if schema is used when both schema and header are provided
     assert len(df.columns) == len(schema)
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
 
     # Test title, genres, and released year load
     header = ["a", "b", "c", "d", "e"]
@@ -183,7 +183,7 @@ def test_load_spark_df(
         df = load_spark_df(
             spark,
             size=size,
-            local_cache_path=tmp_dir,
+            local_cache_path=tmp,
             header=header,
             title_col="Title",
             genres_col="Genres",
@@ -215,20 +215,20 @@ def test_load_spark_df(
 
 @pytest.mark.smoke
 @pytest.mark.parametrize("size", ["100k"])
-def test_download_and_extract_movielens(size, tmp_dir):
+def test_download_and_extract_movielens(size, tmp):
     """Test movielens data download and extract
     """
-    zip_path = os.path.join(tmp_dir, "ml.zip")
+    zip_path = os.path.join(tmp, "ml.zip")
     download_movielens(size, dest_path=zip_path)
-    assert len(os.listdir(tmp_dir)) == 1
+    assert len(os.listdir(tmp)) == 1
     assert os.path.exists(zip_path)
 
-    rating_path = os.path.join(tmp_dir, "rating.dat")
-    item_path = os.path.join(tmp_dir, "item.dat")
+    rating_path = os.path.join(tmp, "rating.dat")
+    item_path = os.path.join(tmp, "item.dat")
     extract_movielens(
         size, rating_path=rating_path, item_path=item_path, zip_path=zip_path
     )
     # Test if raw-zip file, rating file, and item file are cached
-    assert len(os.listdir(tmp_dir)) == 3
+    assert len(os.listdir(tmp)) == 3
     assert os.path.exists(rating_path)
     assert os.path.exists(item_path)
diff --git a/tests/smoke/test_notebooks_gpu.py b/tests/smoke/test_notebooks_gpu.py
index 2ab1a0f6dc..95d6d70961 100644
--- a/tests/smoke/test_notebooks_gpu.py
+++ b/tests/smoke/test_notebooks_gpu.py
@@ -127,15 +127,15 @@ def test_notebook_dkn(notebooks):
 
 @pytest.mark.smoke
 @pytest.mark.gpu
-def test_wide_deep(notebooks, tmp_dir):
+def test_wide_deep(notebooks, tmp):
     notebook_path = notebooks["wide_deep"]
 
     params = {
         "MOVIELENS_DATA_SIZE": "100k",
         "EPOCHS": 1,
         "EVALUATE_WHILE_TRAINING": False,
-        "MODEL_DIR": tmp_dir,
-        "EXPORT_DIR_BASE": tmp_dir,
+        "MODEL_DIR": tmp,
+        "EXPORT_DIR_BASE": tmp,
         "RATING_METRICS": ["rmse", "mae"],
         "RANKING_METRICS": ["ndcg_at_k", "precision_at_k"],
     }
diff --git a/tests/unit/test_notebooks_gpu.py b/tests/unit/test_notebooks_gpu.py
index c71fbe8af9..7e6f48c175 100644
--- a/tests/unit/test_notebooks_gpu.py
+++ b/tests/unit/test_notebooks_gpu.py
@@ -56,10 +56,10 @@ def test_ncf_deep_dive(notebooks):
 
 @pytest.mark.notebooks
 @pytest.mark.gpu
-def test_wide_deep(notebooks, tmp_dir):
+def test_wide_deep(notebooks, tmp):
     notebook_path = notebooks["wide_deep"]
-    model_dir = os.path.join(tmp_dir, "wide_deep_0")
+    model_dir = os.path.join(tmp, "wide_deep_0")
     os.mkdir(model_dir)
     params = {
         'MOVIELENS_DATA_SIZE': '100k',
@@ -78,7 +78,7 @@ def test_wide_deep(notebooks, tmp_dir):
     )
 
     # Test w/o item features
-    model_dir = os.path.join(tmp_dir, "wide_deep_1")
+    model_dir = os.path.join(tmp, "wide_deep_1")
     os.mkdir(model_dir)
     params = {
         'MOVIELENS_DATA_SIZE': '100k',

From 570500e3b212b11558543558b6184ea6a88fbf73 Mon Sep 17 00:00:00 2001
From: Jun Ki Min <42475935+loomlike@users.noreply.github.com>
Date: Thu, 4 Apr 2019 11:03:11 -0400
Subject: [PATCH 3/3] Update conftest description

---
 tests/conftest.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 3901c1a026..81778cce8d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,8 +4,9 @@
 # NOTE: This file is used by pytest to inject fixtures automatically. As it is explained in the documentation
 # https://docs.pytest.org/en/latest/fixture.html:
 # "If during implementing your tests you realize that you want to use a fixture function from multiple test files
-# you can move it to a conftest.py file. You don't need to import the fixture you want to use in a test, it
-# automatically gets discovered by pytest."
+# you can move it to a conftest.py file. You don't need to import the module in which you defined your fixtures
+# in order to use them in a test; they are discovered automatically by pytest, so you can simply receive fixture
+# objects by naming them as input arguments in the test."
 
 import calendar
 import datetime
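After all three patches, tests/conftest.py is left with the context-manager form of the fixture, renamed to tmp. A consolidated sketch of that final state (restating the hunks above, not new behavior):

    import pytest
    from tempfile import TemporaryDirectory


    @pytest.fixture
    def tmp(tmp_path_factory):
        # Entering TemporaryDirectory as a context manager returns the directory
        # path as a str; exiting it, after the test resumes this generator,
        # deletes the directory, replacing the earlier explicit try/finally
        # around td.cleanup().
        with TemporaryDirectory(dir=tmp_path_factory.getbasetemp()) as td:
            yield td

Rooting the directory under tmp_path_factory.getbasetemp() keeps each test's temporary directory inside pytest's session-scoped base temp directory, so test artifacts never land outside the test run's workspace even if a test fails mid-way.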