diff --git a/topic/machine-learning/automl/pyproject.toml b/topic/machine-learning/automl/pyproject.toml
index 97913613..b1677e1f 100644
--- a/topic/machine-learning/automl/pyproject.toml
+++ b/topic/machine-learning/automl/pyproject.toml
@@ -19,38 +19,6 @@ xfail_strict = true
 markers = [
 ]
 
-# pytest-notebook settings
-nb_test_files = true
-nb_coverage = false
-# Default cell timeout is 120 seconds. For heavy computing, it needs to be increased.
-nb_exec_timeout = 240
-nb_diff_replace = [
-    # Compensate output of `crash`.
-    '"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
-    # Compensate other outputs.
-    '"/cells/*/outputs/*/data/text/html" "T_....." "T_na"',
-    '"/cells/*/outputs/*/data/text/plain" "IPython.core.display.HTML object" "pandas.io.formats.style.Styler"',
-    '"/cells/*/outputs/*/data/text/plain" "pandas.io.formats.style.Styler at 0x.+" "pandas.io.formats.style.Styler"',
-    '"/cells/*/outputs/*/data/application/vnd.jupyter.widget-view+json" "model_id: .+" "model_id: na"',
-    '"/cells/*/outputs/*/data/text/html" "\>\d+\.\d+\<\/td\>" "0.3333"',
-]
-# `vector_search.py` does not include any output(s).
-nb_diff_ignore = [
-    "/metadata/language_info",
-    "/metadata/widgets",
-    "/cells/*/execution_count",
-    "/cells/*/outputs/*/execution_count",
-    "/cells/*/outputs/*/metadata/nbreg",
-    # Ignore images.
-    "/cells/*/outputs/*/data/image/png",
-    # Ignore all cell output. It is too tedious to compare and maintain.
-    # The validation hereby extends exclusively to the _execution_ of notebook cells,
-    # able to catch syntax errors, module import flaws, and runtime errors.
-    # However, the validation will not catch any regressions on actual cell output,
-    # or whether any output is produced at all.
-    "/cells/*/outputs",
-]
-
 [tool.coverage.run]
 branch = false
 
diff --git a/topic/machine-learning/automl/requirements.txt b/topic/machine-learning/automl/requirements.txt
index 2fb81e36..c11b4335 100644
--- a/topic/machine-learning/automl/requirements.txt
+++ b/topic/machine-learning/automl/requirements.txt
@@ -5,6 +5,7 @@ plotly<5.21
 pycaret[models,parallel,test]==3.3.1
 pydantic<2
 python-dotenv<2
+sqlalchemy==2.*
 
 # Development.
 # mlflow-cratedb @ git+https://github.com/crate-workbench/mlflow-cratedb.git@main
diff --git a/topic/machine-learning/automl/test.py b/topic/machine-learning/automl/test.py
index 86879c9f..da39e931 100644
--- a/topic/machine-learning/automl/test.py
+++ b/topic/machine-learning/automl/test.py
@@ -22,8 +22,10 @@
 import pytest
 
 from cratedb_toolkit.util import DatabaseAdapter
-from pueblo.testing.folder import str_list, list_notebooks, list_python_files
-from pueblo.testing.snippet import pytest_notebook, pytest_module_function
+from pueblo.testing.folder import str_list, list_python_files
+from pueblo.testing.notebook import generate_tests
+from pueblo.testing.snippet import pytest_module_function
+from testbook import testbook
 
 HERE = Path(__file__).parent
 
@@ -57,15 +59,20 @@ def churn_dataset(cratedb):
     cratedb.run_sql("REFRESH TABLE pycaret_churn;")
 
 
-@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
-def test_notebook(request, notebook: str):
+def pytest_generate_tests(metafunc):
     """
-    From individual Jupyter Notebook file, collect cells as pytest
-    test cases, and run them.
+    Generate pytest test case per Jupyter Notebook.
+    """
+    here = Path(__file__).parent
+    generate_tests(metafunc, path=here)
+
 
-    Not using `NBRegressionFixture`, because it would manually need to be configured.
+def test_notebook(notebook):
+    """
+    Execute Jupyter Notebook, one test case per .ipynb file.
     """
-    pytest_notebook(request=request, filepath=notebook)
+    with testbook(notebook) as tb:
+        tb.execute()
 
 
 @pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE)))
diff --git a/topic/machine-learning/llm-langchain/conftest.py b/topic/machine-learning/llm-langchain/conftest.py
deleted file mode 100644
index 99fdcf49..00000000
--- a/topic/machine-learning/llm-langchain/conftest.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# Initialize nltk upfront, so that it does not run stray output into Jupyter Notebooks.
-from pueblo.testing.nlp import nltk_init
-
-# Make `pytest.exit()` called in notebook cells gracefully skip testing the whole notebook.
-from pueblo.testing.notebook import monkeypatch_pytest_notebook_treat_cell_exit_as_notebook_skip
-monkeypatch_pytest_notebook_treat_cell_exit_as_notebook_skip()
diff --git a/topic/machine-learning/llm-langchain/pyproject.toml b/topic/machine-learning/llm-langchain/pyproject.toml
index 1b01730d..9b168a22 100644
--- a/topic/machine-learning/llm-langchain/pyproject.toml
+++ b/topic/machine-learning/llm-langchain/pyproject.toml
@@ -19,24 +19,6 @@ xfail_strict = true
 markers = [
 ]
 
-# pytest-notebook settings
-nb_test_files = true
-nb_coverage = true
-nb_diff_replace = [
-    # Compensate output of `crash`.
-    '"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
-]
-# `vector_search.py` does not include any output(s).
-nb_diff_ignore = [
-    "/metadata/language_info",
-    "/cells/*/execution_count",
-    "/cells/*/outputs/*/execution_count",
-
-    # Do not compare details of cell outputs.
-    # It is impossible to maintain efficiently.
-    "/cells/*/outputs",
-]
-
 [tool.coverage.run]
 branch = false
 
diff --git a/topic/machine-learning/llm-langchain/requirements.txt b/topic/machine-learning/llm-langchain/requirements.txt
index 0fbfaad9..ea07017e 100644
--- a/topic/machine-learning/llm-langchain/requirements.txt
+++ b/topic/machine-learning/llm-langchain/requirements.txt
@@ -11,6 +11,7 @@ pydantic>=1,<3
 pypdf<5
 python-dotenv<2
 requests-cache<2
+sqlalchemy==2.*
 unstructured<0.12
 google-cloud-aiplatform
 langchain-google-vertexai
diff --git a/topic/machine-learning/llm-langchain/test.py b/topic/machine-learning/llm-langchain/test.py
index e1d81f35..e40e3097 100644
--- a/topic/machine-learning/llm-langchain/test.py
+++ b/topic/machine-learning/llm-langchain/test.py
@@ -5,8 +5,11 @@
 import pytest
 
 from cratedb_toolkit.io.sql import DatabaseAdapter
-from pueblo.testing.folder import str_list, list_notebooks, list_python_files
-from pueblo.testing.snippet import pytest_module_function, pytest_notebook
+from nbclient.exceptions import CellExecutionError
+from pueblo.testing.folder import str_list, list_python_files
+from pueblo.testing.notebook import generate_tests
+from pueblo.testing.snippet import pytest_module_function
+from testbook import testbook
 
 HERE = Path(__file__).parent
 
@@ -26,20 +29,32 @@ def reset_database(cratedb):
     time.sleep(0.01)
 
 
-@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
-def test_notebook(request, notebook: str):
+def pytest_generate_tests(metafunc):
     """
-    From individual Jupyter Notebook file, collect cells as pytest
-    test cases, and run them.
-
-    Not using `NBRegressionFixture`, because it would manually need to be configured.
+    Generate pytest test case per Jupyter Notebook.
     """
+    here = Path(__file__).parent
+    generate_tests(metafunc, path=here)
 
-    # Skip Vertex AI examples, because authenticating is more complicated.
-    if "vertexai" in str(notebook):
-        raise pytest.skip("Skipping Vertex AI due to lack of authentication")
-
-    pytest_notebook(request=request, filepath=notebook)
+
+def test_notebook(notebook):
+    """
+    Execute Jupyter Notebook, one test case per .ipynb file.
+    """
+    # Skip Vertex AI examples, because authenticating is more complicated.
+    if "vertexai" in notebook.name:
+        raise pytest.skip(f"Skipping Vertex AI due to lack of authentication: {notebook.name}")
+
+    with testbook(notebook) as tb:
+        try:
+            tb.execute()
+
+            # Skip notebook if `pytest.exit()` is invoked, usually by
+            # `getenvpass()`, when authentication token is not given.
+        except CellExecutionError as ex:
+            if "[skip-notebook]" not in str(ex):
+                raise
+            raise pytest.skip(str(ex))
 
 
 @pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE)))
diff --git a/topic/machine-learning/mlops-mlflow/pyproject.toml b/topic/machine-learning/mlops-mlflow/pyproject.toml
index 49643c74..4194d3ab 100644
--- a/topic/machine-learning/mlops-mlflow/pyproject.toml
+++ b/topic/machine-learning/mlops-mlflow/pyproject.toml
@@ -18,22 +18,6 @@ xfail_strict = true
 markers = [
 ]
 
-# pytest-notebook settings
-nb_test_files = true
-nb_coverage = true
-nb_diff_replace = [
-    # Compensate output of `crash`.
-    '"/cells/*/outputs/*/text" "\(\d.\d+ sec\)" "(0.000 sec)"',
-]
-# `vector_search.py` does not include any output(s).
-nb_diff_ignore = [
-    "/metadata/language_info",
-    "/cells/*/execution_count",
-    "/cells/*/outputs/*/execution_count",
-    # Ignore images.
-    "/cells/*/outputs/*/data/image/png",
-]
-
 [tool.coverage.run]
 branch = false
 
diff --git a/topic/machine-learning/mlops-mlflow/requirements.txt b/topic/machine-learning/mlops-mlflow/requirements.txt
index 49788362..9373cba4 100644
--- a/topic/machine-learning/mlops-mlflow/requirements.txt
+++ b/topic/machine-learning/mlops-mlflow/requirements.txt
@@ -4,6 +4,7 @@ distributed>=2024.4.1  # Python 3.11.9 breaks previous Dask
 mlflow-cratedb==2.11.3
 pydantic<3
 salesforce-merlion>=2,<3
+sqlalchemy==2.*
 
 # Development.
 # mlflow-cratedb @ git+https://github.com/crate-workbench/mlflow-cratedb.git@main
diff --git a/topic/machine-learning/mlops-mlflow/test.py b/topic/machine-learning/mlops-mlflow/test.py
index 3cc4dc5e..d930ee6a 100644
--- a/topic/machine-learning/mlops-mlflow/test.py
+++ b/topic/machine-learning/mlops-mlflow/test.py
@@ -3,8 +3,10 @@
 import pytest
 
 from cratedb_toolkit.util import DatabaseAdapter
-from pueblo.testing.folder import str_list, list_notebooks, list_python_files
-from pueblo.testing.snippet import pytest_module_function, pytest_notebook
+from pueblo.testing.folder import str_list, list_python_files
+from pueblo.testing.notebook import generate_tests
+from pueblo.testing.snippet import pytest_module_function
+from testbook import testbook
 
 HERE = Path(__file__).parent
 
@@ -22,15 +24,20 @@ def db_init(cratedb):
     cratedb.run_sql("DROP TABLE IF EXISTS machine_data;")
 
 
-@pytest.mark.parametrize("notebook", str_list(list_notebooks(HERE)))
-def test_notebook(request, notebook: str):
+def pytest_generate_tests(metafunc):
     """
-    From individual Jupyter Notebook file, collect cells as pytest
-    test cases, and run them.
+    Generate pytest test case per Jupyter Notebook.
+    """
+    here = Path(__file__).parent
+    generate_tests(metafunc, path=here)
+
 
-    Not using `NBRegressionFixture`, because it would manually need to be configured.
+def test_notebook(notebook):
+    """
+    Execute Jupyter Notebook, one test case per .ipynb file.
""" - pytest_notebook(request=request, filepath=notebook) + with testbook(notebook) as tb: + tb.execute() @pytest.mark.parametrize("pyfile", str_list(list_python_files(HERE)))