Skip to content

Commit

Permalink
Docs update / fail-check for #33
Browse files Browse the repository at this point in the history
  • Loading branch information
marrrcin committed Nov 22, 2022
1 parent 03788f8 commit 17fbb99
Show file tree
Hide file tree
Showing 6 changed files with 147 additions and 24 deletions.
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
# "sphinx.ext.mathjax",
"recommonmark",
"sphinx_rtd_theme",
"sphinx_toolbox.collapse",
]

# Add any paths that contain templates here, relative to this directory.
Expand Down
1 change: 1 addition & 0 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
sphinx
sphinx_rtd_theme
sphinx-toolbox
recommonmark
23 changes: 23 additions & 0 deletions docs/source/03_quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,20 @@ which means that all of the files (including potentially sensitive ones!) will b

Ensure ``code_directory: "."`` is set in the ``azureml.yml`` config file (it's set by default).


.. collapse:: See example Dockerfile for code upload flow

.. code-block:: dockerfile
ARG BASE_IMAGE=python:3.9
FROM $BASE_IMAGE
# install project requirements
COPY src/requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt && rm -f /tmp/requirements.txt
\

\Build the image:

.. code:: console
Expand All @@ -142,6 +156,15 @@ Ensure ``code_directory: "."`` is set in the ``azureml.yml`` config file (it's s
\
Now you can re-use this environment and run the pipeline without the need to build the docker image again (unless you add some dependencies to your environment, obviously :-) ).

.. warning::
| Azure Code upload feature has issues with empty folders as identified in `GitHub #33 <https://github.com/getindata/kedro-azureml/issues/33>`__, where empty folders or folders with empty files might not get uploaded to Azure ML, which might result in the failing pipeline.
| We recommend to:
| - make sure that Kedro environments you intent to use in Azure have at least one non-empty file specified
| - gracefully handle folder creation in your pipeline's code (e.g. if your code depends on an existence of some folder)
|
| The plugin will do it's best to handle some of the edge-cases, but the fact that some of your files might not be captured by Azure ML SDK is out of our reach.

9.2. **If using docker image flow** (shown in the Quickstart video)

.. note::
Expand Down
28 changes: 5 additions & 23 deletions kedro_azureml/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from kedro_azureml.cli_functions import (
get_context_and_pipeline,
parse_extra_params,
verify_configuration_directory_for_azure,
warn_about_ignore_files,
)
from kedro_azureml.client import AzureMLPipelinesClient
from kedro_azureml.config import CONFIG_TEMPLATE_YAML
Expand Down Expand Up @@ -168,29 +170,9 @@ def run(
if aml_env:
click.echo(f"Overriding Azure ML Environment for run by: {aml_env}")

aml_ignore = Path.cwd().joinpath(".amlignore")
git_ignore = Path.cwd().joinpath(".gitignore")
if aml_ignore.exists():
ignore_contents = aml_ignore.read_text().strip()
if not ignore_contents:
click.echo(
click.style(
f".amlignore file is empty, which means all of the files from {Path.cwd()}"
"\nwill be uploaded to Azure ML. Make sure that you excluded sensitive files first!",
fg="yellow",
)
)
elif git_ignore.exists():
ignore_contents = git_ignore.read_text().strip()
if ignore_contents:
click.echo(
click.style(
".gitignore file detected, ignored files will not be uploaded to Azure ML"
"\nWe recommend to use .amlignore instead of .gitignore when working with Azure ML"
"\nSee https://github.com/MicrosoftDocs/azure-docs/blob/047cb7f625920183438f3e66472014ac2ebab098/includes/machine-learning-amlignore-gitignore.md", # noqa
fg="yellow",
)
)
warn_about_ignore_files()

verify_configuration_directory_for_azure(click_context, ctx)

mgr: KedroContextManager
with get_context_and_pipeline(ctx, image, pipeline, params, aml_env) as (
Expand Down
69 changes: 69 additions & 0 deletions kedro_azureml/cli_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
from contextlib import contextmanager
from pathlib import Path
from typing import Optional

import click
Expand Down Expand Up @@ -60,3 +61,71 @@ def parse_extra_params(params, silent=False):
else:
parameters = None
return parameters


def warn_about_ignore_files():
aml_ignore = Path.cwd().joinpath(".amlignore")
git_ignore = Path.cwd().joinpath(".gitignore")
if aml_ignore.exists():
ignore_contents = aml_ignore.read_text().strip()
if not ignore_contents:
click.echo(
click.style(
f".amlignore file is empty, which means all of the files from {Path.cwd()}"
"\nwill be uploaded to Azure ML. Make sure that you excluded sensitive files first!",
fg="yellow",
)
)
elif git_ignore.exists():
ignore_contents = git_ignore.read_text().strip()
if ignore_contents:
click.echo(
click.style(
".gitignore file detected, ignored files will not be uploaded to Azure ML"
"\nWe recommend to use .amlignore instead of .gitignore when working with Azure ML"
"\nSee https://github.com/MicrosoftDocs/azure-docs/blob/047cb7f625920183438f3e66472014ac2ebab098/includes/machine-learning-amlignore-gitignore.md", # noqa
# noqa
fg="yellow",
)
)


def verify_configuration_directory_for_azure(click_context, ctx: CliContext):
"""
Checks whether the Kedro conf directory of used environment contains only empty files or is empty.
If it is, prompts the user to continue or exit, as continuing might break execution in Azure ML.
:param click_context:
:param ctx:
:return:
"""
conf_dir = Path.cwd().joinpath(f"conf/{ctx.env}")

exists = conf_dir.exists() and conf_dir.is_dir()
is_empty = True
has_only_empty_files = True

if exists:
for p in conf_dir.iterdir():
is_empty = False
if p.is_file():
has_only_empty_files = p.lstat().st_size == 0

if not has_only_empty_files:
break

msg = f"Configuration folder for your Kedro environment {conf_dir} "
if not exists:
msg += "does not exist or is not a directory,"
if is_empty:
msg += "is empty,"
elif has_only_empty_files:
msg += "contains only empty files,"

if is_empty or has_only_empty_files:
msg += (
"\nwhich might cause issues when running in Azure ML."
+ "\nEither use different environment or provide non-empty configuration for your env."
+ "\nContinue?"
)
if not click.confirm(click.style(msg, fg="yellow")):
click_context.exit(2)
49 changes: 48 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from pathlib import Path
from unittest import mock
from unittest.mock import patch
from unittest.mock import MagicMock, patch
from uuid import uuid4

import pytest
Expand All @@ -13,6 +13,7 @@
from kedro_azureml.config import KedroAzureMLConfig
from kedro_azureml.constants import KEDRO_AZURE_RUNNER_DATASET_TIMEOUT
from kedro_azureml.generator import AzureMLPipelineGenerator
from kedro_azureml.utils import CliContext
from tests.utils import create_kedro_conf_dirs


Expand Down Expand Up @@ -232,6 +233,52 @@ def test_can_invoke_run(
interactive_credentials.assert_not_called()


@pytest.mark.parametrize(
"kedro_environment_name",
("empty", "non_existing", "gitkeep", "nested"),
)
@pytest.mark.parametrize("confirm", (True, False))
def test_run_is_interrupted_if_used_on_empty_env(
confirm,
patched_kedro_package,
cli_context,
dummy_pipeline,
tmp_path: Path,
kedro_environment_name: str,
):
metadata = MagicMock()
metadata.package_name = "tests"
cli_context = CliContext(env=kedro_environment_name, metadata=metadata)

create_kedro_conf_dirs(tmp_path) # for base env

# setup Kedro env to handle test case
cfg_path = tmp_path / "conf" / kedro_environment_name
if kedro_environment_name == "empty":
cfg_path.mkdir(parents=True)
elif kedro_environment_name == "gitkeep":
cfg_path.mkdir(parents=True)
(cfg_path / ".gitkeep").touch()
elif kedro_environment_name == "nested":
(cfg_path / "nested2").mkdir(parents=True)
else:
pass # nothing to do for non_existing environment, do not remove this empty block

with patch.dict(
"kedro.framework.project.pipelines", {"__default__": dummy_pipeline}
), patch.object(Path, "cwd", return_value=tmp_path), patch.dict(
os.environ, {"AZURE_STORAGE_ACCOUNT_KEY": "dummy_key"}
), patch(
"click.confirm", return_value=confirm
) as click_confirm:
runner = CliRunner()
result = runner.invoke(cli.run, ["-s", "subscription_id"], obj=cli_context)
assert result.exit_code == (
1 if confirm else 2
), "run should have exited with code: 1 if confirmed, 2 if stopped"
click_confirm.assert_called_once()


def test_can_invoke_run_with_failed_pipeline(
patched_kedro_package,
cli_context,
Expand Down

0 comments on commit 17fbb99

Please sign in to comment.