Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor dss commands exit codes and logging #90

Merged
merged 9 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 24 additions & 31 deletions src/dss/create_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,22 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None:
if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist(
lightkube_client
):
logger.error("Failed to create notebook. DSS was not correctly initialized.\n")
logger.info(
"You might want to run\n"
" dss status to check the current status\n"
" dss logs --all to review all logs\n"
" dss initialize to install dss\n"
)
return
logger.debug("Failed to create notebook. DSS was not correctly initialized.")
logger.error("Failed to create notebook. DSS was not correctly initialized.")
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(" dss logs --all to view all logs")
logger.info(" dss initialize to install dss")
raise RuntimeError()
if does_notebook_exist(name, DSS_NAMESPACE, lightkube_client):
# Assumes that the notebook server is exposed by a service of the same name.
logger.error(
f"Failed to create Notebook. Notebook with name '{name}' already exists.\n"
f"Please specify a different name."
)
logger.debug(f"Failed to create Notebook. Notebook with name '{name}' already exists.")
logger.error(f"Failed to create Notebook. Notebook with name '{name}' already exists.")
logger.info("Please specify a different name.")
url = get_service_url("name", DSS_NAMESPACE, lightkube_client)
if url:
logger.info(f"To connect to the existing notebook, go to {url}.")
return
raise RuntimeError()

manifests_file = Path(
Path(__file__).parent, MANIFEST_TEMPLATES_LOCATION, "notebook_deployment.yaml.j2"
Expand All @@ -85,30 +83,25 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None:

logger.info(f"Success: Notebook {name} created successfully.")
except ApiError as err:
logger.error(
f"Failed to create Notebook with error code {err.status.code}."
" Check the debug logs for more details."
)
logger.debug(f"Failed to create Notebook {name} with error {err}")
return
logger.debug(f"Failed to create Notebook {name}: {err}.", exc_info=True)
logger.error(f"Failed to create Notebook with error code {err.status.code}.")
logger.info(" Check the debug logs for more details.")
raise RuntimeError()
except TimeoutError as err:
logger.error(str(err))
logger.warn(
f"Timed out while trying to create Notebook {name}.\n"
"Some resources might be left in the cluster. Check the status with `dss list`."
)
return
logger.debug(f"Failed to create Notebook {name}: {err}", exc_info=True)
logger.error(f"Timed out while trying to create Notebook {name}.")
logger.warn(" Some resources might be left in the cluster.")
logger.info(" Check the status with `dss list`.")
raise RuntimeError()
except ImagePullBackOffError as err:
logger.error(
f"Timed out while trying to create Notebook {name}.\n"
f"Image {image_full_name} does not exist or is not accessible.\n"
)
logger.debug(f"Timed out while trying to create Notebook {name}: {err}.", exc_info=True)
logger.error(f"Timed out while trying to create Notebook {name}.")
logger.error(f"Image {image_full_name} does not exist or is not accessible.")
logger.info(
"Note: You might want to use some of these recommended images:\n"
f"{RECOMMENDED_IMAGES_MESSAGE}"
)
logger.debug(f"Timed out while trying to create Notebook {name} with error {err}.")
return
raise RuntimeError()
# Assumes that the notebook server is exposed by a service of the same name.
url = get_service_url(name, DSS_NAMESPACE, lightkube_client)
if url:
Expand Down
24 changes: 16 additions & 8 deletions src/dss/logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,12 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
if deployment.metadata.name == name:
break
else:
logger.debug(
f"Failed to retrieve logs. Deployment '{name}' does not exist in {DSS_NAMESPACE} namespace." # noqa E501
)
logger.error(f"Failed to retrieve logs. Notebook '{name}' does not exist.")
logger.info("Run 'dss list' to check all notebooks.")
return
raise RuntimeError()
pods = lightkube_client.list(
Pod, namespace=DSS_NAMESPACE, labels=deployment.spec.selector.matchLabels
)
Expand All @@ -46,15 +49,16 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
namespace=DSS_NAMESPACE,
labels=mlflow_deployment.spec.selector.matchLabels,
)
except ApiError:
except ApiError as e:
logger.debug(f"Failed to retrieve logs for MLflow: {e}", exc_info=True)
logger.error(
"Failed to retrieve logs. MLflow seems to be not present.Make sure DSS is correctly initialized." # noqa: E501
"Failed to retrieve logs. MLflow seems to be not present. Make sure DSS is correctly initialized." # noqa: E501
)
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(" dss logs --all to view all logs")
logger.info(" dss initialize to install dss")
return
raise RuntimeError()
elif parts == "all":
deployments = list(lightkube_client.list(Deployment, namespace=DSS_NAMESPACE))
pods = []
Expand All @@ -65,18 +69,19 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
)
)
except ApiError as e:
logger.error(e)
logger.debug(f"Failed to retrieve logs for {parts} {name}: {e}", exc_info=True)
logger.error(
f"Failed to retrieve logs for {parts} {name}. Make sure DSS is correctly initialized." # noqa: E501
)
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(" dss initialize to install dss")
return
raise RuntimeError()

if not pods:
logger.debug(f"Failed to retrieve logs. No pods found for {parts} {name}.")
logger.error(f"Failed to retrieve logs. No pods found for {parts} {name}.")
return
raise RuntimeError()

for pod in pods:
# Retrieve logs from the pod
Expand All @@ -87,7 +92,10 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
line = line.rstrip("\n")
logger.info(line)
except ApiError as e:
logger.error(e)
logger.debug(
f"Failed to retrieve logs for pod {pod.metadata.name}: {e}", exc_info=True
)
logger.error(
f"Failed to retrieve logs. There was a problem while getting the logs for {pod.metadata.name}" # noqa: E501
)
raise RuntimeError()
88 changes: 61 additions & 27 deletions src/dss/main.py
misohu marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import click
from lightkube.core.exceptions import ApiError

from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE
from dss.create_notebook import create_notebook
Expand Down Expand Up @@ -34,10 +33,17 @@ def initialize_command(kubeconfig: str) -> None:
"""
logger.info("Executing initialize command")

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

initialize(lightkube_client=lightkube_client)
initialize(lightkube_client=lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to initialize dss: {e}.", exc_info=True)
logger.error(f"Failed to initialize dss: {str(e)}.")
click.get_current_context().exit(1)


IMAGE_OPTION_HELP = "\b\nThe image used for the notebook server.\n"
Expand Down Expand Up @@ -72,10 +78,17 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None:
" For more information on using a specific image, see dss create --help."
)

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

create_notebook(name=name, image=image, lightkube_client=lightkube_client)
create_notebook(name=name, image=image, lightkube_client=lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to create notebook {name}: {e}.", exc_info=True)
logger.error(f"Failed to create notebook {name}: {str(e)}.")
click.get_current_context().exit(1)


create_notebook_command.help += f"""
Expand Down Expand Up @@ -112,15 +125,22 @@ def logs_command(kubeconfig: str, notebook_name: str, print_all: bool, mlflow: b
)
return

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

if print_all:
get_logs("all", None, lightkube_client)
elif mlflow:
get_logs("mlflow", None, lightkube_client)
elif notebook_name:
get_logs("notebooks", notebook_name, lightkube_client)
if print_all:
get_logs("all", None, lightkube_client)
elif mlflow:
get_logs("mlflow", None, lightkube_client)
elif notebook_name:
get_logs("notebooks", notebook_name, lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to retrieve logs: {e}.", exc_info=True)
logger.error(f"Failed to retrieve logs: {str(e)}.")
click.get_current_context().exit(1)


@main.command(name="status")
Expand All @@ -130,10 +150,17 @@ def logs_command(kubeconfig: str, notebook_name: str, print_all: bool, mlflow: b
)
def status_command(kubeconfig: str) -> None:
"""Checks the status of key components within the DSS environment. Verifies if the MLflow deployment is ready and checks if GPU acceleration is enabled on the Kubernetes cluster by examining the labels of Kubernetes nodes for NVIDIA or Intel GPU devices.""" # noqa E501
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

get_status(lightkube_client)
get_status(lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
misohu marked this conversation as resolved.
Show resolved Hide resolved
except Exception as e:
logger.debug(f"Failed to retrieve status: {e}.", exc_info=True)
logger.error(f"Failed to retrieve status: {str(e)}.")
click.get_current_context().exit(1)


@main.command(name="list")
Expand Down Expand Up @@ -178,13 +205,16 @@ def stop_notebook_command(kubeconfig: str, notebook_name: str):
Example:
dss stop my-notebook
"""
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
stop_notebook(name=notebook_name, lightkube_client=lightkube_client)
except (RuntimeError, ApiError):
exit(1)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to stop notebook: {e}.", exc_info=True)
logger.error(f"Failed to stop notebook: {str(e)}.")
click.get_current_context().exit(1)


# FIXME: remove the `--kubeconfig` option
Expand Down Expand Up @@ -237,13 +267,17 @@ def remove_notebook_command(name: str, kubeconfig: str):
"""
logger.info("Executing remove command")

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

remove_notebook(name=name, lightkube_client=lightkube_client)
except RuntimeError:
exit(1)
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to remove notebook: {e}.", exc_info=True)
logger.error(f"Failed to remove notebook: {str(e)}.")
click.get_current_context().exit(1)


@main.command(name="purge")
Expand Down
11 changes: 6 additions & 5 deletions src/dss/remove_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ def remove_notebook(name: str, lightkube_client: Client) -> None:
if not does_notebook_exist(
name=name, namespace=DSS_NAMESPACE, lightkube_client=lightkube_client
):
logger.error(
f"Failed to remove Notebook. Notebook {name} does not exist. Run 'dss list' to check all notebooks." # noqa E501
)
raise RuntimeError("Failed to remove Notebook not found.")
logger.debug(f"Failed to remove Notebook. Notebook {name} does not exist.")
logger.error(f"Failed to remove Notebook. Notebook {name} does not exist.") # noqa E501
logger.info("Run 'dss list' to check all notebooks.")
raise RuntimeError()

# From this point forward we know either one or both
# resources (Deployment, Service) exist for the Notebook.
Expand All @@ -49,11 +49,12 @@ def remove_notebook(name: str, lightkube_client: Client) -> None:
exceptions.append(err)

if exceptions:
logger.debug(f"Failed to remove notebook {name}: {exceptions}")
logger.error(f"Failed to remove notebook {name}. Please try again.")
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(f" dss logs {name} to review the notebook logs")
raise RuntimeError(f"Failed to remove notebook {name} with errors", exceptions)
raise RuntimeError()
else:
logger.info(
f"Removing the notebook {name}. Check `dss list` for the status of the notebook."
Expand Down
59 changes: 30 additions & 29 deletions src/dss/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,34 +15,35 @@ def get_status(lightkube_client: Client) -> None:
Args:
lightkube_client (Client): The Kubernetes client.
"""
# Check MLflow deployment
mlflow_ready = does_mlflow_deployment_exist(lightkube_client)

# Log MLflow deployment status and URL
if mlflow_ready:
mlflow_url = get_service_url(MLFLOW_DEPLOYMENT_NAME, DSS_NAMESPACE, lightkube_client)
logger.info("MLflow deployment: Ready")
logger.info(f"MLflow URL: {mlflow_url}")
else:
logger.info("MLflow deployment: Not ready")

# Check NVIDIA GPU acceleration
gpu_acceleration = False
try:
# Check MLflow deployment
mlflow_ready = does_mlflow_deployment_exist(lightkube_client)

# Log MLflow deployment status and URL
if mlflow_ready:
mlflow_url = get_service_url(MLFLOW_DEPLOYMENT_NAME, DSS_NAMESPACE, lightkube_client)
logger.info("MLflow deployment: Ready")
logger.info(f"MLflow URL: {mlflow_url}")
else:
logger.info("MLflow deployment: Not ready")

# Check NVIDIA GPU acceleration
gpu_acceleration = False
node_labels = get_labels_for_node(lightkube_client)
if (
"nvidia.com/gpu.present" in node_labels
and "nvidia.com/gpu.deploy.container-toolkit" in node_labels
and "nvidia.com/gpu.deploy.device-plugin" in node_labels
):
gpu_acceleration = True
card_name = node_labels.get("nvidia.com/gpu.product", "NVIDIA GPU")

# Log GPU status
if gpu_acceleration:
logger.info(f"GPU acceleration: Enabled ({card_name})")
else:
logger.info("GPU acceleration: Disabled")
except Exception as e:
logger.error(f"Failed to retrieve status: {e}")
return
except ValueError as e:
logger.debug(f"Failed to get labels for nodes: {e}.", exc_info=True)
logger.error(f"Failed to retrieve status: {e}.")
raise RuntimeError()
if (
"nvidia.com/gpu.present" in node_labels
and "nvidia.com/gpu.deploy.container-toolkit" in node_labels
and "nvidia.com/gpu.deploy.device-plugin" in node_labels
):
gpu_acceleration = True
card_name = node_labels.get("nvidia.com/gpu.product", "NVIDIA GPU")

# Log GPU status
if gpu_acceleration:
logger.info(f"GPU acceleration: Enabled ({card_name})")
else:
logger.info("GPU acceleration: Disabled")
Loading
Loading