Skip to content

Commit

Permalink
Refactor dss commands exit codes and logging (#90)
Browse files: browse the repository at this point in the history.
* refactor create command
* refactor logs command
* refactor status and stop
* refactor remove command
* refactor initialize command
  • Loading branch information
NohaIhab authored Apr 24, 2024
1 parent 3e5719d commit 06224bc
Show file tree
Hide file tree
Showing 10 changed files with 270 additions and 178 deletions.
55 changes: 24 additions & 31 deletions src/dss/create_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,22 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None:
if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist(
lightkube_client
):
logger.error("Failed to create notebook. DSS was not correctly initialized.\n")
logger.info(
"You might want to run\n"
" dss status to check the current status\n"
" dss logs --all to review all logs\n"
" dss initialize to install dss\n"
)
return
logger.debug("Failed to create notebook. DSS was not correctly initialized.")
logger.error("Failed to create notebook. DSS was not correctly initialized.")
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(" dss logs --all to view all logs")
logger.info(" dss initialize to install dss")
raise RuntimeError()
if does_notebook_exist(name, DSS_NAMESPACE, lightkube_client):
# Assumes that the notebook server is exposed by a service of the same name.
logger.error(
f"Failed to create Notebook. Notebook with name '{name}' already exists.\n"
f"Please specify a different name."
)
logger.debug(f"Failed to create Notebook. Notebook with name '{name}' already exists.")
logger.error(f"Failed to create Notebook. Notebook with name '{name}' already exists.")
logger.info("Please specify a different name.")
url = get_service_url("name", DSS_NAMESPACE, lightkube_client)
if url:
logger.info(f"To connect to the existing notebook, go to {url}.")
return
raise RuntimeError()

manifests_file = Path(
Path(__file__).parent, MANIFEST_TEMPLATES_LOCATION, "notebook_deployment.yaml.j2"
Expand All @@ -85,30 +83,25 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None:

logger.info(f"Success: Notebook {name} created successfully.")
except ApiError as err:
logger.error(
f"Failed to create Notebook with error code {err.status.code}."
" Check the debug logs for more details."
)
logger.debug(f"Failed to create Notebook {name} with error {err}")
return
logger.debug(f"Failed to create Notebook {name}: {err}.", exc_info=True)
logger.error(f"Failed to create Notebook with error code {err.status.code}.")
logger.info(" Check the debug logs for more details.")
raise RuntimeError()
except TimeoutError as err:
logger.error(str(err))
logger.warn(
f"Timed out while trying to create Notebook {name}.\n"
"Some resources might be left in the cluster. Check the status with `dss list`."
)
return
logger.debug(f"Failed to create Notebook {name}: {err}", exc_info=True)
logger.error(f"Timed out while trying to create Notebook {name}.")
logger.warn(" Some resources might be left in the cluster.")
logger.info(" Check the status with `dss list`.")
raise RuntimeError()
except ImagePullBackOffError as err:
logger.error(
f"Timed out while trying to create Notebook {name}.\n"
f"Image {image_full_name} does not exist or is not accessible.\n"
)
logger.debug(f"Timed out while trying to create Notebook {name}: {err}.", exc_info=True)
logger.error(f"Timed out while trying to create Notebook {name}.")
logger.error(f"Image {image_full_name} does not exist or is not accessible.")
logger.info(
"Note: You might want to use some of these recommended images:\n"
f"{RECOMMENDED_IMAGES_MESSAGE}"
)
logger.debug(f"Timed out while trying to create Notebook {name} with error {err}.")
return
raise RuntimeError()
# Assumes that the notebook server is exposed by a service of the same name.
url = get_service_url(name, DSS_NAMESPACE, lightkube_client)
if url:
Expand Down
24 changes: 16 additions & 8 deletions src/dss/logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,12 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
if deployment.metadata.name == name:
break
else:
logger.debug(
f"Failed to retrieve logs. Deployment '{name}' does not exist in {DSS_NAMESPACE} namespace." # noqa E501
)
logger.error(f"Failed to retrieve logs. Notebook '{name}' does not exist.")
logger.info("Run 'dss list' to check all notebooks.")
return
raise RuntimeError()
pods = lightkube_client.list(
Pod, namespace=DSS_NAMESPACE, labels=deployment.spec.selector.matchLabels
)
Expand All @@ -46,15 +49,16 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
namespace=DSS_NAMESPACE,
labels=mlflow_deployment.spec.selector.matchLabels,
)
except ApiError:
except ApiError as e:
logger.debug(f"Failed to retrieve logs for MLflow: {e}", exc_info=True)
logger.error(
"Failed to retrieve logs. MLflow seems to be not present.Make sure DSS is correctly initialized." # noqa: E501
"Failed to retrieve logs. MLflow seems to be not present. Make sure DSS is correctly initialized." # noqa: E501
)
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(" dss logs --all to view all logs")
logger.info(" dss initialize to install dss")
return
raise RuntimeError()
elif parts == "all":
deployments = list(lightkube_client.list(Deployment, namespace=DSS_NAMESPACE))
pods = []
Expand All @@ -65,18 +69,19 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
)
)
except ApiError as e:
logger.error(e)
logger.debug(f"Failed to retrieve logs for {parts} {name}: {e}", exc_info=True)
logger.error(
f"Failed to retrieve logs for {parts} {name}. Make sure DSS is correctly initialized." # noqa: E501
)
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(" dss initialize to install dss")
return
raise RuntimeError()

if not pods:
logger.debug(f"Failed to retrieve logs. No pods found for {parts} {name}.")
logger.error(f"Failed to retrieve logs. No pods found for {parts} {name}.")
return
raise RuntimeError()

for pod in pods:
# Retrieve logs from the pod
Expand All @@ -87,7 +92,10 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
line = line.rstrip("\n")
logger.info(line)
except ApiError as e:
logger.error(e)
logger.debug(
f"Failed to retrieve logs for pod {pod.metadata.name}: {e}", exc_info=True
)
logger.error(
f"Failed to retrieve logs. There was a problem while getting the logs for {pod.metadata.name}" # noqa: E501
)
raise RuntimeError()
88 changes: 61 additions & 27 deletions src/dss/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import click
from lightkube.core.exceptions import ApiError

from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE
from dss.create_notebook import create_notebook
Expand Down Expand Up @@ -34,10 +33,17 @@ def initialize_command(kubeconfig: str) -> None:
"""
logger.info("Executing initialize command")

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

initialize(lightkube_client=lightkube_client)
initialize(lightkube_client=lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to initialize dss: {e}.", exc_info=True)
logger.error(f"Failed to initialize dss: {str(e)}.")
click.get_current_context().exit(1)


IMAGE_OPTION_HELP = "\b\nThe image used for the notebook server.\n"
Expand Down Expand Up @@ -72,10 +78,17 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None:
" For more information on using a specific image, see dss create --help."
)

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

create_notebook(name=name, image=image, lightkube_client=lightkube_client)
create_notebook(name=name, image=image, lightkube_client=lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to create notebook {name}: {e}.", exc_info=True)
logger.error(f"Failed to create notebook {name}: {str(e)}.")
click.get_current_context().exit(1)


create_notebook_command.help += f"""
Expand Down Expand Up @@ -112,15 +125,22 @@ def logs_command(kubeconfig: str, notebook_name: str, print_all: bool, mlflow: b
)
return

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

if print_all:
get_logs("all", None, lightkube_client)
elif mlflow:
get_logs("mlflow", None, lightkube_client)
elif notebook_name:
get_logs("notebooks", notebook_name, lightkube_client)
if print_all:
get_logs("all", None, lightkube_client)
elif mlflow:
get_logs("mlflow", None, lightkube_client)
elif notebook_name:
get_logs("notebooks", notebook_name, lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to retrieve logs: {e}.", exc_info=True)
logger.error(f"Failed to retrieve logs: {str(e)}.")
click.get_current_context().exit(1)


@main.command(name="status")
Expand All @@ -130,10 +150,17 @@ def logs_command(kubeconfig: str, notebook_name: str, print_all: bool, mlflow: b
)
def status_command(kubeconfig: str) -> None:
"""Checks the status of key components within the DSS environment. Verifies if the MLflow deployment is ready and checks if GPU acceleration is enabled on the Kubernetes cluster by examining the labels of Kubernetes nodes for NVIDIA or Intel GPU devices.""" # noqa E501
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

get_status(lightkube_client)
get_status(lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to retrieve status: {e}.", exc_info=True)
logger.error(f"Failed to retrieve status: {str(e)}.")
click.get_current_context().exit(1)


@main.command(name="list")
Expand Down Expand Up @@ -178,13 +205,16 @@ def stop_notebook_command(kubeconfig: str, notebook_name: str):
Example:
dss stop my-notebook
"""
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
stop_notebook(name=notebook_name, lightkube_client=lightkube_client)
except (RuntimeError, ApiError):
exit(1)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to stop notebook: {e}.", exc_info=True)
logger.error(f"Failed to stop notebook: {str(e)}.")
click.get_current_context().exit(1)


# FIXME: remove the `--kubeconfig` option
Expand Down Expand Up @@ -237,13 +267,17 @@ def remove_notebook_command(name: str, kubeconfig: str):
"""
logger.info("Executing remove command")

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

remove_notebook(name=name, lightkube_client=lightkube_client)
except RuntimeError:
exit(1)
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to remove notebook: {e}.", exc_info=True)
logger.error(f"Failed to remove notebook: {str(e)}.")
click.get_current_context().exit(1)


@main.command(name="purge")
Expand Down
11 changes: 6 additions & 5 deletions src/dss/remove_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ def remove_notebook(name: str, lightkube_client: Client) -> None:
if not does_notebook_exist(
name=name, namespace=DSS_NAMESPACE, lightkube_client=lightkube_client
):
logger.error(
f"Failed to remove Notebook. Notebook {name} does not exist. Run 'dss list' to check all notebooks." # noqa E501
)
raise RuntimeError("Failed to remove Notebook not found.")
logger.debug(f"Failed to remove Notebook. Notebook {name} does not exist.")
logger.error(f"Failed to remove Notebook. Notebook {name} does not exist.") # noqa E501
logger.info("Run 'dss list' to check all notebooks.")
raise RuntimeError()

# From this point forward we know either one or both
# resources (Deployment, Service) exist for the Notebook.
Expand All @@ -49,11 +49,12 @@ def remove_notebook(name: str, lightkube_client: Client) -> None:
exceptions.append(err)

if exceptions:
logger.debug(f"Failed to remove notebook {name}: {exceptions}")
logger.error(f"Failed to remove notebook {name}. Please try again.")
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(f" dss logs {name} to review the notebook logs")
raise RuntimeError(f"Failed to remove notebook {name} with errors", exceptions)
raise RuntimeError()
else:
logger.info(
f"Removing the notebook {name}. Check `dss list` for the status of the notebook."
Expand Down
59 changes: 30 additions & 29 deletions src/dss/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,34 +15,35 @@ def get_status(lightkube_client: Client) -> None:
Args:
lightkube_client (Client): The Kubernetes client.
"""
# Check MLflow deployment
mlflow_ready = does_mlflow_deployment_exist(lightkube_client)

# Log MLflow deployment status and URL
if mlflow_ready:
mlflow_url = get_service_url(MLFLOW_DEPLOYMENT_NAME, DSS_NAMESPACE, lightkube_client)
logger.info("MLflow deployment: Ready")
logger.info(f"MLflow URL: {mlflow_url}")
else:
logger.info("MLflow deployment: Not ready")

# Check NVIDIA GPU acceleration
gpu_acceleration = False
try:
# Check MLflow deployment
mlflow_ready = does_mlflow_deployment_exist(lightkube_client)

# Log MLflow deployment status and URL
if mlflow_ready:
mlflow_url = get_service_url(MLFLOW_DEPLOYMENT_NAME, DSS_NAMESPACE, lightkube_client)
logger.info("MLflow deployment: Ready")
logger.info(f"MLflow URL: {mlflow_url}")
else:
logger.info("MLflow deployment: Not ready")

# Check NVIDIA GPU acceleration
gpu_acceleration = False
node_labels = get_labels_for_node(lightkube_client)
if (
"nvidia.com/gpu.present" in node_labels
and "nvidia.com/gpu.deploy.container-toolkit" in node_labels
and "nvidia.com/gpu.deploy.device-plugin" in node_labels
):
gpu_acceleration = True
card_name = node_labels.get("nvidia.com/gpu.product", "NVIDIA GPU")

# Log GPU status
if gpu_acceleration:
logger.info(f"GPU acceleration: Enabled ({card_name})")
else:
logger.info("GPU acceleration: Disabled")
except Exception as e:
logger.error(f"Failed to retrieve status: {e}")
return
except ValueError as e:
logger.debug(f"Failed to get labels for nodes: {e}.", exc_info=True)
logger.error(f"Failed to retrieve status: {e}.")
raise RuntimeError()
if (
"nvidia.com/gpu.present" in node_labels
and "nvidia.com/gpu.deploy.container-toolkit" in node_labels
and "nvidia.com/gpu.deploy.device-plugin" in node_labels
):
gpu_acceleration = True
card_name = node_labels.get("nvidia.com/gpu.product", "NVIDIA GPU")

# Log GPU status
if gpu_acceleration:
logger.info(f"GPU acceleration: Enabled ({card_name})")
else:
logger.info("GPU acceleration: Disabled")
Loading

0 comments on commit 06224bc

Please sign in to comment.