Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor dss commands exit codes and logging #90

Merged
merged 9 commits on Apr 24, 2024
Merged
55 changes: 24 additions & 31 deletions src/dss/create_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,24 +42,22 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None:
if not does_dss_pvc_exist(lightkube_client) or not does_mlflow_deployment_exist(
lightkube_client
):
logger.error("Failed to create notebook. DSS was not correctly initialized.\n")
logger.info(
"You might want to run\n"
" dss status to check the current status\n"
" dss logs --all to review all logs\n"
" dss initialize to install dss\n"
)
return
logger.debug("Failed to create notebook. DSS was not correctly initialized.")
logger.error("Failed to create notebook. DSS was not correctly initialized.")
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(" dss logs --all to view all logs")
logger.info(" dss initialize to install dss")
raise RuntimeError()
if does_notebook_exist(name, DSS_NAMESPACE, lightkube_client):
# Assumes that the notebook server is exposed by a service of the same name.
logger.error(
f"Failed to create Notebook. Notebook with name '{name}' already exists.\n"
f"Please specify a different name."
)
logger.debug(f"Failed to create Notebook. Notebook with name '{name}' already exists.")
logger.error(f"Failed to create Notebook. Notebook with name '{name}' already exists.")
logger.info("Please specify a different name.")
url = get_service_url("name", DSS_NAMESPACE, lightkube_client)
if url:
logger.info(f"To connect to the existing notebook, go to {url}.")
return
raise RuntimeError()

manifests_file = Path(
Path(__file__).parent, MANIFEST_TEMPLATES_LOCATION, "notebook_deployment.yaml.j2"
Expand All @@ -85,30 +83,25 @@ def create_notebook(name: str, image: str, lightkube_client: Client) -> None:

logger.info(f"Success: Notebook {name} created successfully.")
except ApiError as err:
logger.error(
f"Failed to create Notebook with error code {err.status.code}."
" Check the debug logs for more details."
)
logger.debug(f"Failed to create Notebook {name} with error {err}")
return
logger.debug(f"Failed to create Notebook {name}: {err}.", exc_info=True)
logger.error(f"Failed to create Notebook with error code {err.status.code}.")
logger.info(" Check the debug logs for more details.")
raise RuntimeError()
except TimeoutError as err:
logger.error(str(err))
logger.warn(
f"Timed out while trying to create Notebook {name}.\n"
"Some resources might be left in the cluster. Check the status with `dss list`."
)
return
logger.debug(f"Failed to create Notebook {name}: {err}", exc_info=True)
logger.error(f"Timed out while trying to create Notebook {name}.")
logger.warn(" Some resources might be left in the cluster.")
logger.info(" Check the status with `dss list`.")
raise RuntimeError()
except ImagePullBackOffError as err:
logger.error(
f"Timed out while trying to create Notebook {name}.\n"
f"Image {image_full_name} does not exist or is not accessible.\n"
)
logger.debug(f"Timed out while trying to create Notebook {name}: {err}.", exc_info=True)
logger.error(f"Timed out while trying to create Notebook {name}.")
logger.error(f"Image {image_full_name} does not exist or is not accessible.")
logger.info(
"Note: You might want to use some of these recommended images:\n"
f"{RECOMMENDED_IMAGES_MESSAGE}"
)
logger.debug(f"Timed out while trying to create Notebook {name} with error {err}.")
return
raise RuntimeError()
# Assumes that the notebook server is exposed by a service of the same name.
url = get_service_url(name, DSS_NAMESPACE, lightkube_client)
if url:
Expand Down
24 changes: 16 additions & 8 deletions src/dss/logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,12 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
if deployment.metadata.name == name:
break
else:
logger.debug(
f"Failed to retrieve logs. Deployment '{name}' does not exist in {DSS_NAMESPACE} namespace." # noqa E501
)
logger.error(f"Failed to retrieve logs. Notebook '{name}' does not exist.")
logger.info("Run 'dss list' to check all notebooks.")
return
raise RuntimeError()
pods = lightkube_client.list(
Pod, namespace=DSS_NAMESPACE, labels=deployment.spec.selector.matchLabels
)
Expand All @@ -46,15 +49,16 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
namespace=DSS_NAMESPACE,
labels=mlflow_deployment.spec.selector.matchLabels,
)
except ApiError:
except ApiError as e:
logger.debug(f"Failed to retrieve logs for MLflow: {e}")
misohu marked this conversation as resolved.
Show resolved Hide resolved
logger.error(
"Failed to retrieve logs. MLflow seems to be not present.Make sure DSS is correctly initialized." # noqa: E501
"Failed to retrieve logs. MLflow seems to be not present. Make sure DSS is correctly initialized." # noqa: E501
)
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(" dss logs --all to view all logs")
logger.info(" dss initialize to install dss")
return
raise RuntimeError()
elif parts == "all":
deployments = list(lightkube_client.list(Deployment, namespace=DSS_NAMESPACE))
pods = []
Expand All @@ -65,18 +69,19 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
)
)
except ApiError as e:
logger.error(e)
logger.debug(f"Failed to retrieve logs for {parts} {name}: {e}", exc_info=True)
logger.error(
f"Failed to retrieve logs for {parts} {name}. Make sure DSS is correctly initialized." # noqa: E501
)
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(" dss initialize to install dss")
return
raise RuntimeError()

if not pods:
logger.debug(f"Failed to retrieve logs. No pods found for {parts} {name}.")
logger.error(f"Failed to retrieve logs. No pods found for {parts} {name}.")
return
raise RuntimeError()

for pod in pods:
# Retrieve logs from the pod
Expand All @@ -87,7 +92,10 @@ def get_logs(parts: str, name: str, lightkube_client: Client) -> None:
line = line.rstrip("\n")
logger.info(line)
except ApiError as e:
logger.error(e)
logger.debug(
f"Failed to retrieve logs for pod {pod.metadata.name}: {e}", exc_info=True
)
logger.error(
f"Failed to retrieve logs. There was a problem while getting the logs for {pod.metadata.name}" # noqa: E501
)
raise RuntimeError()
82 changes: 55 additions & 27 deletions src/dss/main.py
misohu marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import click
from lightkube.core.exceptions import ApiError

from dss.config import DEFAULT_NOTEBOOK_IMAGE, RECOMMENDED_IMAGES_MESSAGE
from dss.create_notebook import create_notebook
Expand Down Expand Up @@ -34,10 +33,15 @@ def initialize_command(kubeconfig: str) -> None:
"""
logger.info("Executing initialize command")

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

initialize(lightkube_client=lightkube_client)
initialize(lightkube_client=lightkube_client)
except Exception as e:
logger.debug(f"Failed to initialize dss: {e}.", exc_info=True)
logger.error(f"Failed to initialize dss: {str(e)}.")
click.get_current_context().exit(1)


IMAGE_OPTION_HELP = "\b\nThe image used for the notebook server.\n"
Expand Down Expand Up @@ -72,10 +76,17 @@ def create_notebook_command(name: str, image: str, kubeconfig: str) -> None:
" For more information on using a specific image, see dss create --help."
)

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

create_notebook(name=name, image=image, lightkube_client=lightkube_client)
create_notebook(name=name, image=image, lightkube_client=lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to create notebook {name}: {e}.", exc_info=True)
logger.error(f"Failed to create notebook {name}: {str(e)}.")
click.get_current_context().exit(1)


create_notebook_command.help += f"""
Expand Down Expand Up @@ -112,15 +123,22 @@ def logs_command(kubeconfig: str, notebook_name: str, print_all: bool, mlflow: b
)
return

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

if print_all:
get_logs("all", None, lightkube_client)
elif mlflow:
get_logs("mlflow", None, lightkube_client)
elif notebook_name:
get_logs("notebooks", notebook_name, lightkube_client)
if print_all:
get_logs("all", None, lightkube_client)
elif mlflow:
get_logs("mlflow", None, lightkube_client)
elif notebook_name:
get_logs("notebooks", notebook_name, lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to retrieve logs: {e}.", exc_info=True)
logger.error(f"Failed to retrieve logs: {str(e)}.")
click.get_current_context().exit(1)


@main.command(name="status")
Expand All @@ -130,10 +148,13 @@ def logs_command(kubeconfig: str, notebook_name: str, print_all: bool, mlflow: b
)
def status_command(kubeconfig: str) -> None:
"""Checks the status of key components within the DSS environment. Verifies if the MLflow deployment is ready and checks if GPU acceleration is enabled on the Kubernetes cluster by examining the labels of Kubernetes nodes for NVIDIA or Intel GPU devices.""" # noqa E501
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

get_status(lightkube_client)
get_status(lightkube_client)
except RuntimeError:
click.get_current_context().exit(1)
misohu marked this conversation as resolved.
Show resolved Hide resolved


@main.command(name="list")
Expand Down Expand Up @@ -178,13 +199,16 @@ def stop_notebook_command(kubeconfig: str, notebook_name: str):
Example:
dss stop my-notebook
"""
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)
stop_notebook(name=notebook_name, lightkube_client=lightkube_client)
except (RuntimeError, ApiError):
exit(1)
except RuntimeError:
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to stop notebook: {e}.", exc_info=True)
logger.error(f"Failed to stop notebook: {str(e)}.")
click.get_current_context().exit(1)


# FIXME: remove the `--kubeconfig`` option
Expand Down Expand Up @@ -237,13 +261,17 @@ def remove_notebook_command(name: str, kubeconfig: str):
"""
logger.info("Executing remove command")

kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

try:
kubeconfig = get_default_kubeconfig(kubeconfig)
lightkube_client = get_lightkube_client(kubeconfig)

remove_notebook(name=name, lightkube_client=lightkube_client)
except RuntimeError:
exit(1)
click.get_current_context().exit(1)
except Exception as e:
logger.debug(f"Failed to remove notebook: {e}.", exc_info=True)
logger.error(f"Failed to remove notebook: {str(e)}.")
click.get_current_context().exit(1)


@main.command(name="purge")
Expand Down
11 changes: 6 additions & 5 deletions src/dss/remove_notebook.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ def remove_notebook(name: str, lightkube_client: Client) -> None:
if not does_notebook_exist(
name=name, namespace=DSS_NAMESPACE, lightkube_client=lightkube_client
):
logger.error(
f"Failed to remove Notebook. Notebook {name} does not exist. Run 'dss list' to check all notebooks." # noqa E501
)
raise RuntimeError("Failed to remove Notebook not found.")
logger.debug(f"Failed to remove Notebook. Notebook {name} does not exist.")
logger.error(f"Failed to remove Notebook. Notebook {name} does not exist.") # noqa E501
logger.info("Run 'dss list' to check all notebooks.")
raise RuntimeError()

# From this point forward we know either one or both
# resources (Deployment, Service) exist for the Notebook.
Expand All @@ -49,11 +49,12 @@ def remove_notebook(name: str, lightkube_client: Client) -> None:
exceptions.append(err)

if exceptions:
logger.debug(f"Failed to remove notebook {name}: {exceptions}")
logger.error(f"Failed to remove notebook {name}. Please try again.")
logger.info("Note: You might want to run")
logger.info(" dss status to check the current status")
logger.info(f" dss logs {name} to review the notebook logs")
raise RuntimeError(f"Failed to remove notebook {name} with errors", exceptions)
raise RuntimeError()
else:
logger.info(
f"Removing the notebook {name}. Check `dss list` for the status of the notebook."
Expand Down
3 changes: 2 additions & 1 deletion src/dss/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,6 @@ def get_status(lightkube_client: Client) -> None:
else:
logger.info("GPU acceleration: Disabled")
except Exception as e:
logger.debug(f"Failed to retrieve status: {e}", exc_info=True)
logger.error(f"Failed to retrieve status: {e}")
return
raise RuntimeError()
9 changes: 5 additions & 4 deletions src/dss/stop.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@ def stop_notebook(name: str, lightkube_client: Client) -> None:
if not does_notebook_exist(
name=name, namespace=DSS_NAMESPACE, lightkube_client=lightkube_client
):
logger.debug(f"Failed to stop Notebook. Notebook {name} does not exist.")
logger.error(f"Failed to stop Notebook. Notebook {name} does not exist.")
logger.info("Run 'dss list' to check all notebooks.")
raise RuntimeError(f"Failed to stop Notebook. Notebook {name} does not exist.")
raise RuntimeError()

obj = Deployment.Scale(
metadata=ObjectMeta(name=name, namespace=DSS_NAMESPACE), spec=ScaleSpec(replicas=0)
Expand All @@ -39,6 +40,6 @@ def stop_notebook(name: str, lightkube_client: Client) -> None:
)
return
except ApiError as e:
logger.error(f"Failed to stop Notebook {name}")
logger.debug(f"Failed to scale down Deployment {name} with error: {e}")
raise e
logger.debug(f"Failed to scale down Deployment {name}: {e}", exc_info=True)
logger.error(f"Failed to stop notebook {name}.")
raise RuntimeError()
Loading
Loading