-
Notifications
You must be signed in to change notification settings - Fork 54
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Report Spark job ID as container name
- Loading branch information
Showing
2 changed files
with
113 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
from datetime import datetime, timedelta | ||
from functools import lru_cache | ||
from typing import Optional, Any, Dict, cast, List | ||
|
||
import psutil | ||
import requests | ||
from psutil import Process | ||
|
||
from gprofiler.log import get_logger_adapter | ||
|
||
# Spark metrics configuration file deployed on Databricks nodes; its Ganglia
# sink entry holds the host the Spark WebUI listens on.
DATABRICKS_METRICS_PROP_PATH = "/databricks/spark/conf/metrics.properties"
# Property key (inside the file above) whose value is the WebUI host.
HOST_KEY_NAME = "*.sink.ganglia.host"
# Port the Spark WebUI is served on — presumably the Databricks default; TODO confirm.
DEFAULT_WEBUI_PORT = 40001
# How long a fetched running-jobs list is considered fresh before re-querying.
RUNNING_JOBS_CACHE_TTL = timedelta(seconds=60)

logger = get_logger_adapter(__name__)
|
||
|
||
class DatabricksWebUI:
    """Client for the Spark WebUI running on a Databricks cluster.

    Resolves the WebUI address from the Databricks metrics properties file and
    exposes lookups for the Spark executor / driver-daemon processes and the
    currently running Spark jobs (the latter cached for RUNNING_JOBS_CACHE_TTL).
    """

    def __init__(self) -> None:
        # Cached psutil handles; re-resolved whenever the cached process dies.
        # ("Process" annotations are quoted forward-refs so the class can be
        # imported without evaluating the psutil type at definition time.)
        self._executor_process: "Optional[Process]" = None
        self._driver_daemon_process: "Optional[Process]" = None
        # Successful lookups are memoized manually (instead of @lru_cache) so
        # that a None result is NOT cached: while the cluster is still
        # provisioning, the address / app id may not exist yet and callers are
        # expected to retry. lru_cache would have cached the None forever.
        self._webui_address: Optional[str] = None
        self._app_id: Optional[str] = None
        self._running_jobs: List[Dict[str, Any]] = []
        # Bug fix: this was assigned the *type* `Optional[datetime]` (always
        # truthy and not a datetime), which made the first TTL comparison in
        # get_running_jobs() raise a TypeError instead of fetching.
        self._last_running_jobs_fetch_time: Optional[datetime] = None

    def get_webui_address(self) -> Optional[str]:
        """Return the Spark WebUI "host:port", or None if not yet available.

        The host is taken from the Ganglia sink entry of the Databricks metrics
        properties file. A missing key is expected while the cluster is
        provisioning; that case is not cached so the caller can retry.

        Raises:
            Exception: if the properties file cannot be parsed.
        """
        if self._webui_address is not None:
            return self._webui_address
        with open(DATABRICKS_METRICS_PROP_PATH) as f:
            properties = f.read()
        try:
            # Ignore lines without a `=` declaration.
            properties_values = dict(line.split("=", 1) for line in properties.splitlines() if "=" in line)
            host = properties_values[HOST_KEY_NAME]
        except KeyError as e:
            if e.args[0] == HOST_KEY_NAME:
                # Might happen while provisioning the cluster, retry.
                return None
            raise Exception(f"Failed to get Databricks webui address {properties=}") from e
        except Exception as e:
            raise Exception(f"Failed to get Databricks webui address {properties=}") from e
        self._webui_address = f"{host}:{DEFAULT_WEBUI_PORT}"
        return self._webui_address

    def _find_process(self, cmdline_marker: str) -> "Optional[Process]":
        """Return the first live process whose cmdline contains `cmdline_marker`, else None."""
        for process in psutil.process_iter():
            try:
                if cmdline_marker in process.cmdline():
                    return process
            except psutil.Error:
                # The process may exit (or be inaccessible) between iteration
                # and the cmdline() call - skip it instead of crashing the scan.
                continue
        return None

    def get_executor_process(self) -> "Optional[Process]":
        """Return the Spark executor process, or None if it is not running."""
        if self._executor_process is not None and self._executor_process.is_running():
            return self._executor_process
        self._executor_process = self._find_process("org.apache.spark.executor.CoarseGrainedExecutorBackend")
        return self._executor_process

    def get_driver_daemon_process(self) -> "Optional[Process]":
        """Return the Spark DriverDaemon process, or None if it is not running."""
        if self._driver_daemon_process is not None and self._driver_daemon_process.is_running():
            return self._driver_daemon_process
        self._driver_daemon_process = self._find_process("org.apache.spark.deploy.master.DriverDaemon")
        return self._driver_daemon_process

    def web_ui_request(self, query: str) -> Any:
        """GET `query` from the WebUI REST API and return the parsed JSON.

        Raises:
            Exception: if the WebUI address could not be resolved yet.
        """
        webui_address = self.get_webui_address()
        if webui_address is None:
            raise Exception("WebUI address is not available")
        # Bounded timeout so a wedged WebUI cannot hang the caller forever.
        return cast(Any, requests.get(f"http://{webui_address}/{query}", timeout=10).json())

    def get_app_id(self) -> Optional[str]:
        """Return the single Spark application id, or None if none is registered yet.

        The id is cached after the first successful lookup; a None result is
        not cached so the caller can retry once the application registers.
        """
        if self._app_id is not None:
            return self._app_id
        apps = self.web_ui_request("api/v1/applications")
        if not apps:
            return None
        assert len(apps) == 1, f"expected a single Spark application, got {len(apps)}"
        self._app_id = cast(str, apps[0]["id"])
        return self._app_id

    def get_running_jobs(self) -> List[Dict[str, Any]]:
        """Return the currently running Spark jobs, cached for RUNNING_JOBS_CACHE_TTL."""
        if self._last_running_jobs_fetch_time is not None:
            if (datetime.now() - self._last_running_jobs_fetch_time) < RUNNING_JOBS_CACHE_TTL:
                return self._running_jobs

        app_id = self.get_app_id()
        if app_id is None:
            # The Spark application has not registered yet - nothing to report.
            return []

        self._running_jobs = cast(List, self.web_ui_request(f"api/v1/applications/{app_id}/jobs"))
        self._last_running_jobs_fetch_time = datetime.now()
        return self._running_jobs