Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use longer timeouts for API checks before triggering a rollback #4658

Merged
merged 5 commits into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions supervisor/homeassistant/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,22 +130,22 @@ async def get_core_state(self) -> dict[str, Any]:
"""Return Home Assistant core state."""
return await self._get_json("api/core/state")

async def check_api_state(self) -> bool:
"""Return True if Home Assistant up and running."""
async def get_api_state(self) -> str | None:
"""Return state of Home Assistant Core or None."""
# Skip check on landingpage
if (
self.sys_homeassistant.version is None
or self.sys_homeassistant.version == LANDINGPAGE
):
return False
return None

# Check if port is up
if not await self.sys_run_in_executor(
check_port,
self.sys_homeassistant.ip_address,
self.sys_homeassistant.api_port,
):
return False
return None

# Check if API is up
with suppress(HomeAssistantAPIError):
Expand All @@ -157,7 +157,13 @@ async def check_api_state(self) -> bool:
else:
data = await self.get_config()
# Older versions of Home Assistant do not expose the state
if data and data.get("state", "RUNNING") == "RUNNING":
return True
if data:
return data.get("state", "RUNNING")

return None

async def check_api_state(self) -> bool:
    """Return True if Home Assistant Core is up and in the RUNNING state.

    Returns False when get_api_state() yields no state (None) or when
    Core reports any state other than "RUNNING".
    """
    # get_api_state() is a coroutine function and must be awaited. Without
    # the await, the (always truthy) coroutine object would be compared to
    # "RUNNING", so this check could never succeed.
    state = await self.get_api_state()
    return state == "RUNNING"
32 changes: 23 additions & 9 deletions supervisor/homeassistant/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,10 @@
_LOGGER: logging.Logger = logging.getLogger(__name__)

SECONDS_BETWEEN_API_CHECKS: Final[int] = 5
STARTUP_API_CHECK_TIMEOUT: Final[timedelta] = timedelta(minutes=5)
# Core Stage 1 and some wiggle room
STARTUP_API_RESPONSE_TIMEOUT: Final[timedelta] = timedelta(minutes=3)
# All stages plus event start timeout and some wiggle room
STARTUP_API_CHECK_RUNNING_TIMEOUT: Final[timedelta] = timedelta(minutes=15)
RE_YAML_ERROR = re.compile(r"homeassistant\.util\.yaml")


Expand Down Expand Up @@ -440,25 +443,36 @@ async def _block_till_run(self, version: AwesomeVersion) -> None:
return
_LOGGER.info("Wait until Home Assistant is ready")

start = datetime.now()
while not (timeout := datetime.now() >= start + STARTUP_API_CHECK_TIMEOUT):
deadline = datetime.now() + STARTUP_API_RESPONSE_TIMEOUT
last_state = None
while not (timeout := datetime.now() >= deadline):
await asyncio.sleep(SECONDS_BETWEEN_API_CHECKS)

# 1: Check if container is running
if not await self.instance.is_running():
_LOGGER.error("Home Assistant has crashed!")
break

# 2: Check if API response
if await self.sys_homeassistant.api.check_api_state():
_LOGGER.info("Detect a running Home Assistant instance")
self._error_state = False
return
# 2: Check API response
if state := await self.sys_homeassistant.api.get_api_state():
if last_state is None:
# API initially available, move deadline up and check API
# state to be running now
deadline = datetime.now() + STARTUP_API_CHECK_RUNNING_TIMEOUT

if last_state != state:
_LOGGER.info("Home Assistant Core state changed to %s", state)
last_state = state

if state == "RUNNING":
_LOGGER.info("Detect a running Home Assistant instance")
self._error_state = False
return

self._error_state = True
if timeout:
raise HomeAssistantStartupTimeout(
"No API response in 5 minutes, assuming core has had a fatal startup error",
"No Home Assistant Core response, assuming a fatal startup error",
agners marked this conversation as resolved.
Show resolved Hide resolved
_LOGGER.error,
)
raise HomeAssistantCrashError()
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,7 @@ async def coresys(

# WebSocket
coresys_obj.homeassistant.api.check_api_state = AsyncMock(return_value=True)
coresys_obj.homeassistant.api.get_api_state = AsyncMock(return_value="RUNNING")
coresys_obj.homeassistant._websocket._client = AsyncMock(
ha_version=AwesomeVersion("2021.2.4")
)
Expand Down
33 changes: 29 additions & 4 deletions tests/homeassistant/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ async def test_api_check_timeout(
"""Test attempts to contact the API timeout."""
container.status = "stopped"
coresys.homeassistant.version = AwesomeVersion("2023.9.0")
coresys.homeassistant.api.check_api_state.return_value = False
coresys.homeassistant.api.get_api_state.return_value = None

async def mock_instance_start(*_):
container.status = "running"
Expand All @@ -294,8 +294,33 @@ async def mock_sleep(*args):
), pytest.raises(HomeAssistantCrashError):
await coresys.homeassistant.core.start()

assert coresys.homeassistant.api.check_api_state.call_count == 5
assert coresys.homeassistant.api.get_api_state.call_count == 3
assert (
"No API response in 5 minutes, assuming core has had a fatal startup error"
in caplog.text
"No Home Assistant Core response, assuming a fatal startup error" in caplog.text
)


async def test_api_check_success(
    coresys: CoreSys, container: MagicMock, caplog: pytest.LogCaptureFixture
):
    """Test that Core start succeeds after a single API state check."""
    container.status = "stopped"
    coresys.homeassistant.version = AwesomeVersion("2023.9.0")

    async def fake_start(*_):
        # Starting the (mocked) container flips its status to running.
        container.status = "running"

    frozen_at = datetime(2023, 10, 2, 0, 0, 0)
    start_patch = patch.object(DockerHomeAssistant, "start", new=fake_start)
    initialized_patch = patch.object(
        DockerAPI, "container_is_initialized", return_value=True
    )

    with start_patch, initialized_patch, travel(frozen_at, tick=False) as clock:

        async def fake_sleep(*args):
            # Every simulated sleep moves the frozen clock forward one minute.
            clock.shift(timedelta(minutes=1))

        with patch("supervisor.homeassistant.core.asyncio.sleep", new=fake_sleep):
            await coresys.homeassistant.core.start()

    # The API state is queried exactly once and the success message is logged.
    assert coresys.homeassistant.api.get_api_state.call_count == 1
    assert "Detect a running Home Assistant instance" in caplog.text
Loading