Skip to content
Snippets Groups Projects
Commit 545fbad2 authored by Aleš Mrázek
Browse files

Merge branch 'manager-instability-handling' into 'master'

manager: recovery from 'policy-loader' failure during reload

See merge request !1563
parents 23f1c587 b2682be4
No related branches found
No related tags found
1 merge request: !1563 "manager: recovery from 'policy-loader' failure during reload"
Pipeline #127648 failed
......@@ -27,9 +27,7 @@ class ConfigStore:
err_res = filter(lambda r: r.is_err(), results)
errs = list(map(lambda r: r.unwrap_err(), err_res))
if len(errs) > 0:
raise KresManagerException(
"Validation of the new config failed. The reasons are:\n - " + "\n - ".join(errs)
)
raise KresManagerException("Configuration validation failed. The reasons are:\n - " + "\n - ".join(errs))
async with self._update_lock:
# update the stored config with the new version
......@@ -39,6 +37,9 @@ class ConfigStore:
for call in self._callbacks:
await call(config)
# Re-submit the currently stored config (self._config) through update(), so the
# same validation and callback path used for a new config runs again without
# the config itself changing.
async def renew(self) -> None:
await self.update(self._config)
async def register_verifier(self, verifier: VerifyCallback) -> None:
self._verifiers.append(verifier)
res = await verifier(self.get(), self.get())
......
......@@ -327,7 +327,7 @@ class KresManager: # pylint: disable=too-many-instance-attributes
async def _instability_handler(self) -> None:
if self._fix_counter.is_too_high():
logger.error(
"Already attempted to many times to fix system state. Refusing to try again and shutting down."
"Already attempted too many times to fix system state. Refusing to try again and shutting down."
)
await self.forced_shutdown()
return
......@@ -337,13 +337,13 @@ class KresManager: # pylint: disable=too-many-instance-attributes
self._fix_counter.increase()
await self._reload_system_state()
logger.warning("Workers reloaded. Applying old config....")
await self.apply_config(self._config_store.get(), _noretry=True)
await self._config_store.renew()
logger.warning(f"System stability hopefully renewed. Fix attempt counter is currently {self._fix_counter}")
except BaseException:
logger.error("Failed attempting to fix an error. Forcefully shutting down.", exc_info=True)
await self.forced_shutdown()
async def _watchdog(self) -> None:
async def _watchdog(self) -> None: # pylint: disable=too-many-branches
while True:
await asyncio.sleep(WATCHDOG_INTERVAL)
......@@ -356,11 +356,12 @@ class KresManager: # pylint: disable=too-many-instance-attributes
expected_ids = [x.id for x in self._workers]
if self._gc:
expected_ids.append(self._gc.id)
if self._policy_loader:
expected_ids.append(self._policy_loader.id)
invoke_callback = False
if self._policy_loader:
expected_ids.append(self._policy_loader.id)
for eid in expected_ids:
if eid not in detected_subprocesses:
logger.error("Subprocess with id '%s' was not found in the system!", eid)
......@@ -368,6 +369,12 @@ class KresManager: # pylint: disable=too-many-instance-attributes
continue
if detected_subprocesses[eid] is SubprocessStatus.FATAL:
if self._policy_loader and self._policy_loader.id == eid:
logger.info(
"Subprocess '%s' is skipped by WatchDog because its status is monitored in a different way.",
eid,
)
continue
logger.error("Subprocess '%s' is in FATAL state!", eid)
invoke_callback = True
continue
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment