Skip to content
Snippets Groups Projects
Commit fd0561ac authored by Vladimír Čunát
Browse files

Merge !1651: manager: processes watchdog error during shutdown

parents 6db58b91 6f837e90
No related branches found
No related tags found
1 merge request: !1651 "manager: processes watchdog error during shutdown"
Pipeline #135478 failed
......@@ -5,6 +5,7 @@ Bugfixes
--------
- manager: avoid an uncommon startup race in policy-loader (!1653)
[WARN] exited: policy-loader (exit status 0; not expected)
- manager: fix processes watchdog errors during shutdown (!1651)
Knot Resolver 6.0.10 (2025-01-20)
......
......@@ -53,7 +53,7 @@ async def _reload_supervisord(config: KresConfig) -> None:
supervisord = _create_supervisord_proxy(config)
supervisord.reloadConfig()
except Fault as e:
raise SubprocessControllerError("supervisord reload failed") from e
raise SubprocessControllerError(f"supervisord reload failed: {e}") from e
@async_in_a_thread
......
......@@ -5,6 +5,7 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from knot_resolver.constants import API_SOCK_FILE, RUN_DIR, VERSION
from knot_resolver.datamodel.cache_schema import CacheSchema
from knot_resolver.datamodel.defer_schema import DeferSchema
from knot_resolver.datamodel.dns64_schema import Dns64Schema
from knot_resolver.datamodel.dnssec_schema import DnssecSchema
from knot_resolver.datamodel.forward_schema import ForwardSchema
......@@ -17,7 +18,6 @@ from knot_resolver.datamodel.monitoring_schema import MonitoringSchema
from knot_resolver.datamodel.network_schema import NetworkSchema
from knot_resolver.datamodel.options_schema import OptionsSchema
from knot_resolver.datamodel.rate_limiting_schema import RateLimitingSchema
from knot_resolver.datamodel.defer_schema import DeferSchema
from knot_resolver.datamodel.templates import POLICY_CONFIG_TEMPLATE, WORKER_CONFIG_TEMPLATE
from knot_resolver.datamodel.types import EscapedStr, IntPositive, WritableDir
from knot_resolver.datamodel.view_schema import ViewSchema
......
from knot_resolver.utils.modeling import ConfigSchema
from knot_resolver.datamodel.types import TimeUnit
from knot_resolver.utils.modeling import ConfigSchema
class DeferSchema(ConfigSchema):
......
......@@ -27,7 +27,7 @@ class RateLimitingSchema(ConfigSchema):
dry_run: bool = False
def _validate(self) -> None:
max_instant_limit = int(2 ** 32 // 768 - 1)
max_instant_limit = int(2**32 // 768 - 1)
if not int(self.instant_limit) <= max_instant_limit:
raise ValueError(f"'instant-limit' has to be in range 1..{max_instant_limit}")
if not int(self.rate_limit) <= 1000 * int(self.instant_limit):
......
from typing import List, Literal, Optional
from knot_resolver.datamodel.types import IDPattern, IPNetwork
from knot_resolver.datamodel.types import FloatNonNegative
from knot_resolver.datamodel.types import FloatNonNegative, IDPattern, IPNetwork
from knot_resolver.utils.modeling import ConfigSchema
......
......@@ -13,7 +13,7 @@ PID_FILE_NAME = "knot-resolver.pid"
FIX_COUNTER_ATTEMPTS_MAX = 2
FIX_COUNTER_DECREASE_INTERVAL_SEC = 30 * 60
WATCHDOG_INTERVAL_SEC: float = 5
PROCESSES_WATCHDOG_INTERVAL_SEC: float = 5
def kres_cache_dir(config: "KresConfig") -> Path:
......
......@@ -22,7 +22,7 @@ from knot_resolver.utils.compat.asyncio import create_task
from knot_resolver.utils.functional import Result
from knot_resolver.utils.modeling.types import NoneType
from .constants import FIX_COUNTER_ATTEMPTS_MAX, FIX_COUNTER_DECREASE_INTERVAL_SEC, WATCHDOG_INTERVAL_SEC
from .constants import FIX_COUNTER_ATTEMPTS_MAX, FIX_COUNTER_DECREASE_INTERVAL_SEC, PROCESSES_WATCHDOG_INTERVAL_SEC
logger = logging.getLogger(__name__)
......@@ -91,7 +91,7 @@ class KresManager: # pylint: disable=too-many-instance-attributes
self._manager_lock = asyncio.Lock()
self._workers_reset_needed: bool = False
self._controller: SubprocessController
self._watchdog_task: Optional["asyncio.Task[None]"] = None
self._processes_watchdog_task: Optional["asyncio.Task[None]"] = None
self._fix_counter: _FixCounter = _FixCounter()
self._config_store: ConfigStore
self._shutdown_triggers: List[Callable[[int], None]] = []
......@@ -116,7 +116,7 @@ class KresManager: # pylint: disable=too-many-instance-attributes
# initialize subprocess controller
logger.debug("Starting controller")
await self._controller.initialize_controller(config_store.get())
self._watchdog_task = create_task(self._watchdog())
self._processes_watchdog_task = create_task(self._processes_watchdog())
logger.debug("Looking for already running workers")
await self._collect_already_running_workers()
......@@ -350,10 +350,10 @@ class KresManager: # pylint: disable=too-many-instance-attributes
return Result.ok(None)
async def stop(self):
if self._watchdog_task is not None:
self._watchdog_task.cancel() # cancel it
if self._processes_watchdog_task is not None:
try:
await self._watchdog_task # and let it really finish
self._processes_watchdog_task.cancel() # cancel it
await self._processes_watchdog_task # and let it really finish
except asyncio.CancelledError:
pass
......@@ -390,9 +390,9 @@ class KresManager: # pylint: disable=too-many-instance-attributes
logger.error("Failed attempting to fix an error. Forcefully shutting down.", exc_info=True)
await self.forced_shutdown()
async def _watchdog(self) -> None: # pylint: disable=too-many-branches # noqa: PLR0912
async def _processes_watchdog(self) -> None: # pylint: disable=too-many-branches # noqa: PLR0912
while True:
await asyncio.sleep(WATCHDOG_INTERVAL_SEC)
await asyncio.sleep(PROCESSES_WATCHDOG_INTERVAL_SEC)
self._fix_counter.try_decrease()
......@@ -437,16 +437,22 @@ class KresManager: # pylint: disable=too-many-instance-attributes
)
invoke_callback = True
except SubprocessControllerError as e:
# wait a few seconds and see if 'processes_watchdog' task is cancelled (during shutdown)
# otherwise it is an error
await asyncio.sleep(3)
invoke_callback = True
logger.error(f"Processes watchdog failed with SubprocessControllerError: {e}")
except asyncio.CancelledError:
raise
except BaseException:
invoke_callback = True
logger.error("Knot Resolver watchdog failed with an unexpected exception.", exc_info=True)
logger.error("Processes watchdog failed with an unexpected exception.", exc_info=True)
if invoke_callback:
try:
await self._instability_handler()
except Exception:
logger.error("Watchdog failed while invoking instability callback", exc_info=True)
logger.error("Processes watchdog failed while invoking instability callback", exc_info=True)
logger.error("Violently terminating!")
sys.exit(1)
......@@ -20,7 +20,7 @@ from aiohttp.web_runner import AppRunner, TCPSite, UnixSite
from knot_resolver.constants import CONFIG_FILE, USER
from knot_resolver.controller import get_best_controller_implementation
from knot_resolver.controller.exceptions import SubprocessControllerExecError
from knot_resolver.controller.exceptions import SubprocessControllerError, SubprocessControllerExecError
from knot_resolver.controller.interface import SubprocessType
from knot_resolver.controller.registered_workers import command_single_registered_worker
from knot_resolver.datamodel import kres_config_json_schema
......@@ -611,6 +611,10 @@ async def start_server(config: Path = CONFIG_FILE) -> int: # noqa: PLR0915
# and finally exec what we were told to exec
os.execl(*e.exec_args)
except SubprocessControllerError as e:
logger.error(f"Server initialization failed: {e}")
return 1
except KresManagerException as e:
# We caught an error with a pretty error message. Just print it and exit.
logger.error(e)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment