Skip to content
Snippets Groups Projects
Verified Commit 7c242178 authored by Vladimír Čunát's avatar Vladimír Čunát Committed by Tomas Krizek
Browse files

lib/utils kr_fail(): simple rate limit for forking

Default 5 minutes (but off).
Randomize the delays +-25%.
parent 17c19474
Branches
Tags
1 merge request!1146lib/utils: define kr_require(), kr_assume() macros
......@@ -302,7 +302,7 @@ struct kr_server_selection {
kr_layer_t kr_layer_t_static;
_Bool kr_dbg_assumption_abort;
_Bool kr_dbg_assumption_fork;
int kr_dbg_assumption_fork;
typedef int32_t (*kr_stale_cb)(int32_t ttl, const knot_dname_t *owner, uint16_t type,
const struct kr_query *qry);
......
......@@ -135,7 +135,7 @@ EOF
printf "
kr_layer_t kr_layer_t_static;
_Bool kr_dbg_assumption_abort;
_Bool kr_dbg_assumption_fork;
int kr_dbg_assumption_fork;
"
printf "
......
......@@ -10,21 +10,25 @@ process is out of the scope of this documentation, but some tips can be found
Kresd uses *assumptions*, which are checks that should always pass and indicate
some weird or unexpected state if they don't. In such cases, they show up in
the log as errors. By default, the process recovers from those states, but the
the log as errors. By default, the process recovers from those states if possible, but the
behaviour can be changed with the following options to aid further debugging.
.. envvar:: debugging.assumption_abort = false|true
:return: boolean (default: false)
:return: boolean (default: false in meson's release mode, true otherwise)
Allow the process to be aborted in case it encounters a failed assumption.
(Some critical conditions always lead to abortion, regardless of settings.)
.. envvar:: debugging.assumption_fork = true|false
.. envvar:: debugging.assumption_fork = milliseconds
:return: boolean (default: true)
:return: int (default: 5 minutes in meson's release mode, 0 otherwise)
If a proccess should be aborted, it can be done in two ways. When this is
set to true (default), a child is forked and aborted to obtain a coredump,
set to nonzero (default), a child is forked and aborted to obtain a coredump,
while the parent process recovers and keeps running. This can be useful to
debug a rare issue that occurs in production, since it doesn't affect the
main process.
As the dumping can be costly, the value is a lower bound on delay between
consecutive coredumps of each process. It is randomized by +-25% each time.
......@@ -39,7 +39,7 @@
/* Logging & debugging */
bool kr_verbose_status = false;
bool kr_dbg_assumption_abort = DBG_ASSUMPTION_ABORT;
bool kr_dbg_assumption_fork = DBG_ASSUMPTION_FORK;
int kr_dbg_assumption_fork = DBG_ASSUMPTION_FORK;
void kr_fail(bool is_fatal, const char *expr, const char *func, const char *file, int line)
{
......@@ -50,8 +50,22 @@ void kr_fail(bool is_fatal, const char *expr, const char *func, const char *file
if (is_fatal || (kr_dbg_assumption_abort && !kr_dbg_assumption_fork))
abort();
else if (kr_dbg_assumption_abort && kr_dbg_assumption_fork)
fork() == 0 ? abort() : (void)0;
else if (!kr_dbg_assumption_abort || !kr_dbg_assumption_fork)
return;
// We want to fork and abort the child, unless rate-limited.
static uint64_t limited_until = 0;
const uint64_t now = kr_now();
if (now < limited_until)
return;
if (kr_dbg_assumption_fork > 0) {
// Add jitter +- 25%; in other words: 75% + uniform(0,50%).
// Motivation: if a persistent problem starts happening, desynchronize
// coredumps from different instances as they're not cheap.
limited_until = now + kr_dbg_assumption_fork * 3 / 4
+ kr_dbg_assumption_fork * kr_rand_bytes(1) / 256 / 2;
}
if (fork() == 0)
abort();
}
/*
......
......@@ -64,13 +64,19 @@ typedef void (*trace_log_f)(const struct kr_request *request, const char *msg);
/** Whether kr_assume() checks should abort. */
KR_EXPORT extern bool kr_dbg_assumption_abort;
/** Whether kr_assume() should fork the process before issuing abort (if configured).
/** How often kr_assume() should fork the process before issuing abort (if configured).
*
* This can be useful for debugging rare edge-cases in production. When both
* kr_debug_assumption_abort and kr_debug_assumption_fork are set to true, it is
* This can be useful for debugging rare edge-cases in production.
* if (kr_debug_assumption_abort && kr_debug_assumption_fork), it is
* possible to both obtain a coredump (from forked child) and recover from the
* non-fatal error in the parent process. */
KR_EXPORT extern bool kr_dbg_assumption_fork;
* non-fatal error in the parent process.
*
* == 0 (false): no forking
* > 0: minimum delay between forks
* (in milliseconds, each instance separately, randomized +-25%)
* < 0: no rate-limiting (not recommended)
*/
KR_EXPORT extern int kr_dbg_assumption_fork;
/** Use kr_require() and kr_assume() instead of directly this function. */
KR_EXPORT KR_COLD void kr_fail(bool is_fatal, const char* expr, const char *func,
......
......@@ -179,7 +179,11 @@ conf_data.set('ENABLE_XDP', xdp.to_int())
conf_data.set('ENABLE_CAP_NG', capng.found().to_int())
conf_data.set('ENABLE_DOH2', nghttp2.found().to_int())
conf_data.set('DBG_ASSUMPTION_ABORT', get_option('debug').to_int())
conf_data.set('DBG_ASSUMPTION_FORK', (not get_option('debug')).to_int())
if get_option('debug')
conf_data.set('DBG_ASSUMPTION_FORK', '0')
else
conf_data.set('DBG_ASSUMPTION_FORK', '(5 * 60 * 1000) /* five minutes */')
endif
kresconfig = configure_file(
output: 'kresconfig.h',
......
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment