diff --git a/NEWS b/NEWS index 614177308c8e151252da4bc9105550f0b100bff7..8511da326ccc0caa24fd031ff59c2610f0977207 100644 --- a/NEWS +++ b/NEWS @@ -5,6 +5,10 @@ Bugfixes -------- - /management/unix-socket: revert to absolute path (#926, !1664) +Improvements +------------ +- defer: hard timeout for continuous work on a single request, terminates kresd if exceeded (!1661) + Knot Resolver 6.0.11 (2025-02-26) ================================= diff --git a/daemon/defer.c b/daemon/defer.c index aeaded206d0f2ead4f2035667b5803ad7cafc579..d1705301c1abedfd537f5e2f8e0b836830322705 100644 --- a/daemon/defer.c +++ b/daemon/defer.c @@ -4,6 +4,7 @@ #include <math.h> #include <stdatomic.h> +#include <unistd.h> #include "daemon/defer.h" #include "daemon/session2.h" #include "daemon/udp_queue.h" @@ -11,6 +12,7 @@ #include "lib/mmapped.h" #include "lib/resolve.h" #include "lib/utils.h" +#include "lib/sigsafe.h" #define V4_PREFIXES (uint8_t[]) { 18, 20, 24, 32 } #define V4_RATE_MULT (kru_price_t[]) { 768, 256, 32, 1 } @@ -59,13 +61,20 @@ V6_CONF = {1, V6_PREFIXES_CNT, V6_PREFIXES, V6_RATE_MULT, V6_SUBPRIO}; #define VERBOSE_LOG(...) kr_log_debug(DEFER, " | " __VA_ARGS__) +// Uses NON-STANDARD format string, see sigsafe_format. +#define SIGSAFE_VERBOSE_LOG(max_size, ...) sigsafe_log(LOG_DEBUG, DEFER, max_size, " | " __VA_ARGS__) + + struct defer { size_t capacity; kru_price_t max_decay; uint32_t log_period; + uint32_t hard_timeout; + uint32_t coredump_period; int cpus; bool using_avx2; _Atomic uint32_t log_time; + _Atomic uint32_t coredump_time; _Alignas(64) uint8_t kru[]; }; struct defer *defer = NULL; @@ -245,7 +254,8 @@ static inline int kru_charge_classify(const struct kru_conf *kru_conf, uint8_t * uint16_t *out_load, uint8_t *out_prefix) { uint16_t loads[kru_conf->prefixes_cnt]; - KRU.load_multi_prefix((struct kru *)defer->kru, kr_now(), + const uint64_t now = kr_now(); // NOLINT, async-signal-safe, uv_now only reads uint64_t + KRU.load_multi_prefix((struct kru *)defer->kru, now, kru_conf->namespace, key, kru_conf->prefixes, prices, kru_conf->prefixes_cnt, loads); int priority = 0; @@ -299,8 +309,9 @@ void defer_charge(uint64_t nsec, union kr_sockaddr *addr, bool stream) uint8_t prefix; kru_charge_classify(kru_conf, key, prices, &load, &prefix); - VERBOSE_LOG(" %s ADD %4.3f ms * %.2f -> load: %d on /%d\n", - kr_straddr(&addr->ip), nsec / 1000000.0, pf16 / (float)(1<<16), load, prefix); + SIGSAFE_VERBOSE_LOG(KR_STRADDR_MAXLEN + 100, + " %r ADD %f ms * %f -> load: %u on /%u\n", + &addr->ip, nsec / 1000000.0, pf16 / (float)(1<<16), load, prefix); } /// Determine priority of the request in [0, QUEUES_CNT - 1]; @@ -443,17 +454,9 @@ static inline void process_single_deferred(void) VERBOSE_LOG(" BREAK (timeout)\n"); // notice logging according to log-period - const uint32_t time_now = kr_now(); - uint32_t log_time_orig = atomic_load_explicit(&defer->log_time, memory_order_relaxed); - if (defer->log_period) { - while (time_now - log_time_orig + 1024 >= defer->log_period + 1024) { - if (atomic_compare_exchange_weak_explicit(&defer->log_time, &log_time_orig, time_now, - memory_order_relaxed, memory_order_relaxed)) { - kr_log_notice(DEFER, "Data from %s too long in queue, dropping. (%0.3f MiB in queues)\n", - kr_straddr(ctx->comm->src_addr), waiting_requests_size / 1024.0 / 1024.0); - break; - } - } + if (kr_log_period(defer->log_period, &defer->log_time)) { + kr_log_notice(DEFER, "Data from %s too long in queue, dropping. (%0.3f MiB in queues)\n", + kr_straddr(ctx->comm->src_addr), waiting_requests_size / 1024.0 / 1024.0); } break_query(ctx, ETIME); @@ -642,9 +645,43 @@ static void defer_queues_idle(uv_idle_t *handle) VERBOSE_LOG("POLL\n"); } +static void defer_alarm(int signum) +{ + if (!defer || (defer->hard_timeout == 0)) return; + + uint64_t elapsed = 0; + if (defer_sample_state.is_accounting) { + elapsed = defer_get_stamp() - defer_sample_state.stamp; + SIGSAFE_VERBOSE_LOG(KR_STRADDR_MAXLEN + 100, + "SIGALRM %s, host %r used %f s of cpu time on ongoing operation.\n", + signum ? "received" : "initialized", + &defer_sample_state.addr.ip, elapsed / 1000000000.0); + } else { + SIGSAFE_VERBOSE_LOG(KR_STRADDR_MAXLEN + 100, + "SIGALRM %s, no measuring in progress.\n", + signum ? "received" : "initialized"); + } + int64_t rest_to_timeout_ms = defer->hard_timeout - elapsed / 1000000; // ms - ns + + if (rest_to_timeout_ms <= 0) { + defer_charge(elapsed, &defer_sample_state.addr, defer_sample_state.stream); + bool coredump = kr_log_period(defer->coredump_period, &defer->coredump_time); + sigsafe_log(LOG_CRIT, DEFER, KR_STRADDR_MAXLEN + 100, + "Host %r used %f s of cpu time continuously, interrupting kresd (%s).\n", + &defer_sample_state.addr.ip, elapsed / 1000000000.0, + coredump ? "abort" : "exit"); + if (coredump) { + abort(); + } else { + _exit(EXIT_FAILURE); + } + } + alarm((rest_to_timeout_ms + 999) / 1000); +} /// Initialize shared memory, queues. To be called from Lua. -int defer_init(const char *mmap_file, uint32_t log_period, int cpus) // TODO possibly remove cpus; not needed +int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, uint32_t coredump_period, int cpus) + // TODO possibly remove cpus; not needed { defer_initialized = true; if (mmap_file == NULL) { @@ -662,6 +699,8 @@ int defer_init(const char *mmap_file, uint32_t log_period, int cpus) // TODO po .capacity = KRU_CAPACITY, .max_decay = MAX_DECAY, .log_period = log_period, + .hard_timeout = hard_timeout, + .coredump_period = coredump_period, .cpus = cpus, .using_avx2 = using_avx2(), }; @@ -676,6 +715,8 @@ int defer_init(const char *mmap_file, uint32_t log_period, int cpus) // TODO po sizeof(header.capacity) + sizeof(header.max_decay) + sizeof(header.log_period) + + sizeof(header.hard_timeout) + + sizeof(header.coredump_period) + sizeof(header.cpus), "detected padding with undefined data inside mmapped header"); @@ -692,7 +733,8 @@ int defer_init(const char *mmap_file, uint32_t log_period, int cpus) // TODO po goto fail; } - defer->log_time = kr_now() - log_period; + defer->log_time = kr_log_period_init(log_period); + defer->coredump_time = kr_log_period_init(coredump_period); ret = mmapped_init_continue(&defer_mmapped); if (ret != 0) goto fail; @@ -713,6 +755,12 @@ int defer_init(const char *mmap_file, uint32_t log_period, int cpus) // TODO po for (size_t i = 0; i < QUEUES_CNT; i++) queue_init(queues[i]); + if (signal(SIGALRM, defer_alarm) == SIG_ERR) { + kr_log_error(DEFER, "Cannot set SIGALRM handler, interrupting of too long work on a single request will not work: %s\n", + strerror(errno)); + } + defer_alarm(0); + return 0; fail: diff --git a/daemon/defer.h b/daemon/defer.h index e6ade87aad9d2cf2094c0ef3bda53755e15eae09..57ad13d3922b73c512a5666e32cf53250946e41f 100644 --- a/daemon/defer.h +++ b/daemon/defer.h @@ -9,9 +9,9 @@ /// Initialize defer, incl. shared memory with KRU, excl. idle. KR_EXPORT -int defer_init(const char *mmap_file, uint32_t log_period, int cpus); +int defer_init(const char *mmap_file, uint32_t log_period, uint32_t hard_timeout, uint32_t coredump_period, int cpus); -/// Initialize idle. +/// Initialize idle and SIGALRM handler. int defer_init_idle(uv_loop_t *loop); /// Deinitialize shared memory. @@ -48,7 +48,9 @@ static inline uint64_t defer_get_stamp(void) uint64_t stamp = now_ts.tv_nsec + 1000*1000*1000 * (uint64_t)now_ts.tv_sec; if (defer_uvtime_stamp + 1000*1000 < stamp) { defer_uvtime_stamp = stamp; - uv_update_time(uv_default_loop()); + uv_update_time(uv_default_loop()); // NOLINT, async-signal-safe + // on Linux, it just calls clock_gettime(CLOCK_MONOTONIC[_COARSE], ...) and sets value for uv_now (kr_now); + // libuv probably updates time just once per loop by itself } return stamp; } @@ -92,9 +94,10 @@ static inline void defer_sample_start_stamp(uint64_t stamp) { if (!defer) return; kr_assert(!defer_sample_state.is_accounting); - defer_sample_state.is_accounting = true; defer_sample_state.stamp = stamp; defer_sample_state.addr.ip.sa_family = AF_UNSPEC; + __sync_synchronize(); + defer_sample_state.is_accounting = true; } /// Internal; stop accounting work at specified timestamp and charge the source if applicable. @@ -103,6 +106,7 @@ static inline void defer_sample_stop_stamp(uint64_t stamp) if (!defer) return; kr_assert(defer_sample_state.is_accounting); defer_sample_state.is_accounting = false; + __sync_synchronize(); if (defer_sample_state.addr.ip.sa_family == AF_UNSPEC) return; @@ -123,7 +127,10 @@ static inline bool defer_sample_is_accounting(void) /// Start accounting work; optionally save state of current accounting. /// Current state can be saved only after having an address assigned. static inline void defer_sample_start(defer_sample_state_t *prev_state_out) { - if (!defer) return; + if (!defer) { + if (prev_state_out) *prev_state_out = (defer_sample_state_t){ 0 }; // just to meet undefined-value check of linter, but never used + return; + } uint64_t stamp = defer_get_stamp(); // suspend @@ -159,7 +166,10 @@ static inline void defer_sample_stop(defer_sample_state_t *prev_state, bool reus // resume if (prev_state) { - defer_sample_state = *prev_state; + defer_sample_state.addr = prev_state->addr; + defer_sample_state.stream = prev_state->stream; defer_sample_state.stamp = stamp; + __sync_synchronize(); + defer_sample_state.is_accounting = prev_state->is_accounting; } } diff --git a/daemon/lua/kres-gen-33.lua b/daemon/lua/kres-gen-33.lua index 8147af88f8c1228a696e347aec8f37a90fc37a96..9e602472668d6a7886d2f32833b8853ee34a84a5 100644 --- a/daemon/lua/kres-gen-33.lua +++ b/daemon/lua/kres-gen-33.lua @@ -620,7 +620,7 @@ struct qr_task *worker_resolve_start(knot_pkt_t *, struct kr_qflags); int zi_zone_import(const zi_config_t); _Bool ratelimiting_request_begin(struct kr_request *); int ratelimiting_init(const char *, size_t, uint32_t, uint32_t, uint16_t, uint32_t, _Bool); -int defer_init(const char *, uint32_t, int); +int defer_init(const char *, uint32_t, uint32_t, uint32_t, int); void defer_set_price_factor16(struct kr_request *, uint32_t); struct engine { char _stub[]; diff --git a/daemon/main.c b/daemon/main.c index a7b9c92b6d9c304c462f44d297c3fd783700ec42..773bbbde5e666abd120d126749bb750bfd373034 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -553,7 +553,7 @@ int main(int argc, char **argv) uv_loop_t *loop = uv_default_loop(); /* Catch some signals. */ - uv_signal_t sigint, sigterm, sigchld; + uv_signal_t sigint, sigterm, sigchld; // +SIGALRM handled by defer if (true) ret = uv_signal_init(loop, &sigint); if (!ret) ret = uv_signal_init(loop, &sigterm); if (!ret) ret = uv_signal_init(loop, &sigchld); @@ -618,7 +618,7 @@ int main(int argc, char **argv) if (!defer_initialized) { kr_log_warning(SYSTEM, "Prioritization not initialized from Lua, using hardcoded default.\n"); - ret = defer_init("defer", 1, 1); + ret = defer_init("defer", 1, 0, 0, 1); if (ret) { ret = EXIT_FAILURE; goto cleanup; diff --git a/daemon/ratelimiting.c b/daemon/ratelimiting.c index d182658e49b5aa090ea49a8dac23440eb6605036..f42a7469210d8bf542dc332b37db88e76015d6fd 100644 --- a/daemon/ratelimiting.c +++ b/daemon/ratelimiting.c @@ -90,7 +90,7 @@ int ratelimiting_init(const char *mmap_file, size_t capacity, uint32_t instant_l goto fail; } - ratelimiting->log_time = kr_now() - log_period; + ratelimiting->log_time = kr_log_period_init(log_period); for (size_t i = 0; i < V4_PREFIXES_CNT; i++) { ratelimiting->v4_prices[i] = base_price / V4_RATE_MULT[i]; @@ -175,18 +175,11 @@ bool ratelimiting_request_begin(struct kr_request *req) ((ratelimiting->slip == 1) ? true : false); // logging - uint32_t log_time_orig = atomic_load_explicit(&ratelimiting->log_time, memory_order_relaxed); - if (ratelimiting->log_period) { - while (time_now - log_time_orig + 1024 >= ratelimiting->log_period + 1024) { - if (atomic_compare_exchange_weak_explicit(&ratelimiting->log_time, &log_time_orig, time_now, - memory_order_relaxed, memory_order_relaxed)) { - kr_log_notice(SYSTEM, "address %s rate-limited on /%d (%s%s)\n", - kr_straddr(req->qsource.addr), limited_prefix, - ratelimiting->dry_run ? "dry-run, " : "", - tc ? "truncated" : "dropped"); - break; - } - } + if (kr_log_period(ratelimiting->log_period, &ratelimiting->log_time)) { + kr_log_notice(SYSTEM, "address %s rate-limited on /%d (%s%s)\n", + kr_straddr(req->qsource.addr), limited_prefix, + ratelimiting->dry_run ? "dry-run, " : "", + tc ? "truncated" : "dropped"); } req->ratelimited = true; // we set this even on dry_run diff --git a/daemon/worker.c b/daemon/worker.c index 07480b24ca6f5e133243971bcada295b8351fe6e..83e3434f9215d85392c74e79ea71d09069bc6041 100644 --- a/daemon/worker.c +++ b/daemon/worker.c @@ -2024,7 +2024,8 @@ static enum protolayer_event_cb_result pl_dns_stream_disconnected( return PROTOLAYER_EVENT_PROPAGATE; defer_sample_state_t defer_prev_sample_state; - if (session->outgoing) + const bool use_recursive_defer_sampling = session->outgoing; // make linter warning silent + if (use_recursive_defer_sampling) defer_sample_start(&defer_prev_sample_state); while (!session2_waitinglist_is_empty(session)) { @@ -2043,7 +2044,7 @@ static enum protolayer_event_cb_result pl_dns_stream_disconnected( qry->flags.TCP = false; } qr_task_step(task, NULL, NULL); - defer_sample_restart(); + defer_sample_restart(); // use_recursive_defer_sampling == true } else { kr_assert(task->ctx->source.session == session); task->ctx->source.session = NULL; @@ -2068,7 +2069,7 @@ static enum protolayer_event_cb_result pl_dns_stream_disconnected( worker_task_unref(task); } - if (session->outgoing) + if (use_recursive_defer_sampling) defer_sample_stop(&defer_prev_sample_state, true); return PROTOLAYER_EVENT_PROPAGATE; diff --git a/doc/_static/config.schema.json b/doc/_static/config.schema.json index 0bedbbc4ed1b314e73bd6539b33142f0148b0452..a22a92832ac0bcdb22da943f9be03d1567e743ce 100644 --- a/doc/_static/config.schema.json +++ b/doc/_static/config.schema.json @@ -1751,11 +1751,25 @@ "pattern": "^(\\d+)(us|ms|s|m|h|d)$", "description": "Minimal time between two log messages, or '0s' to disable.", "default": "0s" + }, + "hard-timeout": { + "type": "string", + "pattern": "^(\\d+)(us|ms|s|m|h|d)$", + "description": "If a measured operation lasts longer, kresd is interrupted; use '0s' to disable.", + "default": "0s" + }, + "coredump-period": { + "type": "string", + "pattern": "^(\\d+)(us|ms|s|m|h|d)$", + "description": "Minimal time between two coredumps caused by hard_timeout, or '0s' to disable them.", + "default": "10m" } }, "default": { "enabled": false, - "log_period": "0s" + "log_period": "0s", + "hard_timeout": "0s", + "coredump_period": "10m" } }, "lua": { diff --git a/doc/dev/logging_api.rst b/doc/dev/logging_api.rst index 09b705c8be1384fe42e85b4c2ac2e448880572d7..a4a093a7327f52d629894fc9bd71015c81e69d49 100644 --- a/doc/dev/logging_api.rst +++ b/doc/dev/logging_api.rst @@ -3,6 +3,44 @@ Logging API reference ===================== +Logging overview +---------------- + +Kresd process uses one of several logging targets according to its configuration; +it can log to ``stdout``, ``stderr`` and ``syslog`` (either via ``syslog`` function of via ``libsystemd``). +There is also rarely used async-signal-safe way of logging defined in ``lib/sigsafe.h``, +which tries to be as consistent with the standard logging as possible, +though some inconsistencies arise to avoid unsafe function calls like ``syslog``. + +Based on how knot-resolver was executed, there are different ways of handling logs: + +With systemd service using manager (the standard way) +each process (incl. manager, etc) uses ``syslog``/``libsystemd`` directly, +sigsafe variant prints lines prefixed with loglevel to ``stderr``, +which should be handled by systemd the same way. +Using systemd without manager (the legacy way) works the same. + +When using manager with non-systemd syslog, +all processes use directly ``syslog``, +sigsafe uses the ``stderr`` with prefixing as before, +which is now handled by supervisor resending the output using ``syslog``. +The form of the messages may be a little different, but all end up in the syslog. +As supervisor cannot handle syslog loglevels by itself, +all sigsafe messages use the same level in this case. + +When using manager with stdout logging (``./poe run``), +each process prints to stdout, +which is prefixed with timestamp and process name by supervisor. +Sigsafe works the same (no loglevel prefixing). + +When spawning kresd on your own, logging target depends on your configuration. +If logging to ``stdout``/``stderr``, sigsafe is consistent; +if using ``syslog``, you need to handle also ``<N>``-prefixed lines on ``stderr``. + + +API reference +------------- + .. _config_log_groups: .. doxygenfile:: lib/log.h diff --git a/doc/user/config-defer.rst b/doc/user/config-defer.rst index 4d26ee403c21f2637bde85de7c76cd545789d6d7..43c2cefbcbe6597a7f5da28556ad8faa51f94ccc 100644 --- a/doc/user/config-defer.rst +++ b/doc/user/config-defer.rst @@ -8,8 +8,10 @@ Request prioritization (defer) Defer tries to mitigate DoS attacks by measuring cpu time consumption of different hosts and networks and deferring future requests from the same origin. If there is not enough time to process all the requests, the lowest priority ones are dropped. +It also allows setting a hard timeout on a continuous computation on a single request. The time measurements are taken into account only for TCP-based queries (including DoT and DoH), +except for hard timeout which is applied for both, as the source address of plain UDP can be forged. We aim to spend half of the time for UDP without prioritization and half of the time for non-UDP with prioritization, @@ -46,6 +48,49 @@ The limits can be adjusted for different packet origins using :option:`price-fac and sources with more dropped queries have greater probability to be chosen. +.. option:: defer/hard-timeout: <time ms|s|m|h|d> + + :default: 0s + + Time limit for a cpu time consumed continuously on a single request, or ``0s`` to disable. + It causes crash of kresd if exceeded; use carefully. + + This is intended as a last resort defence against yet unknown bugs + allowing an attacker to initiate very expensive computations by a single request + resulting in freezing kresd process for several seconds or minutes. + + It is based on scheduling a SIGALRM to be delivered after the timeout (or up to 1s later), + which then interrupts the computation. + After interrupting the priority of the request's origin is decreased according to the duration (if non-UDP), + the kresd process is terminated (dropping all pending, but probably already timeouted, requests) + and started again by manager. + To keep the data with measurements and priorities alive during restart, + it is crucial to use :ref:`multiple workers <config-multiple-workers>` + as those data are shared between them and disappear with the last one. + + A continuous work on a single request usually takes under 1 ms. + Set the timeout to 1s or higher values to avoid random crashes. + +.. option:: defer/coredump-period: <time ms|s|m|h|d> + + :defeult: 10m + + Minimal time between two coredumps caused by :option:`hard-timeout <defer/hard-timeout: <time ms|s|m|h|d>`, + or ``0s`` to disable them. + + If kresd is to be terminated due to :option:`hard-timeout <defer/hard-timeout: <time ms|s|m|h|d>`, + it calls ``abort``, which might cause coredump to be generated, and disables this behaviour + for :option:`coredump-period <defer/coredump-period: <time ms|s|m|h|d>`. + Subsequent terminations call just ``_exit``, so that kresd is terminated without coredump. + + The last abortion timestamp is stored along with other defer data + in the memory shared between workers which disappears with the last one; + it is thus needed to use :ref:`multiple workers <config-multiple-workers>` + to keep the data alive during restart. + Otherwise, :option:`coredump-period <defer/coredump-period: <time ms|s|m|h|d>` has no effect + and coredumps are always enabled. + + Implementation details ---------------------- @@ -64,4 +109,3 @@ Further ordering is according to the time of arrival. If a request is deferred for too long, it gets dropped. This can happen also for UDP requests, which are stored in a single queue ordered by the time of their arrival. - diff --git a/lib/log.c b/lib/log.c index e1f4d543676cc9c7233ca3068853150c648839f7..a52fd7f324ecb77274506237762770110e11cd5d 100644 --- a/lib/log.c +++ b/lib/log.c @@ -22,9 +22,7 @@ bool use_journal = false; kr_log_level_t kr_log_level = LOG_DEFAULT_LEVEL; kr_log_target_t kr_log_target = LOG_TARGET_DEFAULT; - -/** Set of log-groups that are on debug level. It's a bitmap over 1 << enum kr_log_group. */ -static uint64_t kr_log_groups = 0; +uint64_t kr_log_groups = 0; typedef struct { const char *g_name; diff --git a/lib/log.h b/lib/log.h index 30fc29373808fe482d8510e2cefe48aa10af9b8b..b441453e4f132988a0b18f319b645f6f580a991c 100644 --- a/lib/log.h +++ b/lib/log.h @@ -140,6 +140,9 @@ enum kr_log_group { #define LOG_GRP_REQDBG_TAG "reqdbg" /**< ``reqdbg``: debug logs enabled by policy actions */ ///@} +// Async-signal-safe variant of the lowercase function; fails silently. +#define KR_LOG_GROUP_IS_SET(grp) ((LOG_GRP_ ## grp < 1) || (kr_log_groups & (1ULL << LOG_GRP_ ## grp))) + KR_EXPORT bool kr_log_group_is_set(enum kr_log_group group); KR_EXPORT @@ -160,6 +163,10 @@ typedef int kr_log_level_t; KR_EXPORT extern kr_log_level_t kr_log_level; +/** Set of log-groups that are on debug level. It's a bitmap over 1 << enum kr_log_group. Read only. */ +KR_EXPORT extern +uint64_t kr_log_groups; + /** Set the current logging level. */ KR_EXPORT void kr_log_level_set(kr_log_level_t level); diff --git a/lib/meson.build b/lib/meson.build index 94bb80c7313371fbef327083263a563bc3908124..2d6d6bef1e289007e8428f1deec85765fc2397b3 100644 --- a/lib/meson.build +++ b/lib/meson.build @@ -39,6 +39,7 @@ libkres_src = files([ 'selection.c', 'selection_forward.c', 'selection_iter.c', + 'sigsafe.c', 'utils.c', 'zonecut.c', ]) @@ -75,6 +76,7 @@ libkres_headers = files([ 'selection.h', 'selection_forward.h', 'selection_iter.h', + 'sigsafe.h', 'utils.h', 'zonecut.h', ]) diff --git a/lib/sigsafe.c b/lib/sigsafe.c new file mode 100644 index 0000000000000000000000000000000000000000..bb9cb660f16d04267ad2a03f832c05f0af10b3d6 --- /dev/null +++ b/lib/sigsafe.c @@ -0,0 +1,261 @@ +#include <stdarg.h> +#include <stdbool.h> +#include <netinet/in.h> +#include <sys/un.h> + +#include "./sigsafe.h" + +/* + The sigsafe_append_TYPE functions convert value into NULL-terminated string, + they take + * dst: beginning of the output buffer, + * dste: end of the output buffer (excl.), + * ... + and return ptr to final terminating '\0' within the buffer; + the buffer has to contain space for at least the '\0', which is always written. + The APPEND(TYPE, ...) macro calls the sigsafe_append_TYPE function and modifies dst variable + to allow further appending. +*/ + +#define APPEND(type, ...) dst = sigsafe_append_ ## type (dst, dste, __VA_ARGS__) + +/// Appends character multiple times (negative cnt works as zero). +char *sigsafe_append_char(char *dst, char *dste, char c, int cnt) { + while ((dst < dste - 1) && (cnt-- > 0)) { + *dst++ = c; + } + *dst = '\0'; + return dst; +} + +/// Appends data of given length (NULL-terminating them). +char *sigsafe_append_data(char *dst, char *dste, const char *src, size_t len) { + const size_t max_len = dste-dst-1; + if (len > max_len) { + len = max_len; + } + + memcpy(dst, src, len); + dst += len; + *dst = '\0'; + return dst; +} + +/// Appends string, possibly padded to given width. +char *sigsafe_append_str(char *dst, char *dste, int width, bool align_left, const char *str) { + int len = strlen(str); + if (!align_left) APPEND(char, ' ', width - len); + APPEND(data, str, len); + if (align_left) APPEND(char, ' ', width - len); + return dst; +} + +/// Appends unsigned int in given numeral base, possibly padded to given width and preceeded with the given sign. +char *sigsafe_append_uint(char *dst, char *dste, int base, int width, char padding_char, char sign_char, unsigned val) { + const char digits[] ="0123456789abcdef"; + char tmp[sizeof(val) * 8 / 3 + 1]; // just digits of the resulting number, not null-terminated + char *sp = tmp + sizeof(tmp); + char *se = sp; + while ((val > 0) || (sp == se)) { + *--sp = digits[val % base]; + val /= base; + width--; + } + + if (sign_char) { + width--; + if (padding_char == '0') { + APPEND(char, sign_char, 1); + } + } + APPEND(char, padding_char, width); + if (sign_char && (padding_char != '0')) { + APPEND(char, sign_char, 1); + } + + return APPEND(data, sp, se-sp); +} + +/// Appends signed int in given numeral base, possibly padded to given width. +char *sigsafe_append_int(char *dst, char *dste, int base, int width, char padding_char, int val) { + char sign_char = 0; + if (val < 0) { + sign_char = '-'; + val *= -1; + } + return APPEND(uint, base, width, padding_char, sign_char, val); +} + + +/// Appends real number with given precision, possibly padded to given width. +char *sigsafe_append_double(char *dst, char *dste, int width, int precision, double val) { + int sign = 1; + if (val < 0) { + val *= -1; + sign = -1; + } + int64_t pmult = 1; + for (int i = 0; i < precision; i++) { + pmult *= 10; + } + int64_t vali = val * pmult + 0.5; // NOLINT(bugprone-incorrect-roundings), just minor imprecisions + // larger numbers, NaNs, ... are not handled + APPEND(int, 10, width - precision - 1, ' ', sign * (vali / pmult)); + APPEND(char, '.', 1); + APPEND(uint, 10, precision, '0', 0, (vali % pmult)); + return dst; +} + +/// Find indices of maximal zero-filled gap in IPv6 (zeroes-end index is excl.) +static inline void sigsafe_inet6_longest_zeroes(uint8_t *ipv6, int *zb_out, int *ze_out) { + *zb_out = -2; *ze_out = 0; // nothing to be skipped + int zb = 0, ze = 0; + for (size_t i = 0; i < 16; i += 2) { + if (!ipv6[i] && !ipv6[i+1]) { + if (i == ze) { + ze += 2; + } else { + if (ze - zb > *ze_out - *zb_out) { + *zb_out = zb; + *ze_out = ze; + } + zb = i; ze = i + 2; + } + } + } + if (ze - zb > *ze_out - *zb_out) { + *zb_out = zb; + *ze_out = ze; + } +} + +/// Appends network address containing AF_UNIX, AF_INET (with port), or AF_INET6 (with port). +char *sigsafe_append_sockaddr(char *dst, char *dste, struct sockaddr *addr) { + if (!addr) { + return APPEND(str, 0, false, "(null)"); + } + switch (addr->sa_family) { + case AF_UNIX: + return APPEND(str, 0, false, ((struct sockaddr_un *)addr)->sun_path); + case AF_INET: { + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + uint8_t *ipv4 = (uint8_t *)&(addr4->sin_addr); + uint8_t *port = (uint8_t *)&(addr4->sin_port); + for (int i = 0; i < 4; i++) { + APPEND(uint, 10, 0, '0', 0, ipv4[i]); + APPEND(char, "...#"[i], 1); + } + APPEND(uint, 10, 0, '0', 0, (port[0] << 8) | port[1]); + return dst; + }; + case AF_INET6: { + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + uint8_t *ipv6 = (uint8_t *)&(addr6->sin6_addr); + uint8_t *port = (uint8_t *)&(addr6->sin6_port); + int zb, ze; // maximal zero-filled gap begin (incl.) and end (excl.) + sigsafe_inet6_longest_zeroes(ipv6, &zb, &ze); + for (int i = -!zb; i < 15; i++) { + if (i == zb) i = ze - 1; // after ':' (possibly for i=-1), skip sth. and continue with ':' (possibly for i=15) + if (i%2) { + APPEND(char, ':', 1); + } else { + APPEND(uint, 16, 0, '0', 0, (ipv6[i] << 8) | ipv6[i+1]); + } + } + APPEND(char, '#', 1); + APPEND(uint, 10, 0, '0', 0, (port[0] << 8) | port[1]); + return dst; + }; + case AF_UNSPEC: + return APPEND(str, 0, false, "(unspec)"); + default: + return APPEND(str, 0, false, "(unknown)"); + } +} + +int sigsafe_format(char *str, size_t size, const char *fmt, ...) { + char *dst = str; // ptr just after the last written non-null character + char *dste = str + size; // ptr just after str buffer + va_list ap; + va_start(ap, fmt); // NOLINT, should be safe in GCC + while (*fmt && (dste-dst > 1)) { + if (*fmt != '%') { + char *perc = strchr(fmt, '%'); + int len = perc ? perc - fmt : strlen(fmt); + APPEND(data, fmt, len); + fmt += len; + continue; + } + fmt++; + + bool flag_zero = false; + bool flag_left = false; + while (true) { + switch(*fmt) { + case '0': + flag_zero = true; + fmt++; + continue; + case '-': + flag_left = true; + fmt++; + continue; + default: + break; + } + break; + } + + int width = 0; + while (('0' <= *fmt) && (*fmt <= '9')) { + width = width * 10 + *fmt - '0'; + fmt++; + } + + int precision = 3; + if (*fmt == '.') { + fmt++; + precision = 0; + while (('0' <= *fmt) && (*fmt <= '9')) { + precision = precision * 10 + *fmt - '0'; + fmt++; + } + } + + switch(*fmt) { + case '%': + APPEND(char, '%', 1); + break; + case 's': + APPEND(str, width, flag_left, + va_arg(ap, char *)); // NOLINT, should be safe in GCC + break; + case 'x': + APPEND(uint, 16, width, flag_zero ? '0' : ' ', 0, + va_arg(ap, unsigned)); // NOLINT, should be safe in GCC + break; + case 'u': + APPEND(uint, 10, width, flag_zero ? '0' : ' ', 0, + va_arg(ap, unsigned)); // NOLINT, should be safe in GCC + break; + case 'i': + APPEND(int, 10, width, flag_zero ? '0' : ' ', + va_arg(ap, int)); // NOLINT, should be safe in GCC + break; + case 'f': + APPEND(double, width, precision, + va_arg(ap, double)); // NOLINT, should be safe in GCC + break; + case 'r': + APPEND(sockaddr, + va_arg(ap, void *)); // NOLINT, should be safe in GCC + break; + default: + APPEND(str, 0, false, "[ERR]"); + break; + } + fmt++; + } + va_end(ap); // NOLINT, should be safe in GCC + return dst-str; +} diff --git a/lib/sigsafe.h b/lib/sigsafe.h new file mode 100644 index 0000000000000000000000000000000000000000..efc3372c85ecd86f449541f0c2747740654a2974 --- /dev/null +++ b/lib/sigsafe.h @@ -0,0 +1,31 @@ +/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz> + * SPDX-License-Identifier: GPL-3.0-or-later + */ + +#pragma once +#include "lib/defines.h" +#include "lib/log.h" + +/// Async-signal-safe snprintf-like formatting function, it supports: +/// * %% prints %; +/// * %s takes (char *), supports width and '-'-flag; +/// * %i takes int, supports width and '0'-flag; +/// * %u takes unsigned, supports width and '0'-flag; +/// * %x takes unsigned, supports width and '0'-flag; +/// * %f takes double, supports width and precision (defaults to .3); +/// * %r takes (struct sockaddr *). +KR_EXPORT +int sigsafe_format(char *str, size_t size, const char *fmt, ...); +#define sigsafe_format(...) sigsafe_format(__VA_ARGS__) // NOLINT, all calls are async-signal-safe + +/// Log according to the set target to stdout or stderr; +/// for syslog, print to stderr prefixed with <loglevel>, as syslog() is not async-signal-safe. +#define sigsafe_log(level, grp, max_size, fmt, ...) { \ + if ((KR_LOG_LEVEL_IS(level) || KR_LOG_GROUP_IS_SET(grp))) { \ + char msg[max_size + 12]; \ + int len = 0; \ + if (kr_log_target == LOG_TARGET_SYSLOG) \ + len += sigsafe_format(msg, sizeof(msg), "<%u>", level); \ + len += sigsafe_format(msg + len, sizeof(msg) - len, "[%-6s] " fmt, LOG_GRP_ ## grp ## _TAG, ## __VA_ARGS__); \ + write(kr_log_target == LOG_TARGET_STDOUT ? 1 : 2, msg, len); \ + }} diff --git a/lib/utils.h b/lib/utils.h index b6b350c8d8b3724411dbc361a2b9d83a749870e3..d7596a67ba2c223451420029b3df84de1ca62ffa 100644 --- a/lib/utils.h +++ b/lib/utils.h @@ -5,6 +5,7 @@ #pragma once #include <dirent.h> +#include <stdatomic.h> #include <stdio.h> #include <stdlib.h> #include <stdbool.h> @@ -628,3 +629,22 @@ static inline const knot_dname_t * knot_dname_next_label(const knot_dname_t *dn } #endif +/* Determine whether to perform an action (logging) limited once per time period in ms. */ +static inline bool kr_log_period(uint32_t period, _Atomic uint32_t *last_time) { + const uint32_t time_now = kr_now(); // 32 bits are sufficient here + uint32_t last_time_orig = atomic_load_explicit(last_time, memory_order_relaxed); + if (period) { + while (time_now - last_time_orig + 1024 >= period + 1024) { + if (atomic_compare_exchange_weak_explicit(last_time, &last_time_orig, time_now, + memory_order_relaxed, memory_order_relaxed)) { + return true; + } + } + } + return false; +} + +/* Initialize last_time for kr_log_period. */ +static inline uint32_t kr_log_period_init(uint32_t period) { + return kr_now() - period; +} diff --git a/python/knot_resolver/controller/supervisord/config_file.py b/python/knot_resolver/controller/supervisord/config_file.py index 05e380dcf6c5bf0407df60eafd9d2c20c54e8dd2..fdde29607a594b028515e136d6f62f00fafd41a7 100644 --- a/python/knot_resolver/controller/supervisord/config_file.py +++ b/python/knot_resolver/controller/supervisord/config_file.py @@ -141,6 +141,10 @@ class ProcessTypeConfig: ) +def sd_booted(): + return os.path.lexists("/run/systemd/system") # same as in libsystemd + + @dataclass class SupervisordConfig: unix_http_server: Path @@ -149,6 +153,7 @@ class SupervisordConfig: logfile: Path loglevel: Literal["critical", "error", "warn", "info", "debug", "trace", "blather"] target: LogTargetEnum + systemd_logfile: Literal["NONE", "PASS"] @staticmethod def create(config: KresConfig) -> "SupervisordConfig": @@ -165,6 +170,12 @@ class SupervisordConfig: "debug": "debug", }[config.logging.level] cwd = str(os.getcwd()) + + # Keep stderr FD unchanged if logging to systemd (it is likely to be handled by systemd as well as syslog); + # reformat `<N>` line prefixes in supervisord's patch_logger module, otherwise. + # Use as the value of `stderr_logfile` where desired. + systemd_logfile = "" if config.logging.target == "syslog" and sd_booted() else "NONE" + return SupervisordConfig( # type: ignore[call-arg] unix_http_server=supervisord_sock_file(config), pid_file=supervisord_pid_file(config), @@ -172,6 +183,7 @@ class SupervisordConfig: logfile=Path("syslog" if config.logging.target == "syslog" else "/dev/null"), loglevel=loglevel, # type: ignore[arg-type] target=config.logging.target, + systemd_logfile=systemd_logfile, # type: ignore[arg-type] ) diff --git a/python/knot_resolver/controller/supervisord/plugin/patch_logger.py b/python/knot_resolver/controller/supervisord/plugin/patch_logger.py index db27f105c19a12dbfabf00f656fc1bd586de910a..f401d95c877b8278621e2660446bddd06395dd27 100644 --- a/python/knot_resolver/controller/supervisord/plugin/patch_logger.py +++ b/python/knot_resolver/controller/supervisord/plugin/patch_logger.py @@ -2,12 +2,14 @@ # pylint: disable=protected-access import os +import re import sys import traceback from typing import Any, Literal from supervisor.dispatchers import POutputDispatcher from supervisor.loggers import LevelsByName, StreamHandler, SyslogHandler +from supervisor.process import Subprocess from supervisor.supervisord import Supervisor FORWARD_LOG_LEVEL = LevelsByName.CRIT # to make sure it's always printed @@ -17,7 +19,10 @@ def empty_function(*args, **kwargs): pass -FORWARD_MSG_FORMAT: str = "%(name)s[%(pid)d]%(stream)s: %(data)s" +FORWARD_MSG_FORMAT: str = "%(prefix)s%(name)s[%(pid)d]%(stream)s: %(data)s" +FORWARD_MSG_PREFIX: str = "" + +loglevel_re = re.compile(r"<(\d)>(.*)") def p_output_dispatcher_log(self: POutputDispatcher, data: bytearray): @@ -36,10 +41,25 @@ def p_output_dispatcher_log(self: POutputDispatcher, data: bytearray): config.options.logger.handlers = forward_handlers for line in text.splitlines(): stream = "" - if self.channel == "stderr": - stream = " (stderr)" + prefix = "" + loglevel_match = loglevel_re.match(line) + if loglevel_match: + # just strip the loglevel as supervisor cannot handle it; + # used only for target=syslog without systemd + line = loglevel_match.group(2) # pylint: disable=redefined-loop-name + else: + # no loglevel found, mark as stdio output to retain previous behaviour + if self.channel == "stderr": + stream = " (stderr)" + prefix = FORWARD_MSG_PREFIX config.options.logger.log( - FORWARD_LOG_LEVEL, FORWARD_MSG_FORMAT, name=config.name, stream=stream, data=line, pid=self.process.pid + FORWARD_LOG_LEVEL, + FORWARD_MSG_FORMAT, + prefix=prefix, + name=config.name, + stream=stream, + data=line, + pid=self.process.pid, ) config.options.logger.handlers = supervisord_handlers @@ -54,6 +74,21 @@ def _create_handler(fmt, level, target: Literal["stdout", "stderr", "syslog"]) - return handler +# keep stderr FD unchanged if stderr_logfile is empty; same behavior as originally otherwise; +# we use "" as other strings involve creation of file of that name (bool("") == False) +def _prepare_child_fds(self): + options = self.config.options + options.dup2(self.pipes["child_stdin"], 0) + options.dup2(self.pipes["child_stdout"], 1) + if self.config.stderr_logfile != "": + if self.config.redirect_stderr: + options.dup2(self.pipes["child_stdout"], 2) + else: + options.dup2(self.pipes["child_stderr"], 2) + for i in range(3, options.minfds): + options.close_fd(i) + + supervisord_handlers = [] forward_handlers = [] @@ -77,12 +112,15 @@ def inject(supervisord: Supervisor, **config: Any) -> Any: # pylint: disable=us # replace output handler for subprocesses POutputDispatcher._log = p_output_dispatcher_log # noqa: SLF001 + # replace setting FDs of subprocesses + Subprocess._prepare_child_fds = _prepare_child_fds # noqa: SLF001 + # we forward stdio in all cases, even when logging to syslog. This should prevent the unforturtunate # case of swallowing an error message leaving the users confused. To make the forwarded lines obvious # we just prepend a explanatory string at the beginning of all messages if config["target"] == "syslog": - global FORWARD_MSG_FORMAT - FORWARD_MSG_FORMAT = "captured stdio output from " + FORWARD_MSG_FORMAT + global FORWARD_MSG_PREFIX + FORWARD_MSG_PREFIX = "captured stdio output from " # this method is called by supervisord when loading the plugin, # it should return XML-RPC object, which we don't care about diff --git a/python/knot_resolver/controller/supervisord/supervisord.conf.j2 b/python/knot_resolver/controller/supervisord/supervisord.conf.j2 index 4179d522114e3885946c6705155e501f0e1392f7..3546478500f68b73d95b9c27c0385ed14e1769a6 100644 --- a/python/knot_resolver/controller/supervisord/supervisord.conf.j2 +++ b/python/knot_resolver/controller/supervisord/supervisord.conf.j2 @@ -66,7 +66,7 @@ killasgroup=true startsecs=60 environment={{ kresd.environment }} stdout_logfile=NONE -stderr_logfile=NONE +stderr_logfile={{ config.systemd_logfile }} # empty string has special meaning defined in patch_logger [program:policy-loader] directory={{ loader.workdir }} @@ -77,7 +77,7 @@ killasgroup=true startsecs=300 environment={{ loader.environment }} stdout_logfile=NONE -stderr_logfile=NONE +stderr_logfile={{ config.systemd_logfile }} # empty string has special meaning defined in patch_logger [program:cache-gc] redirect_stderr=false diff --git a/python/knot_resolver/datamodel/defer_schema.py b/python/knot_resolver/datamodel/defer_schema.py index 81546a0756841e5a2671e7b2e47281763a3f8f83..3154e1044ed2bca61f74652290bf238de6300231 100644 --- a/python/knot_resolver/datamodel/defer_schema.py +++ b/python/knot_resolver/datamodel/defer_schema.py @@ -9,7 +9,11 @@ class DeferSchema(ConfigSchema): --- enabled: Use request prioritization. log_period: Minimal time between two log messages, or '0s' to disable. + hard_timeout: If a measured operation lasts longer, kresd is interrupted; use '0s' to disable. + coredump_period: Minimal time between two coredumps caused by hard_timeout, or '0s' to disable them. """ enabled: bool = False log_period: TimeUnit = TimeUnit("0s") + hard_timeout: TimeUnit = TimeUnit("0s") + coredump_period: TimeUnit = TimeUnit("10m") diff --git a/python/knot_resolver/datamodel/templates/defer.lua.j2 b/python/knot_resolver/datamodel/templates/defer.lua.j2 index 131b71c4760943e000e8328e75e4900659efe933..e43a12fb0b58965debdddc4c11f10e02e37cf5d7 100644 --- a/python/knot_resolver/datamodel/templates/defer.lua.j2 +++ b/python/knot_resolver/datamodel/templates/defer.lua.j2 @@ -4,7 +4,9 @@ assert(C.defer_init( '{{ cfg.rundir }}/defer', {{ cfg.defer.log_period.millis() }}, + {{ cfg.defer.hard_timeout.millis() }}, + {{ cfg.defer.coredump_period.millis() }}, {{ cfg.workers }}) == 0) {% else %} -assert(C.defer_init(nil, 0, 0) == 0) +assert(C.defer_init(nil, 0, 0, 0, 0) == 0) {%- endif %}