/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#pragma once
#include "daemon/engine.h"
#include "daemon/worker.h" /* the_worker is often useful */
#include <lua.h>
#include <lauxlib.h>
/* It may happen that include files are messed up and we're hitting a header
* e.g. from vanilla Lua. Even 5.1 won't work due to missing luaL_traceback() in <lauxlib.h>. */
#if (LUA_VERSION_NUM) != 501 || !defined(LUA_LJDIR)
#error "Incorrect Lua version in #include <lua.h> - LuaJIT compatible with Lua 5.1 is required"
#endif
/** Useful to stringify macros into error strings. */
#define STR(s) STRINGIFY_TOKEN(s)
#define STRINGIFY_TOKEN(s) #s
/** Check lua table at the top of the stack for allowed keys.
* \param keys NULL-terminated array of 0-terminated strings
* \return NULL if passed or the offending string (pushed on top of lua stack)
* \note Future work: if non-NULL is returned, there's extra stuff on the lua stack.
* \note Brute-force complexity: table length * summed length of keys.
*/
const char * lua_table_checkindices(lua_State *L, const char *keys[]);
/** If the value at the top of the stack isn't a table, make it a single-element list. */
static inline void lua_listify(lua_State *L)
{
if (lua_istable(L, -1))
return;
lua_createtable(L, 1, 0);
lua_insert(L, lua_gettop(L) - 1); /* swap the top two stack elements */
lua_pushinteger(L, 1);
lua_insert(L, lua_gettop(L) - 1); /* swap the top two stack elements */
lua_settable(L, -3);
}
/** Throw a formatted lua error.
*
* The message will get prefixed by "ERROR: " and supplemented by stack trace.
* \return never! It calls lua_error().
*
* Example:
ERROR: not a valid pin_sha256: 'a1Z/3ek=', raw length 5 instead of 32
stack traceback:
[C]: in function 'tls_client'
/PathToPREFIX/lib/kdns_modules/policy.lua:175: in function 'TLS_FORWARD'
/PathToConfig.lua:46: in main chunk
*/
KR_PRINTF(2) KR_NORETURN KR_COLD
void lua_error_p(lua_State *L, const char *fmt, ...);
/** @internal Annotate for static checkers. */
KR_NORETURN int lua_error(lua_State *L);
/** Shortcut for common case. */
static inline void lua_error_maybe(lua_State *L, int err)
{
if (err) lua_error_p(L, "%s", kr_strerror(err));
}
static inline int execute_callback(lua_State *L, int argc)
{
int ret = engine_pcall(L, argc);
if (ret != 0) {
kr_log_error(SYSTEM, "error: %s\n", lua_tostring(L, -1));
}
/* Clear the stack; the event handle and/or any return values may remain on it. */
lua_settop(L, 0);
return ret;
}
/** Push a pointer as heavy/full userdata.
*
* It's useful as a replacement of lua_pushlightuserdata(),
* but note that it behaves differently in lua (converts to pointer-to-pointer).
*/
static inline void lua_pushpointer(lua_State *L, void *p)
{
void **addr = lua_newuserdata(L, sizeof(void *));
kr_require(addr);
memcpy(addr, &p, sizeof(void *));
}
/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#include "daemon/bindings/impl.h"
/** List loaded modules */
static int mod_list(lua_State *L)
{
const module_array_t * const modules = engine_modules();
lua_newtable(L);
for (unsigned i = 0; i < modules->len; ++i) {
struct kr_module *module = modules->at[i];
lua_pushstring(L, module->name);
lua_rawseti(L, -2, i + 1);
}
return 1;
}
/** Load module. */
static int mod_load(lua_State *L)
{
/* Check parameters */
int n = lua_gettop(L);
if (n != 1 || !lua_isstring(L, 1))
lua_error_p(L, "expected 'load(string name)'");
/* Parse precedence declaration */
char *declaration = strdup(lua_tostring(L, 1));
if (!declaration)
return kr_error(ENOMEM);
const char *name = strtok(declaration, " ");
const char *precedence = strtok(NULL, " ");
const char *ref = strtok(NULL, " ");
/* Load engine module */
int ret = engine_register(name, precedence, ref);
free(declaration);
if (ret != 0) {
if (ret == kr_error(EIDRM)) {
lua_error_p(L, "referenced module not found");
} else {
lua_error_maybe(L, ret);
}
}
lua_pushboolean(L, 1);
return 1;
}
/** Unload module. */
static int mod_unload(lua_State *L)
{
/* Check parameters */
int n = lua_gettop(L);
if (n != 1 || !lua_isstring(L, 1))
lua_error_p(L, "expected 'unload(string name)'");
/* Unload engine module */
int ret = engine_unregister(lua_tostring(L, 1));
lua_error_maybe(L, ret);
lua_pushboolean(L, 1);
return 1;
}
int kr_bindings_modules(lua_State *L)
{
static const luaL_Reg lib[] = {
{ "list", mod_list },
{ "load", mod_load },
{ "unload", mod_unload },
{ NULL, NULL }
};
luaL_register(L, "modules", lib);
return 1;
}
.. SPDX-License-Identifier: GPL-3.0-or-later
Modules
=======
Knot Resolver functionality consists of separate modules, which allow you
to mix and match the features you need without slowing down operation
with features you do not use.
In practice this means that you need to load a module before using the features it contains, for example:
.. code-block:: lua
-- load module and make dnstap features available
modules.load('dnstap')
-- configure dnstap features
dnstap.config({
socket_path = "/tmp/dnstap.sock"
})
Ordering matters: you have to load a module first and configure it only after it is loaded.
Here is the full reference for module configuration:
.. function:: modules.list()
:return: List of loaded modules.
.. function:: modules.load(name)
:param string name: Module name, e.g. "hints"
:return: ``true`` if the module was (or already is) loaded, an error otherwise.
Load a module by name.
.. function:: modules.unload(name)
:param string name: Module name, e.g. "detect_time_jump"
:return: ``true`` if the module was unloaded, an error otherwise.
Unload a module by name. This is useful for unloading modules loaded by default, mainly for debugging purposes.
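For example, a short interactive session combining the calls above (module names are taken from the examples on this page):
.. code-block:: lua
-- list the modules that are currently loaded
modules.list()
-- load the hints module, then unload a module that is loaded by default
modules.load('hints')
modules.unload('detect_time_jump')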
/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#include "daemon/bindings/impl.h"
#include "contrib/base64.h"
#include "contrib/cleanup.h"
#include "daemon/network.h"
#include "daemon/tls.h"
#include "lib/utils.h"
#include <stdlib.h>
#define PROXY_DATA_STRLEN (INET6_ADDRSTRLEN + 1 + 3 + 1)
/** Table and next index on top of stack -> append entries for given endpoint_array_t. */
static int net_list_add(const char *b_key, uint32_t key_len, trie_val_t *val, void *ext)
{
endpoint_array_t *ep_array = *val;
lua_State *L = (lua_State *)ext;
lua_Integer i = lua_tointeger(L, -1);
for (int j = 0; j < ep_array->len; ++j) {
struct endpoint *ep = &ep_array->at[j];
lua_newtable(L); // connection tuple
if (ep->flags.kind) {
lua_pushstring(L, ep->flags.kind);
} else if (ep->flags.http && ep->flags.tls) {
lua_pushliteral(L, "doh2");
} else if (ep->flags.tls) {
lua_pushliteral(L, "tls");
} else if (ep->flags.xdp) {
lua_pushliteral(L, "xdp");
} else {
lua_pushliteral(L, "dns");
}
lua_setfield(L, -2, "kind");
lua_newtable(L); // "transport" table
switch (ep->family) {
case AF_INET:
lua_pushliteral(L, "inet4");
break;
case AF_INET6:
lua_pushliteral(L, "inet6");
break;
case AF_XDP:
lua_pushliteral(L, "inet4+inet6"); // both UDP ports at once
break;
case AF_UNIX:
lua_pushliteral(L, "unix");
break;
default:
kr_assert(false);
lua_pushliteral(L, "invalid");
}
lua_setfield(L, -2, "family");
const char *ip_str_const = network_endpoint_key_str((struct endpoint_key *) b_key);
kr_require(ip_str_const);
auto_free char *ip_str = strdup(ip_str_const);
kr_require(ip_str);
char *hm = strchr(ip_str, '#');
if (hm) /* Omit port */
*hm = '\0';
lua_pushstring(L, ip_str);
if (ep->family == AF_INET || ep->family == AF_INET6) {
lua_setfield(L, -2, "ip");
lua_pushboolean(L, ep->flags.freebind);
lua_setfield(L, -2, "freebind");
} else if (ep->family == AF_UNIX) {
lua_setfield(L, -2, "path");
} else if (ep->family == AF_XDP) {
lua_setfield(L, -2, "interface");
lua_pushinteger(L, ep->nic_queue);
lua_setfield(L, -2, "nic_queue");
}
if (ep->family != AF_UNIX) {
lua_pushinteger(L, ep->port);
lua_setfield(L, -2, "port");
}
if (ep->family == AF_UNIX) {
lua_pushliteral(L, "stream");
} else if (ep->flags.sock_type == SOCK_STREAM) {
lua_pushliteral(L, "tcp");
} else if (ep->flags.sock_type == SOCK_DGRAM) {
lua_pushliteral(L, "udp");
} else {
kr_assert(false);
lua_pushliteral(L, "invalid");
}
lua_setfield(L, -2, "protocol");
lua_setfield(L, -2, "transport");
lua_settable(L, -3);
i++;
lua_pushinteger(L, i);
}
return kr_ok();
}
/** List active endpoints. */
static int net_list(lua_State *L)
{
lua_newtable(L);
lua_pushinteger(L, 1);
trie_apply_with_key(the_network->endpoints, net_list_add, L);
lua_pop(L, 1);
return 1;
}
/** Listen on an address list represented by the top of lua stack.
* \note flags.kind ownership is not transferred, and flags.sock_type doesn't make sense
* \return success */
static bool net_listen_addrs(lua_State *L, int port, endpoint_flags_t flags, int16_t nic_queue)
{
if (kr_fails_assert(flags.xdp || nic_queue == -1))
return false;
/* Case: table with 'addr' field; only follow that field directly. */
lua_getfield(L, -1, "addr");
if (!lua_isnil(L, -1)) {
lua_replace(L, -2);
} else {
lua_pop(L, 1);
}
/* Case: string, representing a single address. */
const char *str = lua_tostring(L, -1);
if (str != NULL) {
const bool is_unix = str[0] == '/';
int ret = 0;
if (!flags.kind && !flags.tls) { /* normal UDP or XDP */
flags.sock_type = SOCK_DGRAM;
ret = network_listen(str, port, nic_queue, flags);
}
if (!flags.kind && !flags.xdp && ret == 0) { /* common for TCP, DoT and DoH (v2) */
flags.sock_type = SOCK_STREAM;
ret = network_listen(str, port, nic_queue, flags);
}
if (flags.kind) {
flags.kind = strdup(flags.kind);
flags.sock_type = SOCK_STREAM; /* TODO: allow to override this? */
ret = network_listen(str, (is_unix ? 0 : port), nic_queue, flags);
}
if (ret == 0) return true; /* success */
if (is_unix) {
kr_log_error(NETWORK, "bind to '%s' (UNIX): %s\n",
str, kr_strerror(ret));
} else if (flags.xdp) {
const char *err_str = knot_strerror(ret);
if (ret == KNOT_ELIMIT) {
if ((strcmp(str, "::") == 0 || strcmp(str, "0.0.0.0") == 0)) {
err_str = "wildcard addresses not supported with XDP";
} else {
err_str = "address matched multiple network interfaces";
}
} else if (ret == kr_error(ENODEV)) {
err_str = "invalid address or interface name";
}
/* Notable OK strerror: KNOT_EPERM Operation not permitted */
if (nic_queue == -1) {
kr_log_error(NETWORK, "failed to initialize XDP for '%s@%d'"
" (nic_queue = <auto>): %s\n",
str, port, err_str);
} else {
kr_log_error(NETWORK, "failed to initialize XDP for '%s@%d'"
" (nic_queue = %d): %s\n",
str, port, nic_queue, err_str);
}
} else {
const char *stype = flags.sock_type == SOCK_DGRAM ? "UDP" : "TCP";
kr_log_error(NETWORK, "bind to '%s@%d' (%s): %s\n",
str, port, stype, kr_strerror(ret));
}
return false; /* failure */
}
/* Last case: table where all entries are added recursively. */
if (!lua_istable(L, -1))
lua_error_p(L, "bad type for address");
lua_pushnil(L);
while (lua_next(L, -2)) {
if (!net_listen_addrs(L, port, flags, nic_queue))
return false;
lua_pop(L, 1);
}
return true;
}
static bool table_get_flag(lua_State *L, int index, const char *key, bool def)
{
bool result = def;
lua_getfield(L, index, key);
if (lua_isboolean(L, -1)) {
result = lua_toboolean(L, -1);
}
lua_pop(L, 1);
return result;
}
/** Listen on endpoint. */
static int net_listen(lua_State *L)
{
/* Check parameters */
int n = lua_gettop(L);
if (n < 1 || n > 3) {
lua_error_p(L, "expected one to three arguments; usage:\n"
"net.listen(addresses, [port = " STR(KR_DNS_PORT)
", flags = {tls = (port == " STR(KR_DNS_TLS_PORT) ")}])\n");
}
int port = KR_DNS_PORT;
if (n > 1) {
if (lua_isnumber(L, 2)) {
port = lua_tointeger(L, 2);
} else
if (!lua_isnil(L, 2)) {
lua_error_p(L, "wrong type of second parameter (port number)");
}
}
endpoint_flags_t flags = { 0 };
if (port == KR_DNS_TLS_PORT) {
flags.tls = true;
} else if (port == KR_DNS_DOH_PORT) {
flags.http = flags.tls = true;
}
int16_t nic_queue = -1;
if (n > 2 && !lua_isnil(L, 3)) {
if (!lua_istable(L, 3))
lua_error_p(L, "wrong type of third parameter (table expected)");
flags.tls = table_get_flag(L, 3, "tls", flags.tls);
flags.freebind = table_get_flag(L, 3, "freebind", false);
lua_getfield(L, 3, "kind");
const char *k = lua_tostring(L, -1);
if (k && strcasecmp(k, "dns") == 0) {
flags.tls = flags.http = false;
} else if (k && strcasecmp(k, "xdp") == 0) {
flags.tls = flags.http = false;
flags.xdp = true;
} else if (k && strcasecmp(k, "tls") == 0) {
flags.tls = true;
flags.http = false;
} else if (k && strcasecmp(k, "doh2") == 0) {
flags.tls = flags.http = true;
} else if (k) {
flags.kind = k;
if (strcasecmp(k, "doh") == 0) {
lua_error_p(L, "kind=\"doh\" was renamed to kind=\"doh_legacy\", switch to the new implementation with kind=\"doh2\" or update your config");
}
}
lua_getfield(L, 3, "nic_queue");
if (lua_isnumber(L, -1)) {
if (flags.xdp) {
nic_queue = lua_tointeger(L, -1);
} else {
lua_error_p(L, "nic_queue only supported with kind = 'xdp'");
}
} else if (!lua_isnil(L, -1)) {
lua_error_p(L, "wrong value of nic_queue (integer expected)");
}
}
/* Memory management of `kind` string is difficult due to longjmp etc.
* Pop will unreference the lua value, so we store it on C stack instead (!) */
const int kind_alen = flags.kind ? strlen(flags.kind) + 1 : 1 /* 0 length isn't C standard */;
char kind_buf[kind_alen];
if (flags.kind) {
memcpy(kind_buf, flags.kind, kind_alen);
flags.kind = kind_buf;
}
/* Now focus on the first argument. */
lua_settop(L, 1);
if (!net_listen_addrs(L, port, flags, nic_queue))
lua_error_p(L, "net.listen() failed to bind");
lua_pushboolean(L, true);
return 1;
}
/** Prints the specified `data` into the specified `dst` buffer. */
static char *proxy_data_to_string(int af, const struct net_proxy_data *data,
char *dst, size_t size)
{
kr_assert(size >= PROXY_DATA_STRLEN);
const void *in_addr = (af == AF_INET)
? (void *) &data->addr.ip4
: (void *) &data->addr.ip6;
char *cur = dst;
const char *ret = inet_ntop(af, in_addr, cur, size);
if (!ret)
return NULL;
cur += strlen(cur); /*< advance cursor to after the address */
*(cur++) = '/';
int masklen = snprintf(cur, 3 + 1, "%u", data->netmask);
cur[masklen] = '\0';
return dst;
}
/** Put all IP addresses from `trie` into the table at the top of the Lua stack.
* For each address, increment the integer at `i`. All addresses in `trie` must
* be from the specified `family`. */
static void net_proxy_addr_put(lua_State *L, int family, trie_t *trie, int *i)
{
char addrbuf[PROXY_DATA_STRLEN];
const char *addr;
trie_it_t *it;
for (it = trie_it_begin(trie); !trie_it_finished(it); trie_it_next(it)) {
lua_pushinteger(L, *i);
struct net_proxy_data *data = *trie_it_val(it);
addr = proxy_data_to_string(family, data,
addrbuf, sizeof(addrbuf));
lua_pushstring(L, addr);
lua_settable(L, -3);
*i += 1;
}
trie_it_free(it);
}
/** Allow PROXYv2 headers for IP address. */
static int net_proxy_allowed(lua_State *L)
{
int n = lua_gettop(L);
int i = 1;
const char *addr;
/* Return current state */
if (n == 0) {
lua_newtable(L);
i = 1;
if (the_network->proxy_all4) {
lua_pushinteger(L, i);
lua_pushstring(L, "0.0.0.0/0");
lua_settable(L, -3);
i += 1;
} else {
net_proxy_addr_put(L, AF_INET, the_network->proxy_addrs4, &i);
}
if (the_network->proxy_all6) {
lua_pushinteger(L, i);
lua_pushstring(L, "::/0");
lua_settable(L, -3);
i += 1;
} else {
net_proxy_addr_put(L, AF_INET6, the_network->proxy_addrs6, &i);
}
return 1;
}
if (n != 1)
lua_error_p(L, "net.proxy_allowed() takes one parameter (string or table)");
if (!lua_istable(L, 1) && !lua_isstring(L, 1))
lua_error_p(L, "net.proxy_allowed() argument must be string or table");
/* Reset allowed proxy addresses */
network_proxy_reset();
/* Add new proxy addresses */
if (lua_istable(L, 1)) {
for (i = 1; !lua_isnil(L, -1); i++) {
lua_pushinteger(L, i);
lua_gettable(L, 1);
if (lua_isnil(L, -1)) /* missing value - end iteration */
break;
if (!lua_isstring(L, -1))
lua_error_p(L, "net.proxy_allowed() argument may only contain strings");
addr = lua_tostring(L, -1);
int ret = network_proxy_allow(addr);
if (ret)
lua_error_p(L, "invalid argument");
}
} else if (lua_isstring(L, 1)) {
addr = lua_tostring(L, 1);
int ret = network_proxy_allow(addr);
if (ret)
lua_error_p(L, "invalid argument");
}
return 0;
}
/** Close endpoint. */
static int net_close(lua_State *L)
{
/* Check parameters */
const int n = lua_gettop(L);
bool ok = (n == 1 || n == 2) && lua_isstring(L, 1);
const char *addr = lua_tostring(L, 1);
int port;
if (ok && (n < 2 || lua_isnil(L, 2))) {
port = -1;
} else if (ok) {
ok = lua_isnumber(L, 2);
port = lua_tointeger(L, 2);
ok = ok && port >= 0 && port <= 65535;
}
if (!ok)
lua_error_p(L, "expected 'close(string addr, [number port])'");
int ret = network_close(addr, port);
lua_pushboolean(L, ret == 0);
return 1;
}
/** List available interfaces. */
static int net_interfaces(lua_State *L)
{
/* Retrieve interface list */
int count = 0;
char buf[INET6_ADDRSTRLEN]; /* https://tools.ietf.org/html/rfc4291 */
uv_interface_address_t *info = NULL;
uv_interface_addresses(&info, &count);
lua_newtable(L);
for (int i = 0; i < count; ++i) {
uv_interface_address_t iface = info[i];
lua_getfield(L, -1, iface.name);
if (lua_isnil(L, -1)) {
lua_pop(L, 1);
lua_newtable(L);
}
/* Address */
lua_getfield(L, -1, "addr");
if (lua_isnil(L, -1)) {
lua_pop(L, 1);
lua_newtable(L);
}
if (iface.address.address4.sin_family == AF_INET) {
uv_ip4_name(&iface.address.address4, buf, sizeof(buf));
} else if (iface.address.address4.sin_family == AF_INET6) {
uv_ip6_name(&iface.address.address6, buf, sizeof(buf));
} else {
buf[0] = '\0';
}
if (kr_sockaddr_link_local((struct sockaddr *) &iface.address)) {
/* Link-local IPv6: add %interface prefix */
auto_free char *str = NULL;
int ret = asprintf(&str, "%s%%%s", buf, iface.name);
kr_assert(ret > 0);
lua_pushstring(L, str);
} else {
lua_pushstring(L, buf);
}
lua_rawseti(L, -2, lua_objlen(L, -2) + 1);
lua_setfield(L, -2, "addr");
/* Hardware address. */
char *p = buf;
for (int k = 0; k < sizeof(iface.phys_addr); ++k) {
(void)sprintf(p, "%.2x:", (uint8_t)iface.phys_addr[k]);
p += 3;
}
p[-1] = '\0';
lua_pushstring(L, buf);
lua_setfield(L, -2, "mac");
/* Push table */
lua_setfield(L, -2, iface.name);
}
uv_free_interface_addresses(info, count);
return 1;
}
/** Set UDP maximum payload size. */
static int net_bufsize(lua_State *L)
{
const int argc = lua_gettop(L);
if (argc == 0) {
lua_pushinteger(L, knot_edns_get_payload(the_resolver->downstream_opt_rr));
lua_pushinteger(L, knot_edns_get_payload(the_resolver->upstream_opt_rr));
return 2;
}
if (argc == 1) {
int bufsize = lua_tointeger(L, 1);
if (bufsize < 512 || bufsize > UINT16_MAX)
lua_error_p(L, "bufsize must be within <512, " STR(UINT16_MAX) ">");
knot_edns_set_payload(the_resolver->downstream_opt_rr, (uint16_t)bufsize);
knot_edns_set_payload(the_resolver->upstream_opt_rr, (uint16_t)bufsize);
} else if (argc == 2) {
int bufsize_downstream = lua_tointeger(L, 1);
int bufsize_upstream = lua_tointeger(L, 2);
if (bufsize_downstream < 512 || bufsize_upstream < 512
|| bufsize_downstream > UINT16_MAX || bufsize_upstream > UINT16_MAX) {
lua_error_p(L, "bufsize must be within <512, " STR(UINT16_MAX) ">");
}
knot_edns_set_payload(the_resolver->downstream_opt_rr, (uint16_t)bufsize_downstream);
knot_edns_set_payload(the_resolver->upstream_opt_rr, (uint16_t)bufsize_upstream);
}
return 0;
}
/** Set TCP pipelining size. */
static int net_pipeline(lua_State *L)
{
if (!the_worker) {
return 0;
}
if (!lua_isnumber(L, 1)) {
lua_pushinteger(L, the_worker->tcp_pipeline_max);
return 1;
}
int len = lua_tointeger(L, 1);
if (len < 0 || len > UINT16_MAX)
lua_error_p(L, "tcp_pipeline must be within <0, " STR(UINT16_MAX) ">");
the_worker->tcp_pipeline_max = len;
lua_pushinteger(L, len);
return 1;
}
static int net_tls(lua_State *L)
{
if (kr_fails_assert(the_network)) {
return 0;
}
/* Only return current credentials. */
if (lua_gettop(L) == 0) {
/* No credentials configured yet. */
if (!the_network->tls_credentials) {
return 0;
}
lua_newtable(L);
lua_pushstring(L, the_network->tls_credentials->tls_cert);
lua_setfield(L, -2, "cert_file");
lua_pushstring(L, the_network->tls_credentials->tls_key);
lua_setfield(L, -2, "key_file");
return 1;
}
if ((lua_gettop(L) != 2) || !lua_isstring(L, 1) || !lua_isstring(L, 2))
lua_error_p(L, "net.tls takes two parameters: (\"cert_file\", \"key_file\")");
int r = tls_certificate_set(lua_tostring(L, 1), lua_tostring(L, 2));
lua_error_maybe(L, r);
lua_pushboolean(L, true);
return 1;
}
/** Configure HTTP headers for DoH requests. */
static int net_doh_headers(lua_State *L)
{
doh_headerlist_t *headers = &the_worker->doh_qry_headers;
int i;
const char *name;
/* Only return current configuration. */
if (lua_gettop(L) == 0) {
lua_newtable(L);
for (i = 0; i < headers->len; i++) {
lua_pushinteger(L, i + 1);
name = headers->at[i];
lua_pushlstring(L, name, strlen(name));
lua_settable(L, -3);
}
return 1;
}
if (lua_gettop(L) != 1)
lua_error_p(L, "net.doh_headers() takes one parameter (string or table)");
if (!lua_istable(L, 1) && !lua_isstring(L, 1))
lua_error_p(L, "net.doh_headers() argument must be string or table");
/* Clear existing headers. */
for (i = 0; i < headers->len; i++)
free((void *)headers->at[i]);
array_clear(*headers);
if (lua_istable(L, 1)) {
for (i = 1; !lua_isnil(L, -1); i++) {
lua_pushinteger(L, i);
lua_gettable(L, 1);
if (lua_isnil(L, -1)) /* missing value - end iteration */
break;
if (!lua_isstring(L, -1))
lua_error_p(L, "net.doh_headers() argument table can only contain strings");
name = lua_tostring(L, -1);
array_push(*headers, strdup(name));
}
} else if (lua_isstring(L, 1)) {
name = lua_tostring(L, 1);
array_push(*headers, strdup(name));
}
return 0;
}
/** Return a lua table with TLS authentication parameters.
* The format is the same as passed to policy.TLS_FORWARD();
* more precisely, it's in a compatible canonical form. */
static int tls_params2lua(lua_State *L, trie_t *params)
{
lua_newtable(L);
if (!params) /* Allowed special case. */
return 1;
trie_it_t *it;
size_t list_index = 0;
for (it = trie_it_begin(params); !trie_it_finished(it); trie_it_next(it)) {
/* Prepare table for the current address
* and its index in the returned list. */
lua_pushinteger(L, ++list_index);
lua_createtable(L, 0, 2);
/* Get the "addr#port" string... */
size_t ia_len;
const char *key = trie_it_key(it, &ia_len);
int af = AF_UNSPEC;
if (ia_len == 2 + sizeof(struct in_addr)) {
af = AF_INET;
} else if (ia_len == 2 + sizeof(struct in6_addr)) {
af = AF_INET6;
}
if (kr_fails_assert(key && af != AF_UNSPEC))
lua_error_p(L, "internal error: bad IP address");
uint16_t port;
memcpy(&port, key, sizeof(port));
port = ntohs(port);
const char *ia = key + sizeof(port);
char str[INET6_ADDRSTRLEN + 1 + 5 + 1];
size_t len = sizeof(str);
if (kr_fails_assert(kr_ntop_str(af, ia, port, str, &len) == kr_ok()))
lua_error_p(L, "internal error: bad IP address conversion");
/* ...and push it as [1]. */
lua_pushinteger(L, 1);
lua_pushlstring(L, str, len - 1 /* len includes '\0' */);
lua_settable(L, -3);
const tls_client_param_t *e = *trie_it_val(it);
if (kr_fails_assert(e))
lua_error_p(L, "internal problem - NULL entry for %s", str);
/* .hostname = */
if (e->hostname) {
lua_pushstring(L, e->hostname);
lua_setfield(L, -2, "hostname");
}
/* .ca_files = */
if (e->ca_files.len) {
lua_createtable(L, e->ca_files.len, 0);
for (size_t i = 0; i < e->ca_files.len; ++i) {
lua_pushinteger(L, i + 1);
lua_pushstring(L, e->ca_files.at[i]);
lua_settable(L, -3);
}
lua_setfield(L, -2, "ca_files");
}
/* .pin_sha256 = ... ; keep sane indentation via goto. */
if (!e->pins.len) goto no_pins;
lua_createtable(L, e->pins.len, 0);
for (size_t i = 0; i < e->pins.len; ++i) {
uint8_t pin_base64[TLS_SHA256_BASE64_BUFLEN];
int err = kr_base64_encode(e->pins.at[i], TLS_SHA256_RAW_LEN,
pin_base64, sizeof(pin_base64));
if (kr_fails_assert(err >= 0))
lua_error_p(L,
"internal problem when converting pin_sha256: %s",
kr_strerror(err));
lua_pushinteger(L, i + 1);
lua_pushlstring(L, (const char *)pin_base64, err);
/* pin_base64 isn't 0-terminated ^^^ */
lua_settable(L, -3);
}
lua_setfield(L, -2, "pin_sha256");
no_pins:/* .insecure = */
if (e->insecure) {
lua_pushboolean(L, true);
lua_setfield(L, -2, "insecure");
}
/* Now the whole table is pushed atop the returned list. */
lua_settable(L, -3);
}
trie_it_free(it);
return 1;
}
static inline int cmp_sha256(const void *p1, const void *p2)
{
return memcmp(*(char * const *)p1, *(char * const *)p2, TLS_SHA256_RAW_LEN);
}
static int net_tls_client(lua_State *L)
{
/* TODO idea: allow starting the lua table with *multiple* IP targets,
* meaning the authentication config should be applied to each.
*/
if (lua_gettop(L) == 0)
return tls_params2lua(L, the_network->tls_client_params);
/* Various basic sanity-checking. */
if (lua_gettop(L) != 1 || !lua_istable(L, 1))
lua_error_maybe(L, EINVAL);
/* check that only allowed keys are present */
{
const char *bad_key = lua_table_checkindices(L, (const char *[])
{ "1", "hostname", "ca_file", "pin_sha256", "insecure", "tls", NULL });
if (bad_key)
lua_error_p(L, "found unexpected key '%s'", bad_key);
}
/**** Phase 1: get the parameter into a C struct, incl. parse of CA files,
* regardless of the address-pair having an entry already. */
tls_client_param_t *newcfg = tls_client_param_new();
if (!newcfg)
lua_error_p(L, "out of memory or something like that :-/");
/* Shortcut for cleanup actions needed from now on. */
#define ERROR(...) do { \
free(newcfg); \
lua_error_p(L, __VA_ARGS__); \
} while (false)
/* .hostname - always accepted. */
lua_getfield(L, 1, "hostname");
if (!lua_isnil(L, -1)) {
const char *hn_str = lua_tostring(L, -1);
/* Convert to lower-case dname and back, for checking etc. */
knot_dname_t dname[KNOT_DNAME_MAXLEN];
if (!hn_str || !knot_dname_from_str(dname, hn_str, sizeof(dname)))
ERROR("invalid hostname");
knot_dname_to_lower(dname);
char *h = knot_dname_to_str_alloc(dname);
if (!h)
ERROR("%s", kr_strerror(ENOMEM));
/* Strip the final dot produced by knot_dname_*() */
h[strlen(h) - 1] = '\0';
newcfg->hostname = h;
}
lua_pop(L, 1);
/* .ca_file - it can be a list of paths, contrary to the name. */
bool has_ca_file = false;
lua_getfield(L, 1, "ca_file");
if (!lua_isnil(L, -1)) {
if (!newcfg->hostname)
ERROR("missing hostname but specifying ca_file");
lua_listify(L);
array_init(newcfg->ca_files); /*< placate apparently confused scan-build */
if (array_reserve(newcfg->ca_files, lua_objlen(L, -1)) != 0) /*< optim. */
ERROR("%s", kr_strerror(ENOMEM));
/* Iterate over table at the top of the stack.
* http://www.lua.org/manual/5.1/manual.html#lua_next */
for (lua_pushnil(L); lua_next(L, -2); lua_pop(L, 1)) {
has_ca_file = true; /* deferred here so that {} -> false */
const char *ca_file = lua_tostring(L, -1);
if (!ca_file)
ERROR("ca_file contains a non-string");
/* Let gnutls process it immediately, so garbage gets detected. */
int ret = gnutls_certificate_set_x509_trust_file(
newcfg->credentials, ca_file, GNUTLS_X509_FMT_PEM);
if (ret < 0) {
ERROR("failed to import certificate file '%s': %s - %s\n",
ca_file, gnutls_strerror_name(ret),
gnutls_strerror(ret));
} else {
kr_log_debug(TLSCLIENT, "imported %d certs from file '%s'\n",
ret, ca_file);
}
ca_file = strdup(ca_file);
if (!ca_file || array_push(newcfg->ca_files, ca_file) < 0)
ERROR("%s", kr_strerror(ENOMEM));
}
/* Sort the strings for easier comparison later. */
if (newcfg->ca_files.len) {
qsort(&newcfg->ca_files.at[0], newcfg->ca_files.len,
array_member_size(newcfg->ca_files), strcmp_p);
}
}
lua_pop(L, 1);
/* .pin_sha256 */
lua_getfield(L, 1, "pin_sha256");
if (!lua_isnil(L, -1)) {
if (has_ca_file)
ERROR("mixing pin_sha256 with ca_file is not supported");
lua_listify(L);
array_init(newcfg->pins); /*< placate apparently confused scan-build */
if (array_reserve(newcfg->pins, lua_objlen(L, -1)) != 0) /*< optim. */
ERROR("%s", kr_strerror(ENOMEM));
/* Iterate over table at the top of the stack. */
for (lua_pushnil(L); lua_next(L, -2); lua_pop(L, 1)) {
const char *pin = lua_tostring(L, -1);
if (!pin)
ERROR("pin_sha256 is not a string");
uint8_t *pin_raw = malloc(TLS_SHA256_RAW_LEN);
/* Push the string early to simplify error processing. */
if (kr_fails_assert(pin_raw && array_push(newcfg->pins, pin_raw) >= 0)) {
free(pin_raw);
ERROR("%s", kr_strerror(ENOMEM));
}
int ret = kr_base64_decode((const uint8_t *)pin, strlen(pin),
pin_raw, TLS_SHA256_RAW_LEN + 8);
if (ret < 0) {
ERROR("not a valid pin_sha256: '%s' (length %d), %s\n",
pin, (int)strlen(pin), knot_strerror(ret));
} else if (ret != TLS_SHA256_RAW_LEN) {
ERROR("not a valid pin_sha256: '%s', "
"raw length %d instead of "
STR(TLS_SHA256_RAW_LEN)"\n",
pin, ret);
}
}
/* Sort the raw strings for easier comparison later. */
if (newcfg->pins.len) {
qsort(&newcfg->pins.at[0], newcfg->pins.len,
array_member_size(newcfg->pins), cmp_sha256);
}
}
lua_pop(L, 1);
/* .insecure */
lua_getfield(L, 1, "insecure");
if (lua_isnil(L, -1)) {
if (!newcfg->hostname && !newcfg->pins.len)
ERROR("no way to authenticate and not set as insecure");
} else if (lua_isboolean(L, -1) && lua_toboolean(L, -1)) {
newcfg->insecure = true;
if (has_ca_file || newcfg->pins.len)
ERROR("set as insecure but provided authentication config");
} else {
ERROR("incorrect value in the 'insecure' field");
}
lua_pop(L, 1);
/* Init CAs from system trust store, if needed. */
if (!newcfg->insecure && !newcfg->pins.len && !has_ca_file) {
int ret = gnutls_certificate_set_x509_system_trust(newcfg->credentials);
if (ret <= 0) {
ERROR("failed to use system CA certificate store: %s",
ret ? gnutls_strerror(ret) : kr_strerror(ENOENT));
} else {
kr_log_debug(TLSCLIENT, "imported %d certs from system store\n",
ret);
}
}
#undef ERROR
/**** Phase 2: deal with the C authentication "table". */
/* Parse address and port. */
lua_pushinteger(L, 1);
lua_gettable(L, 1);
const char *addr_str = lua_tostring(L, -1);
if (!addr_str)
lua_error_p(L, "address is not a string");
char buf[INET6_ADDRSTRLEN + 1];
uint16_t port = 853;
const struct sockaddr *addr = NULL;
if (kr_straddr_split(addr_str, buf, &port) == kr_ok())
addr = kr_straddr_socket(buf, port, NULL);
/* Add newcfg into the C map, saving the original into oldcfg. */
if (!addr)
lua_error_p(L, "address '%s' could not be converted", addr_str);
tls_client_param_t **oldcfgp = tls_client_param_getptr(
&the_network->tls_client_params, addr, true);
free_const(addr);
if (!oldcfgp)
lua_error_p(L, "internal error when extending tls_client_params map");
tls_client_param_t *oldcfg = *oldcfgp;
*oldcfgp = newcfg; /* replace old config in trie with the new one */
/* If there was no original entry, it's easy! */
if (!oldcfg)
return 0;
/* Check for equality (newcfg vs. oldcfg), and print a warning if not equal.*/
const bool ok_h = (!newcfg->hostname && !oldcfg->hostname)
|| (newcfg->hostname && oldcfg->hostname && strcmp(newcfg->hostname, oldcfg->hostname) == 0);
bool ok_ca = newcfg->ca_files.len == oldcfg->ca_files.len;
for (int i = 0; ok_ca && i < newcfg->ca_files.len; ++i)
ok_ca = strcmp(newcfg->ca_files.at[i], oldcfg->ca_files.at[i]) == 0;
bool ok_pins = newcfg->pins.len == oldcfg->pins.len;
for (int i = 0; ok_pins && i < newcfg->pins.len; ++i)
ok_pins = memcmp(newcfg->pins.at[i], oldcfg->pins.at[i], TLS_SHA256_RAW_LEN) == 0;
const bool ok_insecure = newcfg->insecure == oldcfg->insecure;
if (!(ok_h && ok_ca && ok_pins && ok_insecure)) {
kr_log_warning(TLSCLIENT,
"warning: re-defining TLS authentication parameters for %s\n",
addr_str);
}
tls_client_param_unref(oldcfg);
return 0;
}
int net_tls_client_clear(lua_State *L)
{
/* One parameter: address -> convert it to a struct sockaddr. */
if (lua_gettop(L) != 1 || !lua_isstring(L, 1))
lua_error_p(L, "net.tls_client_clear() requires one parameter (\"address\")");
const char *addr_str = lua_tostring(L, 1);
char buf[INET6_ADDRSTRLEN + 1];
uint16_t port = 853;
const struct sockaddr *addr = NULL;
if (kr_straddr_split(addr_str, buf, &port) == kr_ok())
addr = kr_straddr_socket(buf, port, NULL);
if (!addr)
lua_error_p(L, "invalid IP address");
/* Do the actual removal. */
int r = tls_client_param_remove(the_network->tls_client_params, addr);
free_const(addr);
lua_error_maybe(L, r);
lua_pushboolean(L, true);
return 1;
}
static int net_tls_padding(lua_State *L)
{
/* Only return current padding. */
if (lua_gettop(L) == 0) {
if (the_resolver->tls_padding < 0) {
lua_pushboolean(L, true);
return 1;
} else if (the_resolver->tls_padding == 0) {
lua_pushboolean(L, false);
return 1;
}
lua_pushinteger(L, the_resolver->tls_padding);
return 1;
}
const char *errstr = "net.tls_padding parameter has to be true, false,"
" or a number between <0, " STR(MAX_TLS_PADDING) ">";
if (lua_gettop(L) != 1)
lua_error_p(L, "%s", errstr);
if (lua_isboolean(L, 1)) {
bool x = lua_toboolean(L, 1);
if (x) {
the_resolver->tls_padding = -1;
} else {
the_resolver->tls_padding = 0;
}
} else if (lua_isnumber(L, 1)) {
int padding = lua_tointeger(L, 1);
if ((padding < 0) || (padding > MAX_TLS_PADDING))
lua_error_p(L, "%s", errstr);
the_resolver->tls_padding = padding;
} else {
lua_error_p(L, "%s", errstr);
}
lua_pushboolean(L, true);
return 1;
}
/** Shorter salt can't contain much entropy. */
#define net_tls_sticket_MIN_SECRET_LEN 32
static int net_tls_sticket_secret_string(lua_State *L)
{
size_t secret_len;
const char *secret;
if (lua_gettop(L) == 0) {
/* Zero-length secret, implying random key. */
secret_len = 0;
secret = NULL;
} else {
if (lua_gettop(L) != 1 || !lua_isstring(L, 1)) {
lua_error_p(L,
"net.tls_sticket_secret takes one parameter: (\"secret string\")");
}
secret = lua_tolstring(L, 1, &secret_len);
if (secret_len < net_tls_sticket_MIN_SECRET_LEN || !secret) {
lua_error_p(L, "net.tls_sticket_secret - the secret is shorter than "
STR(net_tls_sticket_MIN_SECRET_LEN) " bytes");
}
}
tls_session_ticket_ctx_destroy(the_network->tls_session_ticket_ctx);
the_network->tls_session_ticket_ctx =
tls_session_ticket_ctx_create(the_network->loop, secret, secret_len);
if (the_network->tls_session_ticket_ctx == NULL) {
lua_error_p(L,
"net.tls_sticket_secret_string - can't create session ticket context");
}
lua_pushboolean(L, true);
return 1;
}
static int net_tls_sticket_secret_file(lua_State *L)
{
if (lua_gettop(L) != 1 || !lua_isstring(L, 1)) {
lua_error_p(L,
"net.tls_sticket_secret_file takes one parameter: (\"file name\")");
}
const char *file_name = lua_tostring(L, 1);
if (strlen(file_name) == 0)
lua_error_p(L, "net.tls_sticket_secret_file - empty file name");
FILE *fp = fopen(file_name, "r");
if (fp == NULL) {
lua_error_p(L, "net.tls_sticket_secret_file - can't open file '%s': %s",
file_name, strerror(errno));
}
char secret_buf[TLS_SESSION_TICKET_SECRET_MAX_LEN];
const size_t secret_len = fread(secret_buf, 1, sizeof(secret_buf), fp);
int err = ferror(fp);
if (err) {
lua_error_p(L,
"net.tls_sticket_secret_file - error reading from file '%s': %s",
file_name, strerror(err));
}
if (secret_len < net_tls_sticket_MIN_SECRET_LEN) {
lua_error_p(L,
"net.tls_sticket_secret_file - file '%s' is shorter than "
STR(net_tls_sticket_MIN_SECRET_LEN) " bytes",
file_name);
}
if (fclose(fp) == EOF) {
lua_error_p(L,
"net.tls_sticket_secret_file - reading of file '%s' failed",
file_name);
}
tls_session_ticket_ctx_destroy(the_network->tls_session_ticket_ctx);
the_network->tls_session_ticket_ctx =
tls_session_ticket_ctx_create(the_network->loop, secret_buf, secret_len);
if (the_network->tls_session_ticket_ctx == NULL) {
lua_error_p(L,
"net.tls_sticket_secret_file - can't create session ticket context");
}
lua_pushboolean(L, true);
return 1;
}
static int net_outgoing(lua_State *L, int family)
{
union kr_sockaddr *addr;
if (family == AF_INET)
addr = (union kr_sockaddr*)&the_worker->out_addr4;
else
addr = (union kr_sockaddr*)&the_worker->out_addr6;
if (lua_gettop(L) == 0) { /* Return the current value. */
if (addr->ip.sa_family == AF_UNSPEC) {
lua_pushnil(L);
return 1;
}
if (kr_fails_assert(addr->ip.sa_family == family))
lua_error_p(L, "bad address family");
char addr_buf[INET6_ADDRSTRLEN];
int err;
if (family == AF_INET)
err = uv_ip4_name(&addr->ip4, addr_buf, sizeof(addr_buf));
else
err = uv_ip6_name(&addr->ip6, addr_buf, sizeof(addr_buf));
lua_error_maybe(L, err);
lua_pushstring(L, addr_buf);
return 1;
}
if ((lua_gettop(L) != 1) || (!lua_isstring(L, 1) && !lua_isnil(L, 1)))
lua_error_p(L, "net.outgoing_vX takes one address string parameter or nil");
if (lua_isnil(L, 1)) {
addr->ip.sa_family = AF_UNSPEC;
return 1;
}
const char *addr_str = lua_tostring(L, 1);
int err;
if (family == AF_INET)
err = uv_ip4_addr(addr_str, 0, &addr->ip4);
else
err = uv_ip6_addr(addr_str, 0, &addr->ip6);
if (err)
lua_error_p(L, "net.outgoing_vX: failed to parse the address");
lua_pushboolean(L, true);
return 1;
}
static int net_outgoing_v4(lua_State *L) { return net_outgoing(L, AF_INET); }
static int net_outgoing_v6(lua_State *L) { return net_outgoing(L, AF_INET6); }
static int net_update_timeout(lua_State *L, uint64_t *timeout, const char *name)
{
/* Only return current idle timeout. */
if (lua_gettop(L) == 0) {
lua_pushinteger(L, *timeout);
return 1;
}
if ((lua_gettop(L) != 1))
lua_error_p(L, "%s takes one parameter: (\"idle timeout\")", name);
if (lua_isnumber(L, 1)) {
int idle_timeout = lua_tointeger(L, 1);
if (idle_timeout <= 0)
lua_error_p(L, "%s parameter has to be positive number", name);
*timeout = idle_timeout;
} else {
lua_error_p(L, "%s parameter has to be positive number", name);
}
lua_pushboolean(L, true);
return 1;
}
static int net_tcp_in_idle(lua_State *L)
{
return net_update_timeout(L, &the_network->tcp.in_idle_timeout, "net.tcp_in_idle");
}
static int net_tls_handshake_timeout(lua_State *L)
{
return net_update_timeout(L, &the_network->tcp.tls_handshake_timeout, "net.tls_handshake_timeout");
}
static int net_bpf_set(lua_State *L)
{
if (lua_gettop(L) != 1 || !lua_isnumber(L, 1)) {
lua_error_p(L, "net.bpf_set(fd) takes one parameter:"
" the open file descriptor of a loaded BPF program");
}
#if __linux__
int progfd = lua_tointeger(L, 1);
if (progfd == 0) {
/* conversion error despite the fact
* that lua_isnumber(L, 1) has returned true.
* Real or stdin? */
lua_error_p(L, "failed to convert parameter");
}
lua_pop(L, 1);
if (network_set_bpf(progfd) == 0) {
lua_error_p(L, "failed to attach BPF program to some networks: %s",
kr_strerror(errno));
}
lua_pushboolean(L, 1);
return 1;
#endif
lua_error_p(L, "BPF is not supported on this operating system");
}
static int net_bpf_clear(lua_State *L)
{
if (lua_gettop(L) != 0)
lua_error_p(L, "net.bpf_clear() does not take any parameters");
#if __linux__
network_clear_bpf();
lua_pushboolean(L, 1);
return 1;
#endif
lua_error_p(L, "BPF is not supported on this operating system");
}
static int net_register_endpoint_kind(lua_State *L)
{
const int param_count = lua_gettop(L);
if (param_count != 1 && param_count != 2)
lua_error_p(L, "expected one or two parameters");
if (!lua_isstring(L, 1)) {
lua_error_p(L, "incorrect kind '%s'", lua_tostring(L, 1));
}
size_t kind_len;
const char *kind = lua_tolstring(L, 1, &kind_len);
/* Unregistering */
if (param_count == 1) {
void *val;
if (trie_del(the_network->endpoint_kinds, kind, kind_len, &val) == KNOT_EOK) {
const int fun_id = (intptr_t)val;
luaL_unref(L, LUA_REGISTRYINDEX, fun_id);
return 0;
}
lua_error_p(L, "attempt to unregister unknown kind '%s'\n", kind);
} /* else -> param_count == 2 */
/* Registering */
if (!lua_isfunction(L, 2)) {
lua_error_p(L, "second parameter: expected function but got %s\n",
lua_typename(L, lua_type(L, 2)));
}
const int fun_id = luaL_ref(L, LUA_REGISTRYINDEX);
/* ^^ The function is on top of the stack, incidentally. */
void **pp = trie_get_ins(the_network->endpoint_kinds, kind, kind_len);
if (!pp) lua_error_maybe(L, kr_error(ENOMEM));
if (*pp != NULL || !strcasecmp(kind, "dns") || !strcasecmp(kind, "tls"))
lua_error_p(L, "attempt to register known kind '%s'\n", kind);
*pp = (void *)(intptr_t)fun_id;
/* We don't attempt to engage corresponding endpoints now.
* That's the job for network_engage_endpoints() later. */
return 0;
}
int kr_bindings_net(lua_State *L)
{
static const luaL_Reg lib[] = {
{ "list", net_list },
{ "listen", net_listen },
{ "proxy_allowed", net_proxy_allowed },
{ "close", net_close },
{ "interfaces", net_interfaces },
{ "bufsize", net_bufsize },
{ "tcp_pipeline", net_pipeline },
{ "tls", net_tls },
{ "tls_server", net_tls },
{ "tls_client", net_tls_client },
{ "tls_client_clear", net_tls_client_clear },
{ "tls_padding", net_tls_padding },
{ "tls_sticket_secret", net_tls_sticket_secret_string },
{ "tls_sticket_secret_file", net_tls_sticket_secret_file },
{ "outgoing_v4", net_outgoing_v4 },
{ "outgoing_v6", net_outgoing_v6 },
{ "tcp_in_idle", net_tcp_in_idle },
{ "tls_handshake_timeout", net_tls_handshake_timeout },
{ "bpf_set", net_bpf_set },
{ "bpf_clear", net_bpf_clear },
{ "register_endpoint_kind", net_register_endpoint_kind },
{ "doh_headers", net_doh_headers },
{ NULL, NULL }
};
luaL_register(L, "net", lib);
return 1;
}
.. SPDX-License-Identifier: GPL-3.0-or-later
Buffering tweaks
----------------
Various server-side socket options that affect buffering can be set.
The values are stored in C structures without dedicated Lua bindings,
so setting them goes through the LuaJIT FFI and is a bit verbose (see the sketch below).
.. py:data:: (require 'ffi').C.the_worker.engine.net.tcp.user_timeout
On TCP-based server-side sockets we set the ``TCP_USER_TIMEOUT`` option if available (i.e. on Linux).
The default is 1000, i.e. one second. For details see the definition in ``man 7 tcp``.
.. py:data:: (require 'ffi').C.the_worker.engine.net.listen_tcp_buflens.snd
.. py:data:: (require 'ffi').C.the_worker.engine.net.listen_tcp_buflens.rcv
.. py:data:: (require 'ffi').C.the_worker.engine.net.listen_udp_buflens.snd
.. py:data:: (require 'ffi').C.the_worker.engine.net.listen_udp_buflens.rcv
If overridden to a nonzero value, these variables instruct the OS to resize the kernel-space buffers
for server-side sockets. The setting is split by UDP vs. TCP and sending vs. receiving.
For details see ``SO_SNDBUF`` and ``SO_RCVBUF`` in ``man 7 socket``.
There is no user-space buffering beyond immediate manipulation; only the OS keeps some.
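A minimal sketch of overriding these values from the Lua configuration, following the FFI paths documented above (the concrete values are only illustrative):
.. code-block:: lua
local ffi = require('ffi')
-- raise TCP_USER_TIMEOUT to two seconds (the value is in milliseconds)
ffi.C.the_worker.engine.net.tcp.user_timeout = 2000
-- ask the OS for roughly 4 MiB kernel receive buffers on UDP listening sockets
ffi.C.the_worker.engine.net.listen_udp_buflens.rcv = 4 * 1024 * 1024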
.. SPDX-License-Identifier: GPL-3.0-or-later
IPv4 and IPv6 usage
-------------------
The following settings affect the client part of the resolver,
i.e. communication between the resolver itself and other DNS servers.
Both IPv4 and IPv6 are used by default. For performance reasons it is
recommended to explicitly disable protocols that are not available
on your system, although the impact of an IPv6 outage has been reduced since release 5.3.0.
.. envvar:: net.ipv4 = true|false
:return: boolean (default: true)
Enable/disable using IPv4 for contacting upstream nameservers.
.. envvar:: net.ipv6 = true|false
:return: boolean (default: true)
Enable/disable using IPv6 for contacting upstream nameservers.
.. function:: net.outgoing_v4([string address])
Get/set the IPv4 address used to perform queries.
The default is ``nil``, which lets the OS choose any address.
.. function:: net.outgoing_v6([string address])
Get/set the IPv6 address used to perform queries.
The default is ``nil``, which lets the OS choose any address.
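For example, a sketch that disables IPv6 for upstream queries and pins the IPv4 source address (the address is illustrative):
.. code-block:: lua
net.ipv6 = false              -- contact upstream nameservers over IPv4 only
net.outgoing_v4('192.0.2.1')  -- send all IPv4 queries from this address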
.. SPDX-License-Identifier: GPL-3.0-or-later
DNS protocol tweaks
-------------------
The following settings change low-level details of the DNS protocol implementation.
The default values should not be changed except in very special cases.
.. function:: net.bufsize([udp_downstream_bufsize][, udp_upstream_bufsize])
Get/set the maximum EDNS payload size advertised in DNS packets. Different values can be configured for communication downstream (towards clients) and upstream (towards other DNS servers). Both the set and get operations use the values in this order.
The default is 1232 bytes, chosen to minimize the risk of `issues caused by IP fragmentation <https://blog.apnic.net/2019/07/12/its-time-to-consider-avoiding-ip-fragmentation-in-the-dns/>`_. Further details can be found on the `DNS Flag Day 2020 <https://www.dnsflagday.net/2020/>`_ web site.
The minimal value allowed by :rfc:`6891` is 512 bytes, which equals the maximum DNS packet size without Extension Mechanisms for DNS. A value of 1220 bytes is the minimum required by the DNSSEC standard :rfc:`4035`.
Example output:
.. code-block:: lua
-- set downstream and upstream bufsize to value 4096
> net.bufsize(4096)
-- get configured downstream and upstream bufsizes, respectively
> net.bufsize()
4096 -- result # 1
4096 -- result # 2
-- set downstream bufsize to 4096 and upstream bufsize to 1232
> net.bufsize(4096, 1232)
-- get configured downstream and upstream bufsizes, respectively
> net.bufsize()
4096 -- result # 1
1232 -- result # 2
.. include:: ../../modules/workarounds/README.rst
.. SPDX-License-Identifier: GPL-3.0-or-later
Addresses and services
----------------------
Addresses, ports, protocols, and API calls available to clients communicating
with the resolver are configured using :func:`net.listen`.
First you need to decide which service should be available on a given IP address
and port combination.
.. csv-table::
:header: "Protocol/service", "net.listen *kind*"
"DNS (unencrypted UDP+TCP, :rfc:`1034`)","``dns``"
"DNS (unencrypted UDP, :ref:`using XDP Linux API <dns-over-xdp>`)","``xdp``"
":ref:`dns-over-tls`","``tls``"
":ref:`dns-over-https`","``doh2``"
":ref:`Web management <mod-http-built-in-services>`","``webmgmt``"
":ref:`Control socket <control-sockets>`","``control``"
":ref:`mod-http-doh`","``doh_legacy``"
.. note:: By default, **unencrypted DNS and DNS-over-TLS** are configured to **listen
on localhost**.
Control sockets are created either in
``/run/knot-resolver/control/`` (when using systemd) or ``$PWD/control/``.
.. function:: net.listen(addresses, [port = 53, { kind = 'dns', freebind = false }])
:return: ``true`` if port is bound, an error otherwise
Listen on the given addresses; port and flags are optional.
The addresses can be specified as a string or a device.
Port 853 implies ``kind = 'tls'``, but it is always better to be explicit.
``freebind`` allows binding to a non-local or not-yet-available address.
.. csv-table::
:header: "**Network protocol**", "**Configuration command**"
"DNS (UDP+TCP, :rfc:`1034`)","``net.listen('192.0.2.123', 53)``"
"DNS (UDP, :ref:`using XDP <dns-over-xdp>`)","``net.listen('192.0.2.123', 53, { kind = 'xdp' })``"
":ref:`dns-over-tls`","``net.listen('192.0.2.123', 853, { kind = 'tls' })``"
":ref:`dns-over-https`","``net.listen('192.0.2.123', 443, { kind = 'doh2' })``"
":ref:`Web management <mod-http-built-in-services>`","``net.listen('192.0.2.123', 8453, { kind = 'webmgmt' })``"
":ref:`Control socket <control-sockets>`","``net.listen('/tmp/kres.control', nil, { kind = 'control' })``"
Examples:
.. code-block:: lua
net.listen('::1')
net.listen(net.lo, 53)
net.listen(net.eth0, 853, { kind = 'tls' })
net.listen('192.0.2.1', 53, { freebind = true })
net.listen({'127.0.0.1', '::1'}, 53, { kind = 'dns' })
net.listen('::', 443, { kind = 'doh2' })
net.listen('::', 8453, { kind = 'webmgmt' }) -- see http module
net.listen('/tmp/kresd-socket', nil, { kind = 'webmgmt' }) -- http module supports AF_UNIX
net.listen('eth0', 53, { kind = 'xdp' })
net.listen('192.0.2.123', 53, { kind = 'xdp', nic_queue = 0 })
.. warning:: On machines with multiple IP addresses avoid listening on wildcards
``0.0.0.0`` or ``::``. Knot Resolver could answer from different IP
addresses if the network address ranges overlap,
and clients would probably refuse such a response.
.. _proxyv2:
PROXYv2 protocol
^^^^^^^^^^^^^^^^
Knot Resolver supports proxies that utilize the `PROXYv2 protocol <https://www.haproxy.org/download/2.5/doc/proxy-protocol.txt>`_
to identify clients.
A PROXY header contains the IP address of the original client who sent a query.
This allows the resolver to treat queries as if they actually came from
the client's IP address rather than the address of the proxy they came through.
For example, :ref:`Views and ACLs <mod-view>` are able to work properly when
PROXYv2 is in use.
Allowing all clients to use the PROXYv2 protocol would be a security
vulnerability, because clients could then spoof their IP addresses via
the PROXYv2 header. The resolver therefore requires you to specify explicitly which clients
are allowed to send PROXYv2 headers, using the :func:`net.proxy_allowed` function.
PROXYv2 queries from clients that are not explicitly allowed to use this protocol
are discarded.
.. function:: net.proxy_allowed([addresses])
Allow usage of the PROXYv2 protocol headers by clients on the specified
``addresses``. It is possible to permit whole networks to send PROXYv2 headers
by specifying the network mask using the CIDR notation
(e.g. ``172.22.0.0/16``). IPv4 as well as IPv6 addresses are supported.
If you wish to allow all clients to use PROXYv2 (e.g. because you have this
kind of security handled on another layer of your network infrastructure),
you can specify a netmask of ``/0``. Please note that this setting is
address-family-specific, so this needs to be applied to both IPv4 and IPv6
separately.
Subsequent calls to the function overwrite the effects of all previous calls.
Providing a table of strings as the function parameter allows multiple
distinct addresses to use the PROXYv2 protocol.
When called without arguments, ``net.proxy_allowed`` returns a table of all
addresses currently allowed to use the PROXYv2 protocol and does not change
the configuration.
Examples:
.. code-block:: lua
net.proxy_allowed('172.22.0.1') -- allows '172.22.0.1' specifically
net.proxy_allowed('172.18.1.0/24') -- allows everyone at '172.18.1.*'
net.proxy_allowed({
'172.22.0.1', '172.18.1.0/24'
}) -- allows both of the above at once
net.proxy_allowed({ 'fe80::/10' }) -- allows everyone at IPv6 link-local
net.proxy_allowed({
'::/0', '0.0.0.0/0'
}) -- allows everyone
net.proxy_allowed('::/0') -- allows all IPv6 (but no IPv4)
net.proxy_allowed({}) -- prevents everyone from using PROXYv2
net.proxy_allowed() -- returns a list of all currently allowed addresses
Features for scripting
^^^^^^^^^^^^^^^^^^^^^^
The following configuration functions are useful mainly for scripting or :ref:`runtime-cfg`.
.. function:: net.close(address, [port])
:return: boolean (at least one endpoint closed)
Close all endpoints listening on the specified address, optionally restricted by port as well.
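For example (the addresses correspond to the listening examples above):
.. code-block:: lua
net.close('127.0.0.1')        -- close all endpoints bound to this address
net.close('192.0.2.123', 53)  -- close only the endpoint on port 53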
.. function:: net.list()
:return: Table of bound interfaces.
Example output:
.. code-block:: none
[1] => {
[kind] => tls
[transport] => {
[family] => inet4
[ip] => 127.0.0.1
[port] => 853
[protocol] => tcp
}
}
[2] => {
[kind] => dns
[transport] => {
[family] => inet6
[ip] => ::1
[port] => 53
[protocol] => udp
}
}
[3] => {
[kind] => dns
[transport] => {
[family] => inet6
[ip] => ::1
[port] => 53
[protocol] => tcp
}
}
[4] => {
[kind] => xdp
[transport] => {
[family] => inet4+inet6
[interface] => eth2
[nic_queue] => 0
[port] => 53
[protocol] => udp
}
}
.. function:: net.interfaces()
:return: Table of available interfaces and their addresses.
Example output:
.. code-block:: none
[lo0] => {
[addr] => {
[1] => ::1
[2] => 127.0.0.1
}
[mac] => 00:00:00:00:00:00
}
[eth0] => {
[addr] => {
[1] => 192.168.0.1
}
[mac] => de:ad:be:ef:aa:bb
}
.. tip:: You can use ``net.<iface>`` as a shortcut for a specific interface, e.g. ``net.eth0``.
.. function:: net.tcp_pipeline([len])
Get/set the per-client TCP pipeline limit, i.e. the number of outstanding queries that a single client connection can have in parallel. The default is 100.
.. code-block:: lua
> net.tcp_pipeline()
100
> net.tcp_pipeline(50)
50
.. warning:: A limit that is too large may have a negative impact on performance and can lead to an increased number of SERVFAIL answers.
.. _`dnsproxy module`: https://www.knot-dns.cz/docs/2.7/html/modules.html#dnsproxy-tiny-dns-proxy
.. SPDX-License-Identifier: GPL-3.0-or-later
.. _tls-server-config:
DoT and DoH (encrypted DNS)
---------------------------
.. warning::
It is important to understand the **limits of encrypting only DNS traffic**.
A relevant security analysis can be found in the article
*Simran Patil and Nikita Borisov. 2019. What can you learn from an IP?*
See `slides <https://irtf.org/anrw/2019/slides-anrw19-final44.pdf>`_
or `the article itself <https://dl.acm.org/authorize?N687437>`_.
DoT and DoH encrypt DNS traffic with the Transport Layer Security (TLS) protocol
and thus protect it from certain types of attacks.
You can learn more about DoT and DoH and their implementation in Knot Resolver
in `this article
<https://en.blog.nic.cz/2020/11/25/encrypted-dns-in-knot-resolver-dot-and-doh/>`_.
.. _dns-over-tls:
DNS-over-TLS (DoT)
^^^^^^^^^^^^^^^^^^
The DNS-over-TLS server (:rfc:`7858`) can be configured using the ``tls`` kind in
:func:`net.listen()`. It is enabled on localhost by default.
For certificate configuration, refer to :ref:`dot-doh-config-options`.
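A typical explicit setup combines a ``tls`` listener with your own certificate configured via :func:`net.tls` (the address and paths are illustrative):
.. code-block:: lua
net.listen('192.0.2.123', 853, { kind = 'tls' })
net.tls("/etc/knot-resolver/server-cert.pem", "/etc/knot-resolver/server-key.pem")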
.. _dns-over-https:
DNS-over-HTTPS (DoH)
^^^^^^^^^^^^^^^^^^^^
.. note:: Knot Resolver currently offers two DoH implementations. It is
recommended to use this newer implementation, which is more reliable, more scalable,
and has fewer dependencies. Make sure to use the ``doh2`` kind in
:func:`net.listen()` to select this implementation.
.. tip:: Independent information about political controversies around the
DoH deployment by default can be found in blog posts `DNS Privacy at IETF
104 <http://www.potaroo.net/ispcol/2019-04/angst.html>`_ and `More DOH
<http://www.potaroo.net/ispcol/2019-04/moredoh.html>`_ by Geoff Huston and
`Centralised DoH is bad for Privacy, in 2019 and beyond
<https://labs.ripe.net/Members/bert_hubert/centralised-doh-is-bad-for-privacy-in-2019-and-beyond>`_
by Bert Hubert.
The DNS-over-HTTPS server (:rfc:`8484`) can be configured using the ``doh2`` kind in
:func:`net.listen()`.
This implementation supports HTTP/2 (:rfc:`7540`). Queries can be sent to the
``/dns-query`` endpoint, e.g.:
.. code-block:: bash
$ kdig @127.0.0.1 +https www.knot-resolver.cz AAAA
**Only TLS version 1.3 (or higher) is supported with DNS-over-HTTPS.** The
additional considerations for TLS 1.2 required by HTTP/2 are not implemented
(:rfc:`7540#section-9.2`).
.. warning:: Take care when configuring your server to listen on a well-known
HTTPS port. If an unrelated HTTPS service is running on the same port with
``REUSEPORT`` enabled, both services will end up malfunctioning.
HTTP status codes
"""""""""""""""""
As specified by :rfc:`8484`, the resolver responds with status **200 OK** whenever
it can produce a valid DNS reply for a given query, even in cases where the DNS
``rcode`` indicates an error (like ``NXDOMAIN``, ``SERVFAIL``, etc.).
For DoH queries malformed at the HTTP level, the resolver may respond with
the following status codes:
* **400 Bad Request** for a generally malformed query, like one not containing
a valid DNS packet
* **404 Not Found** when an incorrect HTTP endpoint is queried - the only
supported ones are ``/dns-query`` and ``/doh``
* **413 Payload Too Large** when the DNS query exceeds its maximum size
* **415 Unsupported Media Type** when the query's ``Content-Type`` header
is not ``application/dns-message``
* **431 Request Header Fields Too Large** when a header in the query is too
large to process
* **501 Not Implemented** when the query uses a method other than
``GET``, ``POST``, or ``HEAD``
.. _dot-doh-config-options:
Configuration options for DoT and DoH
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. note:: These settings affect both DNS-over-TLS and DNS-over-HTTPS (except
the legacy implementation).
A self-signed certificate is generated by default. For serious deployments
it is strongly recommended to configure your own TLS certificates signed
by a trusted CA. This is done using the :c:func:`net.tls()` function.
.. function:: net.tls([cert_path], [key_path])
When called with path arguments, the function loads the server TLS
certificate and private key for DoT and DoH.
When called without arguments, the command returns the currently configured paths.
Example output:
.. code-block:: lua
> net.tls("/etc/knot-resolver/server-cert.pem", "/etc/knot-resolver/server-key.pem")
> net.tls() -- print configured paths
[cert_file] => '/etc/knot-resolver/server-cert.pem'
[key_file] => '/etc/knot-resolver/server-key.pem'
.. tip:: The certificate files aren't automatically reloaded on change. If
you update the certificate files, e.g. using ACME, you have to either
restart the service(s) or call this function again using
:ref:`control-sockets`.
.. function:: net.tls_sticket_secret([string with pre-shared secret])
Set the secret for TLS session resumption via tickets, as specified by :rfc:`5077`.
The server-side key is rotated roughly once per hour.
By default, or if called without a secret, the key is random.
That is good for long-term forward secrecy, but multiple kresd instances
won't be able to resume each other's sessions.
If you provide the same secret to multiple instances, they will be able to resume
each other's sessions *without* any further communication between them.
This synchronization works only among instances having the same endianness
and time_t structure and size (`sizeof(time_t)`).
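For example, every instance's configuration might contain the same line (the value below is only a placeholder, not a usable secret):

.. code-block:: lua

    net.tls_sticket_secret('replace-with-a-long-random-string')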
.. _pfs: https://en.wikipedia.org/wiki/Forward_secrecy
**For good security** the secret must have enough entropy to be hard to guess,
and it should still be rotated manually from time to time and securely forgotten,
to limit the scope of a privacy leak in case the
`secret eventually leaks <pfs_>`_.
.. warning:: **Setting the secret is probably too risky with TLS <= 1.2 and
GnuTLS < 3.7.5**. GnuTLS 3.7.5 adds an option to disable resumption via
tickets for TLS <= 1.2, enabling them only for protocols that do guarantee
`PFS <pfs_>`_. Knot Resolver makes use of this new option when linked
against GnuTLS >= 3.7.5.
.. function:: net.tls_sticket_secret_file([string with path to a file containing pre-shared secret])
The same as :func:`net.tls_sticket_secret`,
except the secret is read from a (binary) file.
.. function:: net.tls_padding([true | false])
Get/set EDNS(0) padding of queries and answers sent over an encrypted
channel. If set to `true` (the default), a sensible default padding
scheme is used, as implemented by libknot if available at compile time.
If set to a numeric value >= 2, answers are padded to the nearest
multiple of *padding* bytes, e.g. with `64` the answer size will be
a multiple of 64 (64, 128, 192, ...). If set to `false` (or a number
< 2), padding is disabled entirely.
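For example (the values shown are just illustrations of the accepted forms):

.. code-block:: lua

    net.tls_padding(64)    -- pad answers to a multiple of 64 bytes
    net.tls_padding(false) -- disable padding entirely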
Configuration options for DoH
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. function:: net.doh_headers([string or table of strings])
Selects the headers to be exposed. These headers and their values are
available in ``request.qsource.headers``. Comparison
is case-insensitive and pseudo-headers are supported as well.
The following snippet can be used in a Lua module to access the headers
``:method`` and ``user-agent``:
.. code-block:: lua
net.doh_headers({':method', 'user-agent'})
...
for i = 1, tonumber(req.qsource.headers.len) do
local name = ffi.string(req.qsource.headers.at[i - 1].name)
local value = ffi.string(req.qsource.headers.at[i - 1].value)
print(name, value)
end
.. SPDX-License-Identifier: GPL-3.0-or-later
.. _dns-over-xdp:
XDP for higher UDP performance
------------------------------
.. warning::
As of version 5.2.0, XDP support in Knot Resolver is considered
experimental. Enabling it may not always improve overall throughput
and performance.
XDP allows a significant speedup of UDP packet processing on recent Linux kernels,
especially with network drivers that implement good native support.
The basic idea is that selected packets bypass the Linux networking stack,
and some drivers can even read and write user-space buffers directly.
.. TODO perhaps some hint/link about how significant speedup one might get? (link to some talk video?)
Prerequisites
^^^^^^^^^^^^^
.. this is mostly copied from knot-dns doc/operations.rst
.. warning::
Bypassing the network stack has significant implications, such as bypassing the firewall
and monitoring solutions.
Make sure you're familiar with the trade-offs before using this feature.
Read more in :ref:`dns-over-xdp_limitations`.
* Linux kernel 4.18+ (5.x+ is recommended for optimal performance) compiled with
the `CONFIG_XDP_SOCKETS=y` option. XDP isn't supported in other operating systems.
* libknot compiled with XDP support
* **A multiqueue network card with native XDP support is highly recommended**,
otherwise the performance gain will be much lower and you may encounter
issues due to XDP emulation.
Successfully tested cards:
* Intel series 700 (driver `i40e`), maximum number of queues per interface is 64.
* Intel series 500 (driver `ixgbe`), maximum number of queues per interface is 64.
The number of CPUs available has to be at most 64!
Set up
^^^^^^
.. first parts are mostly copied from knot-dns doc/operations.rst
The server instances need additional Linux **capabilities** during startup.
(Or you could start them as `root`.)
Execute the command
.. code-block:: bash
systemctl edit kresd@.service
And insert these lines:
.. code-block:: ini
[Service]
CapabilityBoundingSet=CAP_NET_RAW CAP_NET_ADMIN CAP_SYS_ADMIN CAP_IPC_LOCK CAP_SYS_RESOURCE
AmbientCapabilities=CAP_NET_RAW CAP_NET_ADMIN CAP_SYS_ADMIN CAP_IPC_LOCK CAP_SYS_RESOURCE
The ``CAP_SYS_RESOURCE`` capability is only needed on Linux < 5.11.
.. TODO suggest some way for ethtool -L? Perhaps via systemd units?
You want the same number of kresd instances as network **queues** on your card;
you can set the queue count with ``ethtool -L`` before the services start.
With XDP this is more important than with vanilla UDP, as only one instance
per queue is supported and unclaimed queues fall back to vanilla UDP.
Ideally, set both numbers as high as the number of CPUs that you want kresd to use.
Modification of ``/etc/knot-resolver/kresd.conf`` may often be quite simple, for example:
.. code-block:: lua
net.listen('eth2', 53, { kind = 'xdp' })
net.listen('203.0.113.53', 53, { kind = 'dns' })
Note that you also want to keep the vanilla DNS line to serve TCP
and possibly any fallback UDP (e.g. from unclaimed queues).
XDP listening is in principle done on queues of whole network interfaces,
and the target addresses of incoming packets aren't checked in any way,
but you may still specify the interface by one of its addresses
(if it's unambiguous at that moment):
.. code-block:: lua
net.listen('203.0.113.53', 53, { kind = 'xdp' })
net.listen('203.0.113.53', 53, { kind = 'dns' })
The default selection of queues is tailored for the usual naming convention:
``kresd@1.service``, ``kresd@2.service``, ...
but you can still specify them explicitly, e.g. the default is effectively the same as:
.. code-block:: lua
net.listen('eth2', 53, { kind = 'xdp', nic_queue = env.SYSTEMD_INSTANCE - 1 })
Optimizations
^^^^^^^^^^^^^
.. this is basically copied from knot-dns doc/operations.rst
Some helpful commands:
.. code-block:: text
ethtool -N <interface> rx-flow-hash udp4 sdfn
ethtool -N <interface> rx-flow-hash udp6 sdfn
ethtool -L <interface> combined <queue-number>
ethtool -G <interface> rx <ring-size> tx <ring-size>
renice -n 19 -p $(pgrep '^ksoftirqd/[0-9]*$')
.. TODO CPU affinities? `CPUAffinity=%i` in systemd unit sounds good.
.. _dns-over-xdp_limitations:
Limitations
^^^^^^^^^^^
.. this is basically copied from knot-dns doc/operations.rst
* VLAN segmentation is not supported.
* MTU higher than 1792 bytes is not supported.
* Multiple BPF filters per one network device are not supported.
* Symmetrical routing is required (query source MAC/IP addresses and
reply destination MAC/IP addresses are the same).
* Systems with big-endian byte ordering require special recompilation of libknot.
* IPv4 header and UDP checksums are not verified on received DNS messages.
* DNS over XDP traffic is not visible to common system tools (e.g. firewall, tcpdump etc.).
* BPF filter is not automatically unloaded from the network device. Manual filter unload::
ip link set dev <interface> xdp off
* Knot Resolver only supports using XDP towards clients currently (not towards upstreams).
* When starting up an XDP socket you may get a harmless warning::
libbpf: Kernel error message: XDP program already attached
/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#include "daemon/bindings/impl.h"
static inline double getseconds(uv_timeval_t *tv)
{
return (double)tv->tv_sec + 0.000001*((double)tv->tv_usec);
}
/** Return worker statistics. */
static int wrk_stats(lua_State *L)
{
if (kr_fails_assert(the_worker)) {
return 0;
}
lua_newtable(L);
lua_pushnumber(L, the_worker->stats.queries);
lua_setfield(L, -2, "queries");
lua_pushnumber(L, the_worker->stats.concurrent);
lua_setfield(L, -2, "concurrent");
lua_pushnumber(L, the_worker->stats.dropped);
lua_setfield(L, -2, "dropped");
lua_pushnumber(L, the_worker->stats.timeout);
lua_setfield(L, -2, "timeout");
lua_pushnumber(L, the_worker->stats.udp);
lua_setfield(L, -2, "udp");
lua_pushnumber(L, the_worker->stats.tcp);
lua_setfield(L, -2, "tcp");
lua_pushnumber(L, the_worker->stats.tls);
lua_setfield(L, -2, "tls");
lua_pushnumber(L, the_worker->stats.ipv4);
lua_setfield(L, -2, "ipv4");
lua_pushnumber(L, the_worker->stats.ipv6);
lua_setfield(L, -2, "ipv6");
lua_pushnumber(L, the_worker->stats.err_udp);
lua_setfield(L, -2, "err_udp");
lua_pushnumber(L, the_worker->stats.err_tcp);
lua_setfield(L, -2, "err_tcp");
lua_pushnumber(L, the_worker->stats.err_tls);
lua_setfield(L, -2, "err_tls");
lua_pushnumber(L, the_worker->stats.err_http);
lua_setfield(L, -2, "err_http");
/* Add subset of rusage that represents counters. */
uv_rusage_t rusage;
if (uv_getrusage(&rusage) == 0) {
lua_pushnumber(L, getseconds(&rusage.ru_utime));
lua_setfield(L, -2, "usertime");
lua_pushnumber(L, getseconds(&rusage.ru_stime));
lua_setfield(L, -2, "systime");
lua_pushnumber(L, rusage.ru_majflt);
lua_setfield(L, -2, "pagefaults");
lua_pushnumber(L, rusage.ru_nswap);
lua_setfield(L, -2, "swaps");
lua_pushnumber(L, rusage.ru_nvcsw + rusage.ru_nivcsw);
lua_setfield(L, -2, "csw");
}
/* Get RSS */
size_t rss = 0;
if (uv_resident_set_memory(&rss) == 0) {
lua_pushnumber(L, rss);
lua_setfield(L, -2, "rss");
}
return 1;
}
int kr_bindings_worker(lua_State *L)
{
static const luaL_Reg lib[] = {
{ "stats", wrk_stats },
{ NULL, NULL }
};
luaL_register(L, "worker", lib);
return 1;
}
.. SPDX-License-Identifier: GPL-3.0-or-later
Scripting worker
^^^^^^^^^^^^^^^^
The worker is a service on top of the event loop that tracks and schedules
outstanding queries; you can inspect its statistics or schedule new queries.
It also carries information about the configured worker count and the process rank.
.. envvar:: worker.id
The value of the environment variable ``SYSTEMD_INSTANCE``,
or, if it is not set, the :envvar:`PID <worker.pid>` (string).
.. envvar:: worker.pid
Current worker process PID (number).
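For illustration, both identifiers can be printed from the configuration file or an interactive session:

.. code-block:: lua

    print('instance: ' .. worker.id .. ', pid: ' .. worker.pid)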
.. function:: worker.stats()
Return table of statistics. See member descriptions in :c:type:`worker_stats`.
A few fields are added, mainly from POSIX ``getrusage()``:
* ``usertime`` and ``systime`` -- CPU time used, in seconds
* ``pagefaults`` -- the number of hard page faults, i.e. those that required I/O activity
* ``swaps`` -- the number of times the process was “swapped” out of main memory; unused on Linux
* ``csw`` -- the number of context switches, both voluntary and involuntary
* ``rss`` -- current memory usage in bytes, including whole cache (resident set size)
Example:
.. code-block:: lua
print(worker.stats().concurrent)
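A slightly larger sketch, logging a few of the counters listed above (field names as returned by ``worker.stats()``):

.. code-block:: lua

    local s = worker.stats()
    print(string.format('udp=%d tcp=%d rss=%d B', s.udp, s.tcp, s.rss))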
-- unload modules which are not related to this test
-- SPDX-License-Identifier: GPL-3.0-or-later
if ta_signal_query then
modules.unload('ta_signal_query')
end
if priming then
modules.unload('priming')
end
if detect_time_skew then
modules.unload('detect_time_skew')
end
-- test. domain is used by some tests, allow it
policy.add(policy.suffix(policy.PASS, {todname('test.')}))
cache.size = 2*MB
-- log_level('debug')
-- Self-checks on globals
assert(help() ~= nil)
assert(worker.id ~= nil)
-- Self-checks on facilities
assert(cache.stats() ~= nil)
assert(cache.backends() ~= nil)
assert(worker.stats() ~= nil)
assert(net.interfaces() ~= nil)
-- Self-checks on loaded stuff
assert(#modules.list() > 0)
-- Self-check timers
ev = event.recurrent(1 * sec, function () return 1 end)
event.cancel(ev)
ev = event.after(0, function () return 1 end)
-- Import fake root zone; avoid interference with configured keyfile_default.
trust_anchors.remove('.')
trust_anchors.add('. IN DS 48409 8 2 3D63A0C25BCE86621DE63636F11B35B908EFE8E9381E0E3E9DEFD89EA952C27D')
local check_answer = require('test_utils').check_answer
-- do not attempt to contact outside world, operate only on cache
net.ipv4 = false
net.ipv6 = false
-- do not listen, test is driven by config code
env.KRESD_NO_LISTEN = true
local function import_zone()
local import_res = require('ffi').C.zi_zone_import({ zone_file = 'testroot.zone' })
assert(import_res == 0)
-- beware that import takes at least 100 ms
worker.sleep(0.2) -- zimport is delayed by 100 ms from function call
-- sanity checks - cache must be filled in
ok(cache.count() > 0, 'cache is not empty after import')
check_answer('root apex is in cache',
'.', kres.type.NS, kres.rcode.NOERROR)
check_answer('deep subdomain is in cache',
'a.b.subtree1.', kres.type.AAAA, kres.rcode.NOERROR)
end
local function test_exact_match_qtype()
nok(cache.clear('a.b.subtree1.', true, kres.type.A)['chunk_limit'],
'single qname+qtype can be cleared at once')
check_answer('exact match on qname+qtype must flush RR from cache',
'a.b.subtree1.', kres.type.A, kres.rcode.SERVFAIL)
check_answer('exact match on qname+qtype must not affect other RRs on the same node',
'a.b.subtree1.', kres.type.AAAA, kres.rcode.NOERROR)
check_answer('exact match on qname must not affect parent',
'b.subtree1.', kres.type.A, kres.rcode.NOERROR)
end
local function test_exact_match_qname()
res = cache.clear('a.b.SubTree1.')
is(res.count, 2, 'single qname can be cleared at once')
check_answer('exact match on qname must flush all RRs with the same owner from cache',
'a.b.subtree1.', kres.type.AAAA, kres.rcode.SERVFAIL)
check_answer('exact match on qname must flush all RRs with the same owner from cache',
'a.b.subtree1.', kres.type.A, kres.rcode.SERVFAIL)
check_answer('exact match on qname must flush all RRs with the same owner from cache',
'a.b.subtree1.', kres.type.TXT, kres.rcode.SERVFAIL)
-- exact match for negative proofs is not implemented yet
--check_answer('exact match on qname must flush negative proofs for owner from cache',
-- 'a.b.subtree1.', kres.type.NULL, kres.rcode.SERVFAIL)
--check_answer('exact match on qname must not affect parent',
-- 'b.subtree1.', kres.type.A, kres.rcode.NOERROR)
-- same(cache.clear(), 0, 'full cache clear can be performed')
--check_answer('.', kres.type.NS, false)
end
local function test_subtree()
res = cache.clear('subtree1.')
nok(res.chunk_limit,
'whole positive subtree must be flushed (does not include neg. proofs)')
ok(res.not_apex,
'subtree clear below apex must be detected')
same(res.subtree, '.', 'detected apex must be returned')
check_answer('subtree variant must flush all RRs in subdomains from cache',
'b.subtree1.', kres.type.A, kres.rcode.SERVFAIL)
check_answer('subtree variant must flush all RRs in subdomains from cache',
'b.subtree1.', kres.type.TXT, kres.rcode.SERVFAIL)
check_answer('subtree variant must flush all RRs in subdomains from cache',
'subtree1.', kres.type.TXT, kres.rcode.SERVFAIL)
check_answer('subtree variant must not affect parent',
'.', kres.type.NS, kres.rcode.NOERROR)
-- same(cache.clear(), 0, 'full cache clear can be performed')
--check_answer('.', kres.type.NS, false)
end
local function test_callback()
local test_name = '20r.subtree2.'
local test_exactname = true
local test_rrtype = nil
local test_chunksize = 1
local test_prev_state = { works = true }
local function check_callback(name, exact_name, rr_type, chunk_size, callback, prev_state, errors)
is(errors.count, 1, 'callback received correct # of removed records')
is(test_name, name, 'callback received subtree name')
is(test_exactname, exact_name, 'callback received exact_name')
is(test_rrtype, rr_type, 'callback received rr_type')
is(test_chunksize, chunk_size, 'callback received chunk_size')
is(check_callback, callback, 'callback received reference to itself')
is(type(errors), 'table', 'callback received table of errors')
same(test_prev_state, prev_state, 'callback received previous state')
return 666
end
same(cache.clear(test_name, test_exactname, test_rrtype, test_chunksize, check_callback, test_prev_state),
666, 'first callback return value is passed to cache.clear() caller')
local cnt_before_wait = cache.count()
worker.sleep(0.2)
is(cnt_before_wait, cache.count(), 'custom callback can stop clearing')
end
local function test_subtree_limit() -- default limit = 100
res = cache.clear('subtree2.', false, nil)
ok(res.chunk_limit,
'chunk_size limit must be respected')
is(res.count, 100,
'chunk_size limit must match returned count')
-- callbacks are running in background so we can now wait
-- and later verify that everything was removed
-- 200 RRs, 100 were removed in the first call,
-- so the rest should be removed in a single invocation of the callback
-- hopefully the machine is not too slow ...
worker.sleep(0.1)
res = cache.clear('subtree2.', false, nil)
is(res.count, 0,
'previous calls + callbacks must have removed everything')
end
local function test_apex()
check_answer('a negative proof is still present in cache',
'aaaaa.b.subtree1.', kres.type.TXT, kres.rcode.NXDOMAIN)
local prev_count = cache.count()
ok(prev_count > 0, 'previous subtree clearing did not remove everything')
res = cache.clear('.', false, nil, 10000)
is(res.count, prev_count, 'clear on root removed everything including proofs')
check_answer('exact match on qname must flush negative proofs for owner from cache',
'a.b.subtree1.', kres.type.NULL, kres.rcode.SERVFAIL)
end
local function test_root()
check_answer('root apex is still in cache',
'.', kres.type.NS, kres.rcode.NOERROR)
res = cache.clear('.', true)
check_answer('root apex is no longer in cache',
'.', kres.type.NS, kres.rcode.SERVFAIL)
check_answer('some other item is still in cache',
'16r.subtree2.', kres.type.A, kres.rcode.NOERROR)
local prev_count = cache.count()
res = cache.clear('.')
is(res.count, prev_count, 'full clear reports correct number of entries')
is(cache.count(), 0, 'clearing root clears everything')
end
local function test_complete_flush()
local prev_count = cache.count()
res = cache.clear()
is(res.count, prev_count, 'full clear reports correct number of entries')
is(cache.count(), 0, 'cache is empty after full clear')
end
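-- returns a test closure that checks cache usage_percent lies within [lower, upper]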
local function test_cache_used(lower, upper)
return function()
local usage = cache.stats().usage_percent
ok(usage >= lower and usage <= upper,
string.format('cache percentage usage %.1f is between <%d, %d>', usage, lower, upper))
end
end
return {
test_cache_used(0, 1),
import_zone,
test_cache_used(9, 11),
test_exact_match_qtype,
test_exact_match_qname,
test_callback,
import_zone,
test_subtree,
test_cache_used(9, 11),
test_subtree_limit,
test_cache_used(5, 8),
test_apex,
import_zone,
test_root,
import_zone,
test_complete_flush,
test_cache_used(0, 1),
}
# SPDX-License-Identifier: GPL-3.0-or-later
programs:
- name: kresd
binary: kresd
additional:
- --noninteractive
templates:
- daemon/cache.test/insert_ns.test.integr/kresd_config.j2
- tests/integration/hints_zone.j2
configs:
- config
- hints
noclean: True
-- SPDX-License-Identifier: GPL-3.0-or-later
{% for TAF in TRUST_ANCHOR_FILES %}
trust_anchors.add_file('{{TAF}}')
{% endfor %}
{% raw %}
-- insert NS record pointing to a non-delegated DNS server
cache.open(1*MB)
cache.clear()
trust_anchors.remove('.')
local ffi = require('ffi')
local c = kres.context().cache
ns_name = todname('ns.example.com')
local ns_addr = '\1\2\3\4'
local rr = kres.rrset(ns_name, kres.type.A, kres.class.IN, 2147483647)
assert(rr:add_rdata(ns_addr, #ns_addr))
assert(c:insert(rr, nil, ffi.C.KR_RANK_SECURE))
rr_ns = kres.rrset(todname('example.com'), kres.type.NS, kres.class.IN, 3600)
assert(rr_ns:add_rdata(ns_name, #ns_name))
assert(c:insert(rr_ns, nil, bit.bor(ffi.C.KR_RANK_AUTH, ffi.C.KR_RANK_INSECURE)))
c:commit()
assert(cache.count() > 0)
-- from now on queries for domain example.com should go directly to IP addr 1.2.3.4
-- Disable RFC5011 TA update
if ta_update then
modules.unload('ta_update')
end
-- Disable RFC8145 signaling, scenario doesn't provide expected answers
if ta_signal_query then
modules.unload('ta_signal_query')
end
-- Disable RFC8109 priming, scenario doesn't provide expected answers
if priming then
modules.unload('priming')
end
-- Disable this module because it makes one priming query
if detect_time_skew then
modules.unload('detect_time_skew')
end
_hint_root_file('hints')
log_level('debug')
{% endraw %}
net = { '{{SELF_ADDR}}' }
{% if DO_IP6 == "true" %}
net.ipv6 = true
{% else %}
net.ipv6 = false
{% endif %}
{% if DO_IP4 == "true" %}
net.ipv4 = true
{% else %}
net.ipv4 = false
{% endif %}
{% if QMIN == "false" %}
option('NO_MINIMIZE', true)
{% else %}
option('NO_MINIMIZE', false)
{% endif %}
-- Self-checks on globals
assert(help() ~= nil)
assert(worker.id ~= nil)
-- Self-checks on facilities
assert(cache.stats() ~= nil)
assert(cache.backends() ~= nil)
assert(worker.stats() ~= nil)
assert(net.interfaces() ~= nil)
-- Self-checks on loaded stuff
assert(net.list()[1].transport.ip == '{{SELF_ADDR}}')
assert(#modules.list() > 0)
-- Self-check timers
ev = event.recurrent(1 * sec, function (ev) return 1 end)
event.cancel(ev)
ev = event.after(0, function (ev) return 1 end)
; SPDX-License-Identifier: GPL-3.0-or-later
; config options
; target-fetch-policy: "0 0 0 0 0"
; name: "."
stub-addr: 193.0.14.129 # K.ROOT-SERVERS.NET.
do-ip6: no
CONFIG_END
SCENARIO_BEGIN Delegation explicitly added into cache must be followed
; ns.example.com.
RANGE_BEGIN 0 100
ADDRESS 1.2.3.4
ENTRY_BEGIN
MATCH opcode qtype qname
ADJUST copy_id
REPLY QR NOERROR
SECTION QUESTION
example.com. IN NS
SECTION ANSWER
example.com. IN NS ns.example.com.
SECTION ADDITIONAL
ns.example.com. IN A 1.2.3.4
ENTRY_END
ENTRY_BEGIN
MATCH opcode qtype qname
ADJUST copy_id
REPLY QR NOERROR
SECTION QUESTION
www.example.com. IN A
SECTION ANSWER
www.example.com. IN A 10.20.30.40
SECTION AUTHORITY
example.com. IN NS ns.example.com.
SECTION ADDITIONAL
ns.example.com. IN A 1.2.3.4
ENTRY_END
RANGE_END
STEP 1 QUERY
ENTRY_BEGIN
REPLY RD
SECTION QUESTION
www.example.com. IN A
ENTRY_END
; recursion happens here.
STEP 10 CHECK_ANSWER
ENTRY_BEGIN
MATCH flags rcode question
REPLY QR RD RA NOERROR
SECTION QUESTION
www.example.com. IN A
SECTION ANSWER
www.example.com. IN A 10.20.30.40
ENTRY_END
SCENARIO_END
; SPDX-License-Identifier: GPL-3.0-or-later
. 86400 SOA rootns. you.test. 2017071101 1800 900 604800 86400
. 86400 NS rootns.
rootns. 86400 A 198.41.0.4
subtree1. 86400 TXT "txt exists"
subtree1. 86400 A 192.0.2.1
b.subtree1. 86400 TXT "txt exists"
b.subtree1. 86400 A 192.0.2.2
a.b.subtree1. 86400 TXT "txt exists"
a.b.subtree1. 86400 A 192.0.2.3
a.b.subtree1. 86400 AAAA 2001:db8::
; subtree2. is empty non-terminal
1r.subtree2. 86400 AAAA 2001:db8::
2r.subtree2. 86400 AAAA 2001:db8::1
2r.subtree2. 86400 AAAA 2001:db8::2
3r.subtree2. 86400 AAAA 2001:db8::
4r.subtree2. 86400 A 192.0.2.1
5r.subtree2. 86400 A 192.0.2.1
6r.subtree2. 86400 A 192.0.2.1
7r.subtree2. 86400 A 192.0.2.1
8r.subtree2. 86400 A 192.0.2.1
9r.subtree2. 86400 A 192.0.2.1
10r.subtree2. 86400 A 192.0.2.1
11r.subtree2. 86400 A 192.0.2.1
12r.subtree2. 86400 A 192.0.2.1
13r.subtree2. 86400 A 192.0.2.1
14r.subtree2. 86400 A 192.0.2.1
15r.subtree2. 86400 A 192.0.2.1
16r.subtree2. 86400 A 192.0.2.1
17r.subtree2. 86400 A 192.0.2.1
18r.subtree2. 86400 A 192.0.2.1
19r.subtree2. 86400 A 192.0.2.1
20r.subtree2. 86400 A 192.0.2.1
21r.subtree2. 86400 A 192.0.2.1
22r.subtree2. 86400 A 192.0.2.1
23r.subtree2. 86400 A 192.0.2.1
24r.subtree2. 86400 A 192.0.2.1
25r.subtree2. 86400 A 192.0.2.1
26r.subtree2. 86400 A 192.0.2.1
27r.subtree2. 86400 A 192.0.2.1
28r.subtree2. 86400 A 192.0.2.1
29r.subtree2. 86400 A 192.0.2.1
30r.subtree2. 86400 A 192.0.2.1
31r.subtree2. 86400 A 192.0.2.1
32r.subtree2. 86400 A 192.0.2.1
33r.subtree2. 86400 A 192.0.2.1
34r.subtree2. 86400 A 192.0.2.1
35r.subtree2. 86400 A 192.0.2.1
36r.subtree2. 86400 A 192.0.2.1
37r.subtree2. 86400 A 192.0.2.1
38r.subtree2. 86400 A 192.0.2.1
39r.subtree2. 86400 A 192.0.2.1
40r.subtree2. 86400 A 192.0.2.1
41r.subtree2. 86400 A 192.0.2.1
42r.subtree2. 86400 A 192.0.2.1
43r.subtree2. 86400 A 192.0.2.1
44r.subtree2. 86400 A 192.0.2.1
45r.subtree2. 86400 A 192.0.2.1
46r.subtree2. 86400 A 192.0.2.1
47r.subtree2. 86400 A 192.0.2.1
48r.subtree2. 86400 A 192.0.2.1
49r.subtree2. 86400 A 192.0.2.1
50r.subtree2. 86400 A 192.0.2.1
51r.subtree2. 86400 A 192.0.2.1
52r.subtree2. 86400 A 192.0.2.1
53r.subtree2. 86400 A 192.0.2.1
54r.subtree2. 86400 A 192.0.2.1
55r.subtree2. 86400 A 192.0.2.1
56r.subtree2. 86400 A 192.0.2.1
57r.subtree2. 86400 A 192.0.2.1
58r.subtree2. 86400 A 192.0.2.1
59r.subtree2. 86400 A 192.0.2.1
60r.subtree2. 86400 A 192.0.2.1
61r.subtree2. 86400 A 192.0.2.1
62r.subtree2. 86400 A 192.0.2.1
63r.subtree2. 86400 A 192.0.2.1
64r.subtree2. 86400 A 192.0.2.1
65r.subtree2. 86400 A 192.0.2.1
66r.subtree2. 86400 A 192.0.2.1
67r.subtree2. 86400 A 192.0.2.1
68r.subtree2. 86400 A 192.0.2.1
69r.subtree2. 86400 A 192.0.2.1
70r.subtree2. 86400 A 192.0.2.1
71r.subtree2. 86400 A 192.0.2.1
72r.subtree2. 86400 A 192.0.2.1
73r.subtree2. 86400 A 192.0.2.1
74r.subtree2. 86400 A 192.0.2.1
75r.subtree2. 86400 A 192.0.2.1
76r.subtree2. 86400 A 192.0.2.1
77r.subtree2. 86400 A 192.0.2.1
78r.subtree2. 86400 A 192.0.2.1
79r.subtree2. 86400 A 192.0.2.1
80r.subtree2. 86400 A 192.0.2.1
81r.subtree2. 86400 A 192.0.2.1
82r.subtree2. 86400 A 192.0.2.1
83r.subtree2. 86400 A 192.0.2.1
84r.subtree2. 86400 A 192.0.2.1
85r.subtree2. 86400 A 192.0.2.1
86r.subtree2. 86400 A 192.0.2.1
87r.subtree2. 86400 A 192.0.2.1
88r.subtree2. 86400 A 192.0.2.1
89r.subtree2. 86400 A 192.0.2.1
90r.subtree2. 86400 A 192.0.2.1
91r.subtree2. 86400 A 192.0.2.1
92r.subtree2. 86400 A 192.0.2.1
93r.subtree2. 86400 A 192.0.2.1
94r.subtree2. 86400 A 192.0.2.1
95r.subtree2. 86400 A 192.0.2.1
96r.subtree2. 86400 A 192.0.2.1
97r.subtree2. 86400 A 192.0.2.1
98r.subtree2. 86400 A 192.0.2.1
99r.subtree2. 86400 A 192.0.2.1
100r.subtree2. 86400 A 192.0.2.1
101r.subtree2. 86400 A 192.0.2.1
102r.subtree2. 86400 A 192.0.2.1
103r.subtree2. 86400 A 192.0.2.1
104r.subtree2. 86400 A 192.0.2.1
105r.subtree2. 86400 A 192.0.2.1
106r.subtree2. 86400 A 192.0.2.1
107r.subtree2. 86400 A 192.0.2.1
108r.subtree2. 86400 A 192.0.2.1
109r.subtree2. 86400 A 192.0.2.1
110r.subtree2. 86400 A 192.0.2.1
111r.subtree2. 86400 A 192.0.2.1
112r.subtree2. 86400 A 192.0.2.1
113r.subtree2. 86400 A 192.0.2.1
114r.subtree2. 86400 A 192.0.2.1
115r.subtree2. 86400 A 192.0.2.1
116r.subtree2. 86400 A 192.0.2.1
117r.subtree2. 86400 A 192.0.2.1
118r.subtree2. 86400 A 192.0.2.1
119r.subtree2. 86400 A 192.0.2.1
120r.subtree2. 86400 A 192.0.2.1
121r.subtree2. 86400 A 192.0.2.1
122r.subtree2. 86400 A 192.0.2.1
123r.subtree2. 86400 A 192.0.2.1
124r.subtree2. 86400 A 192.0.2.1
125r.subtree2. 86400 A 192.0.2.1
126r.subtree2. 86400 A 192.0.2.1
127r.subtree2. 86400 A 192.0.2.1
128r.subtree2. 86400 A 192.0.2.1
129r.subtree2. 86400 A 192.0.2.1
130r.subtree2. 86400 A 192.0.2.1
131r.subtree2. 86400 A 192.0.2.1
132r.subtree2. 86400 A 192.0.2.1
133r.subtree2. 86400 A 192.0.2.1
134r.subtree2. 86400 A 192.0.2.1
135r.subtree2. 86400 A 192.0.2.1
136r.subtree2. 86400 A 192.0.2.1
137r.subtree2. 86400 A 192.0.2.1
138r.subtree2. 86400 A 192.0.2.1
139r.subtree2. 86400 A 192.0.2.1
140r.subtree2. 86400 A 192.0.2.1
141r.subtree2. 86400 A 192.0.2.1
142r.subtree2. 86400 A 192.0.2.1
143r.subtree2. 86400 A 192.0.2.1
144r.subtree2. 86400 A 192.0.2.1
145r.subtree2. 86400 A 192.0.2.1
146r.subtree2. 86400 A 192.0.2.1
147r.subtree2. 86400 A 192.0.2.1
148r.subtree2. 86400 A 192.0.2.1
149r.subtree2. 86400 A 192.0.2.1
150r.subtree2. 86400 A 192.0.2.1
151r.subtree2. 86400 A 192.0.2.1
152r.subtree2. 86400 A 192.0.2.1
153r.subtree2. 86400 A 192.0.2.1
154r.subtree2. 86400 A 192.0.2.1
155r.subtree2. 86400 A 192.0.2.1
156r.subtree2. 86400 A 192.0.2.1
157r.subtree2. 86400 A 192.0.2.1
158r.subtree2. 86400 A 192.0.2.1
159r.subtree2. 86400 A 192.0.2.1
160r.subtree2. 86400 A 192.0.2.1
161r.subtree2. 86400 A 192.0.2.1
162r.subtree2. 86400 A 192.0.2.1
163r.subtree2. 86400 A 192.0.2.1
164r.subtree2. 86400 A 192.0.2.1
165r.subtree2. 86400 A 192.0.2.1
166r.subtree2. 86400 A 192.0.2.1
167r.subtree2. 86400 A 192.0.2.1
168r.subtree2. 86400 A 192.0.2.1
169r.subtree2. 86400 A 192.0.2.1
170r.subtree2. 86400 A 192.0.2.1
171r.subtree2. 86400 A 192.0.2.1
172r.subtree2. 86400 A 192.0.2.1
173r.subtree2. 86400 A 192.0.2.1
174r.subtree2. 86400 A 192.0.2.1
175r.subtree2. 86400 A 192.0.2.1
176r.subtree2. 86400 A 192.0.2.1
177r.subtree2. 86400 A 192.0.2.1
178r.subtree2. 86400 A 192.0.2.1
179r.subtree2. 86400 A 192.0.2.1
180r.subtree2. 86400 A 192.0.2.1
181r.subtree2. 86400 A 192.0.2.1
182r.subtree2. 86400 A 192.0.2.1
183r.subtree2. 86400 A 192.0.2.1
184r.subtree2. 86400 A 192.0.2.1
185r.subtree2. 86400 A 192.0.2.1
186r.subtree2. 86400 A 192.0.2.1
187r.subtree2. 86400 A 192.0.2.1
188r.subtree2. 86400 A 192.0.2.1
189r.subtree2. 86400 A 192.0.2.1
190r.subtree2. 86400 A 192.0.2.1
191r.subtree2. 86400 A 192.0.2.1
192r.subtree2. 86400 A 192.0.2.1
193r.subtree2. 86400 A 192.0.2.1
194r.subtree2. 86400 A 192.0.2.1
195r.subtree2. 86400 A 192.0.2.1
196r.subtree2. 86400 A 192.0.2.1
197r.subtree2. 86400 A 192.0.2.1
198r.subtree2. 86400 A 192.0.2.1
199r.subtree2. 86400 A 192.0.2.1
200r.subtree2. 86400 A 192.0.2.1
201r.subtree2. 86400 A 192.0.2.1
kresd_SOURCES := \
daemon/io.c \
daemon/network.c \
daemon/engine.c \
daemon/worker.c \
daemon/bindings.c \
daemon/ffimodule.c \
daemon/tls.c \
daemon/main.c
kresd_DIST := daemon/lua/kres.lua daemon/lua/trust_anchors.lua
# Embedded resources
%.inc: %.lua
@$(call quiet,XXD,$<) $< > $@
ifeq ($(AMALG), yes)
kresd.amalg.c: daemon/lua/sandbox.inc daemon/lua/config.inc
else
daemon/engine.o: daemon/lua/sandbox.inc daemon/lua/config.inc
endif
# Installed FFI bindings
bindings-install: $(kresd_DIST) $(DESTDIR)$(MODULEDIR)
$(INSTALL) -m 0644 $(kresd_DIST) $(DESTDIR)$(MODULEDIR)
kresd_CFLAGS := -fPIE
kresd_DEPEND := $(libkres) $(contrib)
kresd_LIBS := $(libkres_TARGET) $(contrib_TARGET) $(libknot_LIBS) \
$(libzscanner_LIBS) $(libdnssec_LIBS) $(libuv_LIBS) $(lua_LIBS) \
$(gnutls_LIBS)
# Enable systemd
ifeq ($(HAS_libsystemd), yes)
kresd_CFLAGS += -DHAS_SYSTEMD
kresd_LIBS += $(libsystemd_LIBS)
endif
# Make binary
ifeq ($(HAS_lua)|$(HAS_libuv), yes|yes)
$(eval $(call make_sbin,kresd,daemon,yes))
endif
# Targets
date := $(shell date +%F)
daemon: $(kresd)
daemon-install: kresd-install bindings-install
ifneq ($(SED),)
$(SED) -e "s/@VERSION@/$(MAJOR).$(MINOR).$(PATCH)/" -e "s/@DATE@/$(date)/" doc/kresd.8.in > doc/kresd.8
$(INSTALL) -d -m 0755 $(DESTDIR)$(PREFIX)/share/man/man8/
$(INSTALL) -m 0644 doc/kresd.8 $(DESTDIR)$(PREFIX)/share/man/man8/
endif
daemon-clean: kresd-clean
@$(RM) daemon/lua/*.inc
.PHONY: daemon daemon-install daemon-clean
/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#include <math.h>
#include <stdatomic.h>
#include "daemon/defer.h"
#include "daemon/session2.h"
#include "daemon/udp_queue.h"
#include "lib/kru.h"
#include "lib/mmapped.h"
#include "lib/resolve.h"
#include "lib/utils.h"
#define V4_PREFIXES (uint8_t[]) { 18, 20, 24, 32 }
#define V4_RATE_MULT (kru_price_t[]) { 768, 256, 32, 1 }
#define V4_SUBPRIO (uint8_t[]) { 0, 1, 3, 7 }
#define V6_PREFIXES (uint8_t[]) { 32, 48, 56, 64, 128 }
#define V6_RATE_MULT (kru_price_t[]) { 64, 4, 3, 2, 1 }
#define V6_SUBPRIO (uint8_t[]) { 2, 4, 5, 6, 7 }
#define SUBPRIO_CNT 8
#define V4_PREFIXES_CNT (sizeof(V4_PREFIXES) / sizeof(*V4_PREFIXES))
#define V6_PREFIXES_CNT (sizeof(V6_PREFIXES) / sizeof(*V6_PREFIXES))
#define MAX_PREFIXES_CNT ((V4_PREFIXES_CNT > V6_PREFIXES_CNT) ? V4_PREFIXES_CNT : V6_PREFIXES_CNT)
struct kru_conf {
uint8_t namespace;
size_t prefixes_cnt;
uint8_t *prefixes;
const kru_price_t *rate_mult;
const uint8_t *subprio;
} const
V4_CONF = {0, V4_PREFIXES_CNT, V4_PREFIXES, V4_RATE_MULT, V4_SUBPRIO},
V6_CONF = {1, V6_PREFIXES_CNT, V6_PREFIXES, V6_RATE_MULT, V6_SUBPRIO};
#define LOADS_THRESHOLDS (uint16_t[]) {1<<4, 1<<8, 1<<12, -1} // the last one should be UINT16_MAX
#define QUEUES_CNT ((sizeof(LOADS_THRESHOLDS) / sizeof(*LOADS_THRESHOLDS) - 1) * SUBPRIO_CNT + 2)
// priority 0 has no subpriorities, +1 for unverified
#define PRIORITY_UDP (QUEUES_CNT - 1) // last queue
#define Q0_INSTANT_LIMIT 1000000 // ns
#define KRU_CAPACITY (1<<19) // same as ratelimiting default
#define BASE_PRICE(nsec) ((uint64_t)KRU_LIMIT * LOADS_THRESHOLDS[0] / (1<<16) * (nsec) / Q0_INSTANT_LIMIT)
#define MAX_DECAY (BASE_PRICE(1000000) / 2) // max value at 50% utilization of single cpu
// see log written by defer_str_conf for details
#define REQ_TIMEOUT 1000000000 // ns (THREAD_CPUTIME), older deferred queries are dropped
#define IDLE_TIMEOUT 1000000 // ns (THREAD_CPUTIME); if exceeded, continue processing after next poll phase
#define PHASE_UDP_TIMEOUT 400000 // ns (THREAD_CPUTIME); switch between udp, non-udp phases
#define PHASE_NON_UDP_TIMEOUT 400000 // ns (THREAD_CPUTIME); after timeout or emptying queue
#define MAX_WAITING_REQS_SIZE (64l * 1024 * 1024) // bytes; if exceeded, some deferred requests are processed in poll phase
// single TCP allocates more than 64KiB wire buffer
// TODO check whether all important allocations are counted;
// different things are not counted: tasks and subsessions (not deferred after creation), uv handles, queues overhead, ...;
// payload is counted either as part of session wire buffer (for stream) or as part of iter ctx (for datagrams)
#define VERBOSE_LOG(...) kr_log_debug(DEFER, " | " __VA_ARGS__)
struct defer {
size_t capacity;
kru_price_t max_decay;
uint32_t log_period;
int cpus;
bool using_avx2;
_Atomic uint32_t log_time;
_Alignas(64) uint8_t kru[];
};
struct defer *defer = NULL;
bool defer_initialized = false;
uint64_t defer_uvtime_stamp = 0;
struct mmapped defer_mmapped = {0};
defer_sample_state_t defer_sample_state = {
.is_accounting = 0,
};
uv_idle_t idle_handle;
static void defer_queues_idle(uv_idle_t *handle);
protolayer_iter_ctx_queue_t queues[QUEUES_CNT];
int waiting_requests = 0;
ptrdiff_t waiting_requests_size = 0; // signed for non-negativeness asserts
int queue_ix = QUEUES_CNT; // MIN( last popped queue, first non-empty queue )
enum phase {
PHASE_NONE,
PHASE_UDP,
PHASE_NON_UDP
} phase = PHASE_NONE;
uint64_t phase_elapsed[3] = { 0 }; // ns; [PHASE_NONE] value is being incremented but never used
const uint64_t phase_limits[3] = {0, PHASE_UDP_TIMEOUT, PHASE_NON_UDP_TIMEOUT};
uint64_t phase_stamp = 0;
static inline bool phase_over_limit(enum phase p)
{
return phase_elapsed[p] >= phase_limits[p];
}
/// Reset elapsed times of phases and set phase to UDP, NON_UDP, or NONE.
static inline void phase_reset(enum phase p)
{
phase_elapsed[PHASE_UDP] = 0;
phase_elapsed[PHASE_NON_UDP] = 0;
phase_stamp = defer_sample_state.stamp;
phase = p;
}
/// Set phase to UDP or NON_UDP if it is not over limit or both are over limit (reset them).
static inline bool phase_try_set(enum phase p)
{
phase_elapsed[phase] += defer_sample_state.stamp - phase_stamp;
phase_stamp = defer_sample_state.stamp;
if (!phase_over_limit(p)) {
phase = p;
return true;
} else if (phase_over_limit(PHASE_UDP) && phase_over_limit(PHASE_NON_UDP)) {
phase_reset(p);
return true;
}
return false;
}
struct pl_defer_sess_data {
struct protolayer_data h;
protolayer_iter_ctx_queue_t queue; // properly ordered sequence of deferred packets, for stream only
// the first ctx in the queue is also in a defer queue
size_t size;
};
struct pl_defer_iter_data {
struct protolayer_data h;
uint64_t req_stamp; // time when request was received, uses get_stamp()
size_t size;
};
/// Return whether we're using optimized variant right now.
static bool using_avx2(void)
{
bool result = (KRU.initialize == KRU_AVX2.initialize);
kr_require(result || KRU.initialize == KRU_GENERIC.initialize);
return result;
}
/// Print configuration into desc array.
void defer_str_conf(char *desc, int desc_len)
{
int len = 0;
#define append(...) len += snprintf(desc + len, desc_len > len ? desc_len - len : 0, __VA_ARGS__)
#define append_time(prefix, ms, suffix) { \
if ((ms) < 1) append(prefix "%7.1f us" suffix, (ms) * 1000); \
else if ((ms) < 1000) append(prefix "%7.1f ms" suffix, (ms)); \
else append(prefix "%7.1f s " suffix, (ms) / 1000); }
append( " Expected cpus/procs: %5d\n", defer->cpus);
const size_t thresholds = sizeof(LOADS_THRESHOLDS) / sizeof(*LOADS_THRESHOLDS);
append( " Max waiting requests:%7.1f MiB\n", MAX_WAITING_REQS_SIZE / 1024.0 / 1024.0);
append_time(" Request timeout: ", REQ_TIMEOUT / 1000000.0, "\n");
append_time(" Idle: ", IDLE_TIMEOUT / 1000000.0, "\n");
append_time(" UDP phase: ", PHASE_UDP_TIMEOUT / 1000000.0, "\n");
append_time(" Non-UDP phase: ", PHASE_NON_UDP_TIMEOUT / 1000000.0, "\n");
append( " Priority levels: %5ld (%ld main levels, %d sublevels) + UDP\n", QUEUES_CNT - 1, thresholds, SUBPRIO_CNT);
size_t capacity_log = 0;
for (size_t c = defer->capacity - 1; c > 0; c >>= 1) capacity_log++;
size_t size = offsetof(struct defer, kru) + KRU.get_size(capacity_log);
append( " KRU capacity: %7.1f k (%0.1f MiB)\n", (1 << capacity_log) / 1000.0, size / 1000000.0);
bool uniform_thresholds = true;
for (int i = 1; i < thresholds - 1; i++)
uniform_thresholds &= (LOADS_THRESHOLDS[i] == LOADS_THRESHOLDS[i - 1] * LOADS_THRESHOLDS[0]);
uniform_thresholds &= ((1<<16) == (int)LOADS_THRESHOLDS[thresholds - 2] * LOADS_THRESHOLDS[0]);
append( " Decay: %7.3f %% per ms (32-bit max: %d)\n",
100.0 * defer->max_decay / KRU_LIMIT, defer->max_decay);
float half_life = -1.0 / log2f(1.0 - (float)defer->max_decay / KRU_LIMIT);
append_time(" Half-life: ", half_life, "\n");
if (uniform_thresholds)
append_time(" Priority rise in: ", half_life * 16 / thresholds, "\n");
append_time(" Counter reset in: ", half_life * 16, "\n");
append(" Rate limits for crossing priority levels as single CPU utilization:\n");
const struct kru_conf *kru_confs[] = {&V4_CONF, &V6_CONF};
const int version[] = {4, 6};
const kru_price_t base_price_ms = BASE_PRICE(1000000);
append("%15s", "");
for (int j = 0; j < 3; j++)
append("%14d", j+1);
append("%14s\n", "max");
for (int v = 0; v < 2; v++) {
for (int i = kru_confs[v]->prefixes_cnt - 1; i >= 0; i--) {
append("%9sv%d/%-3d: ", "", version[v], kru_confs[v]->prefixes[i]);
for (int j = 0; j < thresholds; j++) {
float needed_util = (float)defer->max_decay / (1<<16) * LOADS_THRESHOLDS[j] / base_price_ms * kru_confs[v]->rate_mult[i];
append("%12.3f %%", needed_util * 100);
}
append("\n");
}
}
append(" Instant limits for crossing priority levels as CPU time:\n");
append("%15s", "");
for (int j = 0; j < 3; j++)
append("%14d", j+1);
append("%14s\n", "max");
for (int v = 0; v < 2; v++) {
for (int i = kru_confs[v]->prefixes_cnt - 1; i >= 0; i--) {
append("%9sv%d/%-3d: ", "", version[v], kru_confs[v]->prefixes[i]);
for (int j = 0; j < thresholds; j++) {
float needed_time = (float)KRU_LIMIT / (1<<16) * LOADS_THRESHOLDS[j] / base_price_ms * kru_confs[v]->rate_mult[i];
if (needed_time < 1) {
append("%11.1f us", needed_time * 1000);
} else if (needed_time < 1000) {
append("%11.1f ms", needed_time);
} else {
append("%11.1f s ", needed_time / 1000);
}
}
append("\n");
}
}
append(" (values above max are indistinguishable)\n");
#undef append_time
#undef append
}
void defer_set_price_factor16(struct kr_request *req, uint32_t price_factor16)
{
req->qsource.price_factor16 = defer_sample_state.price_factor16 = price_factor16;
}
/// Call KRU, return priority and as params load and prefix.
static inline int kru_charge_classify(const struct kru_conf *kru_conf, uint8_t *key, kru_price_t *prices,
uint16_t *out_load, uint8_t *out_prefix)
{
uint16_t loads[kru_conf->prefixes_cnt];
KRU.load_multi_prefix((struct kru *)defer->kru, kr_now(),
kru_conf->namespace, key, kru_conf->prefixes, prices, kru_conf->prefixes_cnt, loads);
int priority = 0;
int prefix_index = kru_conf->prefixes_cnt - 1;
for (int i = kru_conf->prefixes_cnt - 1, j = 0; i >= 0; i--) {
for (; LOADS_THRESHOLDS[j] < loads[i]; j++) {
prefix_index = i;
priority = 1 + j * SUBPRIO_CNT + kru_conf->subprio[i];
}
}
*out_load = loads[prefix_index];
*out_prefix = kru_conf->prefixes[prefix_index];
return priority;
}
/// Increment KRU counters by given time.
void defer_charge(uint64_t nsec, union kr_sockaddr *addr, bool stream)
{
if (!stream) return; // UDP is not accounted in KRU; TODO remove !stream invocations?
// compute time adjusted by the price factor
uint64_t nsec_adj;
const uint32_t pf16 = defer_sample_state.price_factor16;
if (pf16 == 0) return; // whitelisted
if (nsec < (1ul<<32)) { // simple way with standard rounding
nsec_adj = (nsec * pf16 + (1<<15)) >> 16;
} else { // afraid of overflow, so we swap the order of the math
nsec_adj = ((nsec + (1<<15)) >> 16) * pf16;
}
_Alignas(16) uint8_t key[16] = {0, };
const struct kru_conf *kru_conf;
if (addr->ip.sa_family == AF_INET6) {
memcpy(key, &addr->ip6.sin6_addr, 16);
kru_conf = &V6_CONF;
} else if (addr->ip.sa_family == AF_INET) {
memcpy(key, &addr->ip4.sin_addr, 4);
kru_conf = &V4_CONF;
} else {
return;
}
uint64_t base_price = BASE_PRICE(nsec_adj);
kru_price_t prices[kru_conf->prefixes_cnt];
for (size_t i = 0; i < kru_conf->prefixes_cnt; i++) {
uint64_t price = base_price / kru_conf->rate_mult[i];
prices[i] = price > (kru_price_t)-1 ? -1 : price;
}
uint16_t load;
uint8_t prefix;
kru_charge_classify(kru_conf, key, prices, &load, &prefix);
VERBOSE_LOG(" %s ADD %4.3f ms * %.2f -> load: %d on /%d\n",
kr_straddr(&addr->ip), nsec / 1000000.0, pf16 / (float)(1<<16), load, prefix);
}
/// Determine priority of the request in [0, QUEUES_CNT - 1];
/// lower value has higher priority; plain UDP always gets PRIORITY_UDP.
static inline int classify(const union kr_sockaddr *addr, bool stream)
{
if (!stream) { // UDP
VERBOSE_LOG(" unverified address\n");
return PRIORITY_UDP;
}
_Alignas(16) uint8_t key[16] = {0, };
const struct kru_conf *kru_conf = NULL;
if (addr->ip.sa_family == AF_INET6) {
memcpy(key, &addr->ip6.sin6_addr, 16);
kru_conf = &V6_CONF;
} else if (addr->ip.sa_family == AF_INET) {
memcpy(key, &addr->ip4.sin_addr, 4);
kru_conf = &V4_CONF;
} else {
kr_assert(false);
return 0; // shouldn't happen anyway
}
uint16_t load;
uint8_t prefix;
int priority = kru_charge_classify(kru_conf, key, NULL, &load, &prefix);
VERBOSE_LOG(" load %d on /%d\n", load, prefix);
return priority;
}
/// Push query to a queue according to its priority.
static inline void push_query(struct protolayer_iter_ctx *ctx, int priority, bool to_head_end)
{
if (to_head_end) {
queue_push_head(queues[priority], ctx);
} else {
queue_push(queues[priority], ctx);
}
queue_ix = MIN(queue_ix, priority);
waiting_requests++;
}
/// Pop and return a query from the specified queue.
static inline struct protolayer_iter_ctx *pop_query_queue(int priority)
{
kr_assert(queue_len(queues[priority]) > 0);
struct protolayer_iter_ctx *ctx = queue_head(queues[priority]);
queue_pop(queues[priority]);
waiting_requests--;
kr_assert(waiting_requests >= 0);
return ctx;
}
/// Pop and return the query with the highest priority, UDP or non-UDP based on the current phase.
static inline struct protolayer_iter_ctx *pop_query(void)
{
const int waiting_udp = queue_len(queues[PRIORITY_UDP]);
const int waiting_non_udp = waiting_requests - waiting_udp;
if (!((waiting_non_udp > 0) && phase_try_set(PHASE_NON_UDP)) &&
!((waiting_udp > 0) && phase_try_set(PHASE_UDP)))
phase_reset(waiting_non_udp > 0 ? PHASE_NON_UDP : PHASE_UDP);
int i;
if (phase == PHASE_NON_UDP) {
for (; queue_ix < QUEUES_CNT && queue_len(queues[queue_ix]) == 0; queue_ix++);
if (kr_fails_assert(queue_ix < PRIORITY_UDP))
return NULL;
i = queue_ix;
} else {
i = PRIORITY_UDP;
}
return pop_query_queue(i);
}
/// Break the given query; for streams, also break all follow-up queries and force-close the stream.
static inline void break_query(struct protolayer_iter_ctx *ctx, int err)
{
if (ctx->session->stream) {
struct session2 *s = ctx->session;
struct pl_defer_sess_data *sdata = protolayer_sess_data_get_current(ctx);
s->ref_count++; // keep session and sdata alive for a while
waiting_requests_size -= sdata->size;
if (!ctx->session->closing) {
session2_force_close(ctx->session);
}
kr_assert(ctx == queue_head(sdata->queue));
while (true) {
queue_pop(sdata->queue);
if (ctx) {
struct pl_defer_iter_data *idata = protolayer_iter_data_get_current(ctx);
waiting_requests_size -= idata->size;
protolayer_break(ctx, kr_error(err));
}
if (queue_len(sdata->queue) == 0) break;
ctx = queue_head(sdata->queue);
}
session2_unhandle(s); // decrease ref_count
} else {
struct pl_defer_iter_data *idata = protolayer_iter_data_get_current(ctx);
waiting_requests_size -= idata->size;
protolayer_break(ctx, kr_error(err));
}
kr_assert(waiting_requests ? waiting_requests_size > 0 : waiting_requests_size == 0);
}
/// Process a single deferred query (or defer again) if there is any.
/// Time accounting should have been just started, the stamp is used, accounted address is set.
static inline void process_single_deferred(void)
{
struct protolayer_iter_ctx *ctx = pop_query();
if (kr_fails_assert(ctx)) return;
defer_sample_addr((const union kr_sockaddr *)ctx->comm->src_addr, ctx->session->stream);
struct pl_defer_iter_data *idata = protolayer_iter_data_get_current(ctx);
struct pl_defer_sess_data *sdata = protolayer_sess_data_get_current(ctx);
struct session2 *session = ctx->session;
uint64_t age_ns = defer_sample_state.stamp - idata->req_stamp;
VERBOSE_LOG(" %s POP from %d after %4.3f ms\n",
kr_straddr(ctx->comm->src_addr),
queue_ix,
age_ns / 1000000.0);
if (ctx->session->closing) {
VERBOSE_LOG(" BREAK (session is closing)\n");
break_query(ctx, ECANCELED);
return;
}
if (age_ns >= REQ_TIMEOUT) {
VERBOSE_LOG(" BREAK (timeout)\n");
// notice logging according to log-period
const uint32_t time_now = kr_now();
uint32_t log_time_orig = atomic_load_explicit(&defer->log_time, memory_order_relaxed);
if (defer->log_period) {
while (time_now - log_time_orig + 1024 >= defer->log_period + 1024) {
if (atomic_compare_exchange_weak_explicit(&defer->log_time, &log_time_orig, time_now,
memory_order_relaxed, memory_order_relaxed)) {
kr_log_notice(DEFER, "Data from %s too long in queue, dropping. (%0.3f MiB in queues)\n",
kr_straddr(ctx->comm->src_addr), waiting_requests_size / 1024.0 / 1024.0);
break;
}
}
}
break_query(ctx, ETIME);
return;
}
bool eof = false;
if (ctx->session->stream) {
int priority = classify((const union kr_sockaddr *)ctx->comm->src_addr, ctx->session->stream);
if (priority > queue_ix) { // priority dropped (got higher value)
VERBOSE_LOG(" PUSH to %d\n", priority);
push_query(ctx, priority, false);
return;
}
kr_assert(queue_head(sdata->queue) == ctx);
queue_pop(sdata->queue);
while ((queue_len(sdata->queue) > 0) && (queue_head(sdata->queue) == NULL)) { // EOF event
eof = true;
queue_pop(sdata->queue);
}
if (queue_len(sdata->queue) > 0) {
VERBOSE_LOG(" PUSH follow-up to head of %d\n", priority);
push_query(queue_head(sdata->queue), priority, true);
} else {
waiting_requests_size -= sdata->size;
}
}
waiting_requests_size -= idata->size;
kr_assert(waiting_requests ? waiting_requests_size > 0 : waiting_requests_size == 0);
if (eof) {
// Keep session alive even if it is somehow force-closed during continuation.
// TODO Is it possible?
session->ref_count++;
}
VERBOSE_LOG(" CONTINUE\n");
protolayer_continue(ctx);
if (eof) {
VERBOSE_LOG(" CONTINUE EOF event\n");
session2_event_after(session, PROTOLAYER_TYPE_DEFER, PROTOLAYER_EVENT_EOF, NULL);
session2_unhandle(session); // decrease ref_count
}
}
/// Process as many deferred requests as needed to get memory consumption under limit.
static inline void process_deferred_over_size_limit(void) {
if (waiting_requests_size > MAX_WAITING_REQS_SIZE) {
defer_sample_state_t prev_sample_state;
defer_sample_start(&prev_sample_state);
do {
process_single_deferred(); // possibly defers again without decreasing waiting_requests_size
// If the unwrapped query is to be processed here,
// it is the last iteration and the query is processed after returning.
defer_sample_restart();
} while (waiting_requests_size > MAX_WAITING_REQS_SIZE);
defer_sample_stop(&prev_sample_state, true);
}
}
/// Break expired requests at the beginning of queues, uses current stamp.
static inline void cleanup_queues(void)
{
for (int i = 0; i < QUEUES_CNT; i++) {
int cnt = 0;
while (queue_len(queues[i]) > 0) {
struct protolayer_iter_ctx *ctx = queue_head(queues[i]);
struct pl_defer_iter_data *idata = protolayer_iter_data_get_current(ctx);
uint64_t age_ns = defer_sample_state.stamp - idata->req_stamp;
if (age_ns < REQ_TIMEOUT) break;
pop_query_queue(i);
break_query(ctx, ETIME);
cnt++;
}
if (cnt > 0) {
VERBOSE_LOG(" BREAK %d queries from %d\n", cnt, i);
}
}
}
/// Unwrap: defer or process the query synchronously.
/// Time accounting should have been started, the stamp is used, accounted address is set.
static enum protolayer_iter_cb_result pl_defer_unwrap(
void *sess_data, void *iter_data,
struct protolayer_iter_ctx *ctx)
{
if (!defer || ctx->session->outgoing)
return protolayer_continue(ctx);
defer_sample_addr((const union kr_sockaddr *)ctx->comm->src_addr, ctx->session->stream);
struct pl_defer_iter_data *idata = iter_data;
struct pl_defer_sess_data *sdata = sess_data;
idata->req_stamp = defer_sample_state.stamp;
VERBOSE_LOG(" %s UNWRAP\n",
kr_straddr(ctx->comm->src_addr));
uv_idle_start(&idle_handle, defer_queues_idle);
if (queue_len(sdata->queue) > 0) { // stream with preceding packet already deferred
queue_push(sdata->queue, ctx);
waiting_requests_size += idata->size = protolayer_iter_size_est(ctx, false);
// payload counted in session wire buffer
VERBOSE_LOG(" PUSH as follow-up\n");
process_deferred_over_size_limit();
return protolayer_async();
}
int priority = classify((const union kr_sockaddr *)ctx->comm->src_addr, ctx->session->stream);
// Process synchronously unless there may exist requests that have to be processed first
if (((priority == 0) || (priority == PRIORITY_UDP)) && (queue_len(queues[priority]) == 0) &&
phase_try_set(priority == PRIORITY_UDP ? PHASE_UDP : PHASE_NON_UDP)) {
VERBOSE_LOG(" CONTINUE\n");
return protolayer_continue(ctx);
}
VERBOSE_LOG(" PUSH to %d\n", priority);
if (ctx->session->stream) {
queue_push(sdata->queue, ctx);
waiting_requests_size += sdata->size = protolayer_sess_size_est(ctx->session);
}
push_query(ctx, priority, false);
waiting_requests_size += idata->size = protolayer_iter_size_est(ctx, !ctx->session->stream);
// for stream, payload is counted in session wire buffer
process_deferred_over_size_limit();
return protolayer_async();
}
/// Unwrap event: EOF event may be deferred here, other events pass synchronously.
static enum protolayer_event_cb_result pl_defer_event_unwrap(
enum protolayer_event_type event, void **baton,
struct session2 *session, void *sess_data)
{
if (!defer || !session->stream || session->outgoing)
return PROTOLAYER_EVENT_PROPAGATE;
defer_sample_addr((const union kr_sockaddr *)session->comm_storage.src_addr, session->stream);
struct pl_defer_sess_data *sdata = sess_data;
if ((event == PROTOLAYER_EVENT_EOF) && (queue_len(sdata->queue) > 0)) {
// defer EOF event if unprocessed data remain, baton is dropped if any
queue_push(sdata->queue, NULL);
VERBOSE_LOG(" %s event %s deferred\n",
session->comm_storage.src_addr ? kr_straddr(session->comm_storage.src_addr) : "(null)",
protolayer_event_name(event));
return PROTOLAYER_EVENT_CONSUME;
}
VERBOSE_LOG(" %s event %s passes through synchronously%s%s\n",
session->comm_storage.src_addr ? kr_straddr(session->comm_storage.src_addr) : "(null)",
protolayer_event_name(event),
queue_len(sdata->queue) > 0 ? " ahead of deferred data" : "",
*baton ? " (with baton)" : "");
return PROTOLAYER_EVENT_PROPAGATE;
}
/// Idle: continue processing deferred requests.
static void defer_queues_idle(uv_idle_t *handle)
{
VERBOSE_LOG("IDLE\n");
if (waiting_requests > 0) {
VERBOSE_LOG(" %d waiting\n", waiting_requests);
defer_sample_start(NULL);
uint64_t idle_stamp = defer_sample_state.stamp;
do {
process_single_deferred();
defer_sample_restart();
} while ((waiting_requests > 0) && (defer_sample_state.stamp < idle_stamp + IDLE_TIMEOUT));
defer_sample_stop(NULL, true);
cleanup_queues();
udp_queue_send_all();
}
if (waiting_requests > 0) {
VERBOSE_LOG(" %d waiting\n", waiting_requests);
} else {
phase_reset(PHASE_NONE);
VERBOSE_LOG(" deactivate idle\n");
uv_idle_stop(&idle_handle);
}
VERBOSE_LOG("POLL\n");
}
/// Initialize shared memory, queues. To be called from Lua.
int defer_init(const char *mmap_file, uint32_t log_period, int cpus) // TODO possibly remove cpus; not needed
{
defer_initialized = true;
if (mmap_file == NULL) {
// defer explicitly disabled
return 0;
}
int ret = 0;
if (cpus < 1) {
ret = EINVAL;
goto fail;
}
struct defer header = {
.capacity = KRU_CAPACITY,
.max_decay = MAX_DECAY,
.log_period = log_period,
.cpus = cpus,
.using_avx2 = using_avx2(),
};
size_t capacity_log = 0;
for (size_t c = header.capacity - 1; c > 0; c >>= 1) capacity_log++;
size_t size = offsetof(struct defer, kru) + KRU.get_size(capacity_log);
size_t header_size = offsetof(struct defer, using_avx2) + sizeof(header.using_avx2);
static_assert( // no padding up to .using_avx2
offsetof(struct defer, using_avx2) ==
sizeof(header.capacity) +
sizeof(header.max_decay) +
sizeof(header.log_period) +
sizeof(header.cpus),
"detected padding with undefined data inside mmapped header");
ret = mmapped_init(&defer_mmapped, mmap_file, size, &header, header_size);
if (ret == MMAPPED_WAS_FIRST) {
kr_log_info(DEFER, "Initializing defer...\n");
defer = defer_mmapped.mem;
bool succ = KRU.initialize((struct kru *)defer->kru, capacity_log, header.max_decay);
if (!succ) {
defer = NULL;
ret = kr_error(EINVAL);
goto fail;
}
defer->log_time = kr_now() - log_period;
ret = mmapped_init_continue(&defer_mmapped);
if (ret != 0) goto fail;
kr_log_info(DEFER, "Defer initialized (%s).\n", (defer->using_avx2 ? "AVX2" : "generic"));
// log current configuration
if (KR_LOG_LEVEL_IS(LOG_INFO) || kr_log_group_is_set(LOG_GRP_DEFER)) {
char desc[8000];
defer_str_conf(desc, sizeof(desc));
kr_log_info(DEFER, "Defer configuration:\n%s", desc);
}
} else if (ret == 0) {
defer = defer_mmapped.mem;
kr_log_info(DEFER, "Using existing defer data (%s).\n", (defer->using_avx2 ? "AVX2" : "generic"));
} else goto fail;
for (size_t i = 0; i < QUEUES_CNT; i++)
queue_init(queues[i]);
return 0;
fail:
kr_log_crit(DEFER, "Initialization of shared defer data failed.\n");
return ret;
}
/// Initialize idle.
int defer_init_idle(uv_loop_t *loop)
{
return uv_idle_init(loop, &idle_handle);
}
/// Initialize session queue
int pl_defer_sess_init(struct session2 *session, void *data, void *param)
{
struct pl_defer_sess_data *sdata = data;
queue_init(sdata->queue);
return 0;
}
/// Deinitialize shared memory.
void defer_deinit(void)
{
mmapped_deinit(&defer_mmapped);
defer = NULL;
}
/// Initialize protolayer.
__attribute__((constructor))
static void defer_protolayers_init(void)
{
protolayer_globals[PROTOLAYER_TYPE_DEFER] = (struct protolayer_globals){
.iter_size = sizeof(struct pl_defer_iter_data),
.sess_size = sizeof(struct pl_defer_sess_data),
.sess_init = pl_defer_sess_init,
.unwrap = pl_defer_unwrap,
.event_unwrap = pl_defer_event_unwrap,
};
}