.. SPDX-License-Identifier: GPL-3.0-or-later
Buffering tweaks
----------------
We (can) set various server-side socket options that affect buffering.
The values are stored in C structures without real Lua bindings,
so setting them is a bit verbose.
.. py:data:: (require 'ffi').C.the_worker.engine.net.tcp.user_timeout
On TCP-based server-side sockets we set the ``TCP_USER_TIMEOUT`` option if available (roughly: on Linux).
We use a default of 1000, i.e. one second. For details see the definition in ``man tcp.7``.
.. py:data:: (require 'ffi').C.the_worker.engine.net.listen_tcp_buflens.snd
.. py:data:: (require 'ffi').C.the_worker.engine.net.listen_tcp_buflens.rcv
.. py:data:: (require 'ffi').C.the_worker.engine.net.listen_udp_buflens.snd
.. py:data:: (require 'ffi').C.the_worker.engine.net.listen_udp_buflens.rcv
If overridden to nonzero, these variables instruct the OS to modify kernel-space buffers
for server-side sockets. We split the setting for UDP vs. TCP and sending vs. receiving.
For details see ``SO_SNDBUF`` and ``SO_RCVBUF`` in ``man socket.7``.
Beyond the immediate manipulation there is no user-space buffering; only the OS keeps some.
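A minimal sketch of overriding these values from the Lua configuration follows; the field paths are exactly those listed above, while the chosen numbers are only illustrative:
.. code-block:: lua
local C = require('ffi').C
-- raise TCP_USER_TIMEOUT to 2 seconds (the value is in milliseconds)
C.the_worker.engine.net.tcp.user_timeout = 2000
-- request 1 MiB kernel-space receive buffers for UDP listening sockets
C.the_worker.engine.net.listen_udp_buflens.rcv = 1024 * 1024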
.. SPDX-License-Identifier: GPL-3.0-or-later
IPv4 and IPv6 usage
-------------------
The following settings affect the client part of the resolver,
i.e. communication between the resolver itself and other DNS servers.
IPv4 and IPv6 protocols are used by default. For performance reasons it is
recommended to explicitly disable protocols which are not available
on your system, though the impact of an IPv6 outage has been reduced since release 5.3.0.
.. envvar:: net.ipv4 = true|false
:return: boolean (default: true)
Enable/disable using IPv4 for contacting upstream nameservers.
.. envvar:: net.ipv6 = true|false
:return: boolean (default: true)
Enable/disable using IPv6 for contacting upstream nameservers.
.. function:: net.outgoing_v4([string address])
Get/set the IPv4 address used to perform queries.
The default is ``nil``, which lets the OS choose any address.
.. function:: net.outgoing_v6([string address])
Get/set the IPv6 address used to perform queries.
The default is ``nil``, which lets the OS choose any address.
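For example, a short sketch that restricts upstream communication to IPv4 and pins the source address (the address is illustrative):
.. code-block:: lua
net.ipv6 = false -- do not contact upstream nameservers over IPv6
net.outgoing_v4('192.0.2.10') -- send all IPv4 queries from this address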
.. SPDX-License-Identifier: GPL-3.0-or-later
DNS protocol tweaks
-------------------
The following settings change low-level details of the DNS protocol implementation.
The default values should not be changed except in very special cases.
.. function:: net.bufsize([udp_downstream_bufsize][, udp_upstream_bufsize])
Get/set the maximum EDNS payload size advertised in DNS packets. Different values can be configured for communication downstream (towards clients) and upstream (towards other DNS servers). Both the set and get operations use the values in this order.
The default is 1232 bytes, which was chosen to minimize the risk of `issues caused by IP fragmentation <https://blog.apnic.net/2019/07/12/its-time-to-consider-avoiding-ip-fragmentation-in-the-dns/>`_. Further details can be found at the `DNS Flag Day 2020 <https://www.dnsflagday.net/2020/>`_ web site.
The minimal value allowed by standard :rfc:`6891` is 512 bytes, which is equal to the maximum DNS packet size without Extension Mechanisms for DNS. A value of 1220 bytes is the minimum size required by the DNSSEC standard :rfc:`4035`.
Example output:
.. code-block:: lua
-- set downstream and upstream bufsize to value 4096
> net.bufsize(4096)
-- get configured downstream and upstream bufsizes, respectively
> net.bufsize()
4096 -- result # 1
4096 -- result # 2
-- set downstream bufsize to 4096 and upstream bufsize to 1232
> net.bufsize(4096, 1232)
-- get configured downstream and upstream bufsizes, respectively
> net.bufsize()
4096 -- result # 1
1232 -- result # 2
.. include:: ../../modules/workarounds/README.rst
.. SPDX-License-Identifier: GPL-3.0-or-later
Addresses and services
----------------------
Addresses, ports, protocols, and API calls available to clients communicating
with the resolver are configured using :func:`net.listen`.
First you need to decide which service should be available on a given
IP address and port combination.
.. csv-table::
:header: "Protocol/service", "net.listen *kind*"
"DNS (unencrypted UDP+TCP, :rfc:`1034`)","``dns``"
"DNS (unencrypted UDP, :ref:`using XDP Linux API <dns-over-xdp>`)","``xdp``"
":ref:`dns-over-tls`","``tls``"
":ref:`dns-over-https`","``doh2``"
":ref:`Web management <mod-http-built-in-services>`","``webmgmt``"
":ref:`Control socket <control-sockets>`","``control``"
":ref:`mod-http-doh`","``doh_legacy``"
.. note:: By default, **unencrypted DNS and DNS-over-TLS** are configured to **listen
on localhost**.
Control sockets are created either in
``/run/knot-resolver/control/`` (when using systemd) or ``$PWD/control/``.
.. function:: net.listen(addresses, [port = 53, { kind = 'dns', freebind = false }])
:return: ``true`` if the port is bound, an error otherwise
Listen on the given addresses; port and flags are optional.
Each address can be specified as a string or a device.
Port 853 implies ``kind = 'tls'``, but it is always better to be explicit.
The ``freebind`` option allows binding to a non-local or not yet available address.
.. csv-table::
:header: "**Network protocol**", "**Configuration command**"
"DNS (UDP+TCP, :rfc:`1034`)","``net.listen('192.0.2.123', 53)``"
"DNS (UDP, :ref:`using XDP <dns-over-xdp>`)","``net.listen('192.0.2.123', 53, { kind = 'xdp' })``"
":ref:`dns-over-tls`","``net.listen('192.0.2.123', 853, { kind = 'tls' })``"
":ref:`dns-over-https`","``net.listen('192.0.2.123', 443, { kind = 'doh2' })``"
":ref:`Web management <mod-http-built-in-services>`","``net.listen('192.0.2.123', 8453, { kind = 'webmgmt' })``"
":ref:`Control socket <control-sockets>`","``net.listen('/tmp/kres.control', nil, { kind = 'control' })``"
Examples:
.. code-block:: lua
net.listen('::1')
net.listen(net.lo, 53)
net.listen(net.eth0, 853, { kind = 'tls' })
net.listen('192.0.2.1', 53, { freebind = true })
net.listen({'127.0.0.1', '::1'}, 53, { kind = 'dns' })
net.listen('::', 443, { kind = 'doh2' })
net.listen('::', 8453, { kind = 'webmgmt' }) -- see http module
net.listen('/tmp/kresd-socket', nil, { kind = 'webmgmt' }) -- http module supports AF_UNIX
net.listen('eth0', 53, { kind = 'xdp' })
net.listen('192.0.2.123', 53, { kind = 'xdp', nic_queue = 0 })
.. warning:: On machines with multiple IP addresses avoid listening on wildcards
``0.0.0.0`` or ``::``. Knot Resolver could answer from different IP
addresses if the network address ranges overlap,
and clients would probably refuse such a response.
.. _proxyv2:
PROXYv2 protocol
^^^^^^^^^^^^^^^^
Knot Resolver supports proxies that utilize the `PROXYv2 protocol <https://www.haproxy.org/download/2.5/doc/proxy-protocol.txt>`_
to identify clients.
A PROXY header contains the IP address of the original client who sent a query.
This allows the resolver to treat queries as if they actually came from
the client's IP address rather than the address of the proxy they came through.
For example, :ref:`Views and ACLs <mod-view>` are able to work properly when
PROXYv2 is in use.
Allowing the PROXYv2 protocol for all clients would be a security
vulnerability, because clients would then be able to spoof their IP addresses via
the PROXYv2 header. The resolver therefore requires you to specify explicitly which
clients are allowed to send PROXYv2 headers, using the :func:`net.proxy_allowed` function.
PROXYv2 queries from clients who are not explicitly allowed to use this protocol
are discarded.
.. function:: net.proxy_allowed([addresses])
Allow usage of the PROXYv2 protocol headers by clients on the specified
``addresses``. It is possible to permit whole networks to send PROXYv2 headers
by specifying the network mask using the CIDR notation
(e.g. ``172.22.0.0/16``). IPv4 as well as IPv6 addresses are supported.
If you wish to allow all clients to use PROXYv2 (e.g. because you have this
kind of security handled on another layer of your network infrastructure),
you can specify a netmask of ``/0``. Please note that this setting is
address-family-specific, so this needs to be applied to both IPv4 and IPv6
separately.
Subsequent calls to the function overwrite the effects of all previous calls.
Providing a table of strings as the function parameter allows multiple
distinct addresses to use the PROXYv2 protocol.
When called without arguments, ``net.proxy_allowed`` returns a table of all
addresses currently allowed to use the PROXYv2 protocol and does not change
the configuration.
Examples:
.. code-block:: lua
net.proxy_allowed('172.22.0.1') -- allows '172.22.0.1' specifically
net.proxy_allowed('172.18.1.0/24') -- allows everyone at '172.18.1.*'
net.proxy_allowed({
'172.22.0.1', '172.18.1.0/24'
}) -- allows both of the above at once
net.proxy_allowed({ 'fe80::/10' }) -- allows everyone at IPv6 link-local
net.proxy_allowed({
'::/0', '0.0.0.0/0'
}) -- allows everyone
net.proxy_allowed('::/0') -- allows all IPv6 (but no IPv4)
net.proxy_allowed({}) -- prevents everyone from using PROXYv2
net.proxy_allowed() -- returns a list of all currently allowed addresses
Features for scripting
^^^^^^^^^^^^^^^^^^^^^^
The following configuration functions are useful mainly for scripting or :ref:`runtime-cfg`.
.. function:: net.close(address, [port])
:return: boolean (at least one endpoint closed)
Close all endpoints listening on the specified address, optionally restricted by port as well.
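Example (the addresses are illustrative):
.. code-block:: lua
net.close('127.0.0.1') -- close all endpoints on the address
net.close('192.0.2.123', 53) -- close only the endpoint on port 53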
.. function:: net.list()
:return: Table of bound interfaces.
Example output:
.. code-block:: none
[1] => {
[kind] => tls
[transport] => {
[family] => inet4
[ip] => 127.0.0.1
[port] => 853
[protocol] => tcp
}
}
[2] => {
[kind] => dns
[transport] => {
[family] => inet6
[ip] => ::1
[port] => 53
[protocol] => udp
}
}
[3] => {
[kind] => dns
[transport] => {
[family] => inet6
[ip] => ::1
[port] => 53
[protocol] => tcp
}
}
[4] => {
[kind] => xdp
[transport] => {
[family] => inet4+inet6
[interface] => eth2
[nic_queue] => 0
[port] => 53
[protocol] => udp
}
}
.. function:: net.interfaces()
:return: Table of available interfaces and their addresses.
Example output:
.. code-block:: none
[lo0] => {
[addr] => {
[1] => ::1
[2] => 127.0.0.1
}
[mac] => 00:00:00:00:00:00
}
[eth0] => {
[addr] => {
[1] => 192.168.0.1
}
[mac] => de:ad:be:ef:aa:bb
}
.. tip:: You can use ``net.<iface>`` as a shortcut for specific interface, e.g. ``net.eth0``
.. function:: net.tcp_pipeline([len])
Get/set the per-client TCP pipeline limit, i.e. the number of outstanding queries that a single client connection can make in parallel. The default is 100.
.. code-block:: lua
> net.tcp_pipeline()
100
> net.tcp_pipeline(50)
50
.. warning:: Please note that a limit that is too large may have a negative impact on performance and can lead to an increased number of SERVFAIL answers.
.. SPDX-License-Identifier: GPL-3.0-or-later
.. _tls-server-config:
DoT and DoH (encrypted DNS)
---------------------------
.. warning::
It is important to understand **limits of encrypting only DNS traffic**.
Relevant security analysis can be found in article
*Simran Patil and Nikita Borisov. 2019. What can you learn from an IP?*
See `slides <https://irtf.org/anrw/2019/slides-anrw19-final44.pdf>`_
or `the article itself <https://dl.acm.org/authorize?N687437>`_.
DoT and DoH encrypt DNS traffic with the Transport Layer Security (TLS) protocol
and thus protect DNS traffic from certain types of attacks.
You can learn more about DoT and DoH and their implementation in Knot Resolver
in `this article
<https://en.blog.nic.cz/2020/11/25/encrypted-dns-in-knot-resolver-dot-and-doh/>`_.
.. _dns-over-tls:
DNS-over-TLS (DoT)
^^^^^^^^^^^^^^^^^^
A DNS-over-TLS server (:rfc:`7858`) can be configured using the ``tls`` kind in
:func:`net.listen()`. It is enabled on localhost by default.
For certificate configuration, refer to :ref:`dot-doh-config-options`.
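For example, a minimal sketch enabling DoT on a specific address:
.. code-block:: lua
net.listen('192.0.2.1', 853, { kind = 'tls' })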
.. _dns-over-https:
DNS-over-HTTPS (DoH)
^^^^^^^^^^^^^^^^^^^^
.. note:: Knot Resolver currently offers two DoH implementations. It is
recommended to use this newer implementation, which is more reliable and scalable
and has fewer dependencies. Make sure to use the ``doh2`` kind in
:func:`net.listen()` to select it.
.. tip:: Independent information about political controversies around the
DoH deployment by default can be found in blog posts `DNS Privacy at IETF
104 <http://www.potaroo.net/ispcol/2019-04/angst.html>`_ and `More DOH
<http://www.potaroo.net/ispcol/2019-04/moredoh.html>`_ by Geoff Huston and
`Centralised DoH is bad for Privacy, in 2019 and beyond
<https://labs.ripe.net/Members/bert_hubert/centralised-doh-is-bad-for-privacy-in-2019-and-beyond>`_
by Bert Hubert.
A DNS-over-HTTPS server (:rfc:`8484`) can be configured using the ``doh2`` kind in
:func:`net.listen()`.
This implementation supports HTTP/2 (:rfc:`7540`). Queries can be sent to the
``/dns-query`` endpoint, e.g.:
.. code-block:: bash
$ kdig @127.0.0.1 +https www.knot-resolver.cz AAAA
**Only TLS version 1.3 (or higher) is supported with DNS-over-HTTPS.** The
additional considerations for TLS 1.2 required by HTTP/2 are not implemented
(:rfc:`7540#section-9.2`).
.. warning:: Take care when configuring your server to listen on a well-known
HTTPS port. If an unrelated HTTPS service is running on the same port with
REUSEPORT enabled, both services will malfunction.
HTTP status codes
"""""""""""""""""
As specified by :rfc:`8484`, the resolver responds with status **200 OK** whenever
it can produce a valid DNS reply for a given query, even in cases where the DNS
``rcode`` indicates an error (like ``NXDOMAIN``, ``SERVFAIL``, etc.).
For DoH queries malformed at the HTTP level, the resolver may respond with
the following status codes:
* **400 Bad Request** for a generally malformed query, like one not containing
a valid DNS packet
* **404 Not Found** when an incorrect HTTP endpoint is queried - the only
supported ones are ``/dns-query`` and ``/doh``
* **413 Payload Too Large** when the DNS query exceeds its maximum size
* **415 Unsupported Media Type** when the query's ``Content-Type`` header
is not ``application/dns-message``
* **431 Request Header Fields Too Large** when a header in the query is too
large to process
* **501 Not Implemented** when the query uses a method other than
``GET``, ``POST``, or ``HEAD``
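These codes can be checked with any HTTP client; here is a sketch using ``curl``, assuming a ``doh2`` endpoint on 127.0.0.1:443 with the default self-signed certificate (hence ``--insecure``):
.. code-block:: bash
# an unsupported endpoint yields 404 Not Found
$ curl --insecure --silent --http2 --output /dev/null --write-out '%{http_code}\n' https://127.0.0.1/bad-endpoint
404
# a POST that is not a valid DNS packet yields 400 Bad Request
$ curl --insecure --silent --http2 --output /dev/null --write-out '%{http_code}\n' -H 'Content-Type: application/dns-message' --data 'not-a-dns-packet' https://127.0.0.1/dns-query
400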
.. _dot-doh-config-options:
Configuration options for DoT and DoH
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. note:: These settings affect both DNS-over-TLS and DNS-over-HTTPS (except
the legacy implementation).
A self-signed certificate is generated by default. For serious deployments
it is strongly recommended to configure your own TLS certificates signed
by a trusted CA. This is done using the function :func:`net.tls()`.
.. function:: net.tls([cert_path], [key_path])
When called with path arguments, the function loads the server TLS
certificate and private key for DoT and DoH.
When called without arguments, the command returns the currently configured paths.
Example output:
.. code-block:: lua
> net.tls("/etc/knot-resolver/server-cert.pem", "/etc/knot-resolver/server-key.pem")
> net.tls() -- print configured paths
[cert_file] => '/etc/knot-resolver/server-cert.pem'
[key_file] => '/etc/knot-resolver/server-key.pem'
.. tip:: The certificate files aren't automatically reloaded on change. If
you update the certificate files, e.g. using ACME, you have to either
restart the service(s) or call this function again using
:ref:`control-sockets`.
.. function:: net.tls_sticket_secret([string with pre-shared secret])
Set the secret for TLS session resumption via tickets, per :rfc:`5077`.
The server-side key is rotated roughly once per hour.
By default, or if called without a secret, the key is random.
That is good for long-term forward secrecy, but multiple kresd instances
won't be able to resume each other's sessions.
If you provide the same secret to multiple instances, they will be able to resume
each other's sessions *without* any further communication between them.
This synchronization works only among instances having the same endianness
and the same ``time_t`` size (``sizeof(time_t)``).
.. _pfs: https://en.wikipedia.org/wiki/Forward_secrecy
**For good security** the secret must have enough entropy to be hard to guess,
and it should still be occasionally rotated manually and then securely forgotten,
to reduce the scope of a privacy leak in case the
`secret leaks eventually <pfs_>`_.
.. warning:: **Setting the secret is probably too risky with TLS <= 1.2 and
GnuTLS < 3.7.5**. GnuTLS 3.7.5 adds an option to disable resumption via
tickets for TLS <= 1.2, enabling them only for protocols that do guarantee
`PFS <pfs_>`_. Knot Resolver makes use of this new option when linked
against GnuTLS >= 3.7.5.
.. function:: net.tls_sticket_secret_file([string with path to a file containing pre-shared secret])
The same as :func:`net.tls_sticket_secret`,
except the secret is read from a (binary) file.
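A minimal sketch of sharing a ticket secret among instances; the path is illustrative and the file must be distributed securely:
.. code-block:: lua
-- identical line in the configuration of every cooperating instance
net.tls_sticket_secret_file('/etc/knot-resolver/tls_sticket.key')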
.. function:: net.tls_padding([true | false])
Get/set EDNS(0) padding of queries and answers sent over an encrypted
channel. If set to `true` (the default), a sensible
default padding scheme is used, as implemented by libknot if available at
compile time. If set to a numeric value >= 2, answers are padded
to the nearest multiple of *padding*, e.g. if set to `64`, the
answer will have a size that is a multiple of 64 (64, 128, 192, ...). If
set to `false` (or a number < 2), padding is disabled entirely.
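Example:
.. code-block:: lua
net.tls_padding(64) -- pad answers to a multiple of 64 bytes
net.tls_padding(false) -- disable padding entirely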
Configuration options for DoH
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. function:: net.doh_headers([string or table of strings])
Selects the headers to be exposed. These headers and their values are
available in ``request.qsource.headers``. Comparison
is case-insensitive and pseudo-headers are supported as well.
The following snippet can be used in a Lua module to access the headers
``:method`` and ``user-agent``:
.. code-block:: lua
net.doh_headers({':method', 'user-agent'})
...
for i = 1, tonumber(req.qsource.headers.len) do
local name = ffi.string(req.qsource.headers.at[i - 1].name)
local value = ffi.string(req.qsource.headers.at[i - 1].value)
print(name, value)
end
.. SPDX-License-Identifier: GPL-3.0-or-later
.. _dns-over-xdp:
XDP for higher UDP performance
------------------------------
.. warning::
As of version 5.2.0, XDP support in Knot Resolver is considered
experimental. The impact on overall throughput and performance may not
always be beneficial.
Using XDP allows a significant speedup of UDP packet processing on recent Linux kernels,
especially with network drivers that implement good support for it.
The basic idea is that selected packets bypass the Linux networking stack,
and some drivers can even use the user-space buffers directly for reading and writing.
.. TODO perhaps some hint/link about how significant speedup one might get? (link to some talk video?)
Prerequisites
^^^^^^^^^^^^^
.. this is mostly copied from knot-dns doc/operations.rst
.. warning::
Bypassing the network stack has significant implications, such as bypassing the firewall
and monitoring solutions.
Make sure you're familiar with the trade-offs before using this feature.
Read more in :ref:`dns-over-xdp_limitations`.
* Linux kernel 4.18+ (5.x+ is recommended for optimal performance) compiled with
the `CONFIG_XDP_SOCKETS=y` option. XDP isn't supported on other operating systems.
* libknot compiled with XDP support
* **A multiqueue network card with native XDP support is highly recommended**,
otherwise the performance gain will be much lower and you may encounter
issues due to XDP emulation.
Successfully tested cards:
* Intel series 700 (driver `i40e`), maximum number of queues per interface is 64.
* Intel series 500 (driver `ixgbe`), maximum number of queues per interface is 64.
The number of CPUs available has to be at most 64!
Set up
^^^^^^
.. first parts are mostly copied from knot-dns doc/operations.rst
The server instances need additional Linux **capabilities** during startup.
(Alternatively, you could start them as `root`.)
Execute the command
.. code-block:: bash
systemctl edit kresd@.service
and insert these lines:
.. code-block:: ini
[Service]
CapabilityBoundingSet=CAP_NET_RAW CAP_NET_ADMIN CAP_SYS_ADMIN CAP_IPC_LOCK CAP_SYS_RESOURCE
AmbientCapabilities=CAP_NET_RAW CAP_NET_ADMIN CAP_SYS_ADMIN CAP_IPC_LOCK CAP_SYS_RESOURCE
``CAP_SYS_RESOURCE`` is only needed on Linux < 5.11.
.. TODO suggest some way for ethtool -L? Perhaps via systemd units?
You want the same number of kresd instances and network **queues** on your card;
you can use ``ethtool -L`` before the services start.
With XDP this is more important than with vanilla UDP, as only one instance
per queue is supported and unclaimed queues fall back to vanilla UDP.
Ideally you can set both numbers as high as the number of CPUs that you want kresd to use.
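For example, assuming four kresd instances serving ``eth2`` (the interface name and count are illustrative):
.. code-block:: bash
ethtool -L eth2 combined 4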
Modification of ``/etc/knot-resolver/kresd.conf`` is often quite simple, for example:
.. code-block:: lua
net.listen('eth2', 53, { kind = 'xdp' })
net.listen('203.0.113.53', 53, { kind = 'dns' })
Note that you also want to keep the vanilla DNS line, to serve TCP
and possibly any fallback UDP (e.g. from unclaimed queues).
XDP listening is in principle done on queues of whole network interfaces
and the target addresses of incoming packets aren't checked in any way,
but you are still allowed to specify the interface by an address
(if it's unambiguous at that moment):
.. code-block:: lua
net.listen('203.0.113.53', 53, { kind = 'xdp' })
net.listen('203.0.113.53', 53, { kind = 'dns' })
The default selection of queues is tailored to the usual naming convention:
``kresd@1.service``, ``kresd@2.service``, ...
but you can still specify them explicitly, e.g. the default is effectively the same as:
.. code-block:: lua
net.listen('eth2', 53, { kind = 'xdp', nic_queue = env.SYSTEMD_INSTANCE - 1 })
Optimizations
^^^^^^^^^^^^^
.. this is basically copied from knot-dns doc/operations.rst
Some helpful commands:
.. code-block:: text
ethtool -N <interface> rx-flow-hash udp4 sdfn
ethtool -N <interface> rx-flow-hash udp6 sdfn
ethtool -L <interface> combined <queue-number>
ethtool -G <interface> rx <ring-size> tx <ring-size>
renice -n 19 -p $(pgrep '^ksoftirqd/[0-9]*$')
.. TODO CPU affinities? `CPUAffinity=%i` in systemd unit sounds good.
.. _dns-over-xdp_limitations:
Limitations
^^^^^^^^^^^
.. this is basically copied from knot-dns doc/operations.rst
* VLAN segmentation is not supported.
* MTU higher than 1792 bytes is not supported.
* Multiple BPF filters per one network device are not supported.
* Symmetrical routing is required (query source MAC/IP addresses and
reply destination MAC/IP addresses are the same).
* Systems with big-endian byte ordering require special recompilation of libknot.
* IPv4 header and UDP checksums are not verified on received DNS messages.
* DNS over XDP traffic is not visible to common system tools (e.g. firewall, tcpdump etc.).
* BPF filter is not automatically unloaded from the network device. Manual filter unload::
ip link set dev <interface> xdp off
* Knot Resolver currently supports XDP only towards clients (not towards upstreams).
* When starting up an XDP socket you may get a harmless warning::
libbpf: Kernel error message: XDP program already attached
/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#include "daemon/bindings/impl.h"
static inline double getseconds(uv_timeval_t *tv)
{
return (double)tv->tv_sec + 0.000001*((double)tv->tv_usec);
}
/** Return worker statistics. */
static int wrk_stats(lua_State *L)
{
if (kr_fails_assert(the_worker)) {
return 0;
}
lua_newtable(L);
lua_pushnumber(L, the_worker->stats.queries);
lua_setfield(L, -2, "queries");
lua_pushnumber(L, the_worker->stats.concurrent);
lua_setfield(L, -2, "concurrent");
lua_pushnumber(L, the_worker->stats.dropped);
lua_setfield(L, -2, "dropped");
lua_pushnumber(L, the_worker->stats.timeout);
lua_setfield(L, -2, "timeout");
lua_pushnumber(L, the_worker->stats.udp);
lua_setfield(L, -2, "udp");
lua_pushnumber(L, the_worker->stats.tcp);
lua_setfield(L, -2, "tcp");
lua_pushnumber(L, the_worker->stats.tls);
lua_setfield(L, -2, "tls");
lua_pushnumber(L, the_worker->stats.ipv4);
lua_setfield(L, -2, "ipv4");
lua_pushnumber(L, the_worker->stats.ipv6);
lua_setfield(L, -2, "ipv6");
lua_pushnumber(L, the_worker->stats.err_udp);
lua_setfield(L, -2, "err_udp");
lua_pushnumber(L, the_worker->stats.err_tcp);
lua_setfield(L, -2, "err_tcp");
lua_pushnumber(L, the_worker->stats.err_tls);
lua_setfield(L, -2, "err_tls");
lua_pushnumber(L, the_worker->stats.err_http);
lua_setfield(L, -2, "err_http");
/* Add subset of rusage that represents counters. */
uv_rusage_t rusage;
if (uv_getrusage(&rusage) == 0) {
lua_pushnumber(L, getseconds(&rusage.ru_utime));
lua_setfield(L, -2, "usertime");
lua_pushnumber(L, getseconds(&rusage.ru_stime));
lua_setfield(L, -2, "systime");
lua_pushnumber(L, rusage.ru_majflt);
lua_setfield(L, -2, "pagefaults");
lua_pushnumber(L, rusage.ru_nswap);
lua_setfield(L, -2, "swaps");
lua_pushnumber(L, rusage.ru_nvcsw + rusage.ru_nivcsw);
lua_setfield(L, -2, "csw");
}
/* Get RSS */
size_t rss = 0;
if (uv_resident_set_memory(&rss) == 0) {
lua_pushnumber(L, rss);
lua_setfield(L, -2, "rss");
}
return 1;
}
int kr_bindings_worker(lua_State *L)
{
static const luaL_Reg lib[] = {
{ "stats", wrk_stats },
{ NULL, NULL }
};
luaL_register(L, "worker", lib);
return 1;
}
.. SPDX-License-Identifier: GPL-3.0-or-later
Scripting worker
^^^^^^^^^^^^^^^^
The worker is a service on top of the event loop that tracks and schedules outstanding queries;
you can inspect its statistics or schedule new queries. It also contains information about
the configured worker count and process rank.
.. envvar:: worker.id
The value of the environment variable ``SYSTEMD_INSTANCE``,
or, if it is not set, the :envvar:`PID <worker.pid>` (string).
.. envvar:: worker.pid
Current worker process PID (number).
.. function:: worker.stats()
Return a table of statistics. See member descriptions in :c:type:`worker_stats`.
A few fields are added, mainly from POSIX ``getrusage()``:
* ``usertime`` and ``systime`` -- CPU time used, in seconds
* ``pagefaults`` -- the number of hard page faults, i.e. those that required I/O activity
* ``swaps`` -- the number of times the process was “swapped” out of main memory; unused on Linux
* ``csw`` -- the number of context switches, both voluntary and involuntary
* ``rss`` -- current memory usage in bytes, including whole cache (resident set size)
Example:
.. code-block:: lua
print(worker.stats().concurrent)
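A slightly larger sketch using the ``getrusage()``-derived fields listed above:
.. code-block:: lua
local s = worker.stats()
print(string.format('%d queries; %.2f s user CPU; RSS %.1f MiB',
s.queries, s.usertime, s.rss / 1024 / 1024))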
-- unload modules which are not related to this test
-- SPDX-License-Identifier: GPL-3.0-or-later
if ta_signal_query then
modules.unload('ta_signal_query')
end
if priming then
modules.unload('priming')
end
if detect_time_skew then
modules.unload('detect_time_skew')
end
-- test. domain is used by some tests, allow it
policy.add(policy.suffix(policy.PASS, {todname('test.')}))
cache.size = 2*MB
-- log_level('debug')
-- Self-checks on globals
assert(help() ~= nil)
assert(worker.id ~= nil)
-- Self-checks on facilities
assert(cache.stats() ~= nil)
assert(cache.backends() ~= nil)
assert(worker.stats() ~= nil)
assert(net.interfaces() ~= nil)
-- Self-checks on loaded stuff
assert(#modules.list() > 0)
-- Self-check timers
ev = event.recurrent(1 * sec, function () return 1 end)
event.cancel(ev)
ev = event.after(0, function () return 1 end)
-- Import fake root zone; avoid interference with configured keyfile_default.
trust_anchors.remove('.')
trust_anchors.add('. IN DS 48409 8 2 3D63A0C25BCE86621DE63636F11B35B908EFE8E9381E0E3E9DEFD89EA952C27D')
local check_answer = require('test_utils').check_answer
-- do not attempt to contact outside world, operate only on cache
net.ipv4 = false
net.ipv6 = false
-- do not listen, test is driven by config code
env.KRESD_NO_LISTEN = true
local function import_zone()
local import_res = require('ffi').C.zi_zone_import({ zone_file = 'testroot.zone' })
assert(import_res == 0)
-- beware that import takes at least 100 ms
worker.sleep(0.2) -- zimport is delayed by 100 ms from function call
-- sanity checks - cache must be filled in
ok(cache.count() > 0, 'cache is not empty after import')
check_answer('root apex is in cache',
'.', kres.type.NS, kres.rcode.NOERROR)
check_answer('deep subdomain is in cache',
'a.b.subtree1.', kres.type.AAAA, kres.rcode.NOERROR)
end
local function test_exact_match_qtype()
nok(cache.clear('a.b.subtree1.', true, kres.type.A)['chunk_limit'],
'single qname+qtype can be cleared at once')
check_answer('exact match on qname+qtype must flush RR from cache',
'a.b.subtree1.', kres.type.A, kres.rcode.SERVFAIL)
check_answer('exact match on qname+qtype must not affect other RRs on the same node',
'a.b.subtree1.', kres.type.AAAA, kres.rcode.NOERROR)
check_answer('exact match on qname must not affect parent',
'b.subtree1.', kres.type.A, kres.rcode.NOERROR)
end
local function test_exact_match_qname()
res = cache.clear('a.b.SubTree1.')
is(res.count, 2, 'single qname can be cleared at once')
check_answer('exact match on qname must flush all RRs with the same owner from cache',
'a.b.subtree1.', kres.type.AAAA, kres.rcode.SERVFAIL)
check_answer('exact match on qname must flush all RRs with the same owner from cache',
'a.b.subtree1.', kres.type.A, kres.rcode.SERVFAIL)
check_answer('exact match on qname must flush all RRs with the same owner from cache',
'a.b.subtree1.', kres.type.TXT, kres.rcode.SERVFAIL)
-- exact match for negative proofs is not implemented yet
--check_answer('exact match on qname must flush negative proofs for owner from cache',
-- 'a.b.subtree1.', kres.type.NULL, kres.rcode.SERVFAIL)
--check_answer('exact match on qname must not affect parent',
-- 'b.subtree1.', kres.type.A, kres.rcode.NOERROR)
-- same(cache.clear(), 0, 'full cache clear can be performed')
--check_answer('.', kres.type.NS, false)
end
local function test_subtree()
res = cache.clear('subtree1.')
nok(res.chunk_limit,
'whole positive subtree must be flushed (does not include neg. proofs)')
ok(res.not_apex,
'subtree clear below apex must be detected')
same(res.subtree, '.', 'detected apex must be returned')
check_answer('subtree variant must flush all RRs in subdomains from cache',
'b.subtree1.', kres.type.A, kres.rcode.SERVFAIL)
check_answer('subtree variant must flush all RRs in subdomains from cache',
'b.subtree1.', kres.type.TXT, kres.rcode.SERVFAIL)
check_answer('subtree variant must flush all RRs in subdomains from cache',
'subtree1.', kres.type.TXT, kres.rcode.SERVFAIL)
check_answer('subtree variant must not affect parent',
'.', kres.type.NS, kres.rcode.NOERROR)
-- same(cache.clear(), 0, 'full cache clear can be performed')
--check_answer('.', kres.type.NS, false)
end
local function test_callback()
local test_name = '20r.subtree2.'
local test_exactname = true
local test_rrtype = nil
local test_chunksize = 1
local test_prev_state = { works = true }
local function check_callback(name, exact_name, rr_type, chunk_size, callback, prev_state, errors)
is(errors.count, 1, 'callback received correct # of removed records')
is(test_name, name, 'callback received subtree name')
is(test_exactname, exact_name, 'callback received exact_name')
is(test_rrtype, rr_type, 'callback received rr_type')
is(test_chunksize, chunk_size, 'callback received chunk_size')
is(check_callback, callback, 'callback received reference to itself')
is(type(errors), 'table', 'callback received table of errors')
same(test_prev_state, prev_state, 'callback received previous state')
return 666
end
same(cache.clear(test_name, test_exactname, test_rrtype, test_chunksize, check_callback, test_prev_state),
666, 'first callback return value is passed to cache.clear() caller')
local cnt_before_wait = cache.count()
worker.sleep(0.2)
is(cnt_before_wait, cache.count(), 'custom callback can stop clearing')
end
local function test_subtree_limit() -- default limit = 100
res = cache.clear('subtree2.', false, nil)
ok(res.chunk_limit,
'chunk_size limit must be respected')
is(res.count, 100,
'chunk_size limit must match returned count')
-- callbacks are running in background so we can now wait
-- and later verify that everything was removed
-- 200 RRs, 100 was removed in first call
-- so the rest should be removed in single invocation of callback
-- hopefully the machine is not too slow ...
worker.sleep(0.1)
res = cache.clear('subtree2.', false, nil)
is(res.count, 0,
'previous calls + callbacks must have removed everything')
end
local function test_apex()
check_answer('a negative proof is still present in cache',
'aaaaa.b.subtree1.', kres.type.TXT, kres.rcode.NXDOMAIN)
local prev_count = cache.count()
ok(prev_count > 0, 'previous subtree clearing did not remove everything')
res = cache.clear('.', false, nil, 10000)
is(res.count, prev_count, 'clear on root removed everything including proofs')
check_answer('exact match on qname must flush negative proofs for owner from cache',
'a.b.subtree1.', kres.type.NULL, kres.rcode.SERVFAIL)
end
local function test_root()
check_answer('root apex is still in cache',
'.', kres.type.NS, kres.rcode.NOERROR)
res = cache.clear('.', true)
check_answer('root apex is no longer in cache',
'.', kres.type.NS, kres.rcode.SERVFAIL)
check_answer('some other item is still in cache',
'16r.subtree2.', kres.type.A, kres.rcode.NOERROR)
local prev_count = cache.count()
res = cache.clear('.')
is(res.count, prev_count, 'full clear reports correct number of entries')
is(cache.count(), 0, 'clearing root clears everything')
end
local function test_complete_flush()
local prev_count = cache.count()
res = cache.clear()
is(res.count, prev_count, 'full clear reports correct number of entries')
is(cache.count(), 0, 'cache is empty after full clear')
end
local function test_cache_used(lower, upper)
return function()
local usage = cache.stats().usage_percent
ok(usage >= lower and usage <= upper,
string.format('cache percentage usage %.1f is between <%d, %d>', usage, lower, upper))
end
end
return {
test_cache_used(0, 1),
import_zone,
test_cache_used(9, 11),
test_exact_match_qtype,
test_exact_match_qname,
test_callback,
import_zone,
test_subtree,
test_cache_used(9, 11),
test_subtree_limit,
test_cache_used(5, 8),
test_apex,
import_zone,
test_root,
import_zone,
test_complete_flush,
test_cache_used(0, 1),
}
# SPDX-License-Identifier: GPL-3.0-or-later
programs:
- name: kresd
binary: kresd
additional:
- --noninteractive
templates:
- daemon/cache.test/insert_ns.test.integr/kresd_config.j2
- tests/integration/hints_zone.j2
configs:
- config
- hints
noclean: True
-- SPDX-License-Identifier: GPL-3.0-or-later
{% for TAF in TRUST_ANCHOR_FILES %}
trust_anchors.add_file('{{TAF}}')
{% endfor %}
{% raw %}
-- insert NS record pointing to a non-delegated DNS server
cache.open(1*MB)
cache.clear()
trust_anchors.remove('.')
local ffi = require('ffi')
local c = kres.context().cache
ns_name = todname('ns.example.com')
local ns_addr = '\1\2\3\4'
local rr = kres.rrset(ns_name, kres.type.A, kres.class.IN, 2147483647)
assert(rr:add_rdata(ns_addr, #ns_addr))
assert(c:insert(rr, nil, ffi.C.KR_RANK_SECURE))
rr_ns = kres.rrset(todname('example.com'), kres.type.NS, kres.class.IN, 3600)
assert(rr_ns:add_rdata(ns_name, #ns_name))
assert(c:insert(rr_ns, nil, bit.bor(ffi.C.KR_RANK_AUTH, ffi.C.KR_RANK_INSECURE)))
c:commit()
assert(cache.count() > 0)
-- from now on queries for domain example.com should go directly to IP addr 1.2.3.4
-- Disable RFC5011 TA update
if ta_update then
modules.unload('ta_update')
end
-- Disable RFC8145 signaling, scenario doesn't provide expected answers
if ta_signal_query then
modules.unload('ta_signal_query')
end
-- Disable RFC8109 priming, scenario doesn't provide expected answers
if priming then
modules.unload('priming')
end
-- Disable this module because it makes one priming query
if detect_time_skew then
modules.unload('detect_time_skew')
end
_hint_root_file('hints')
log_level('debug')
{% endraw %}
net = { '{{SELF_ADDR}}' }
{% if DO_IP6 == "true" %}
net.ipv6 = true
{% else %}
net.ipv6 = false
{% endif %}
{% if DO_IP4 == "true" %}
net.ipv4 = true
{% else %}
net.ipv4 = false
{% endif %}
{% if QMIN == "false" %}
option('NO_MINIMIZE', true)
{% else %}
option('NO_MINIMIZE', false)
{% endif %}
-- Self-checks on globals
assert(help() ~= nil)
assert(worker.id ~= nil)
-- Self-checks on facilities
assert(cache.stats() ~= nil)
assert(cache.backends() ~= nil)
assert(worker.stats() ~= nil)
assert(net.interfaces() ~= nil)
-- Self-checks on loaded stuff
assert(net.list()[1].transport.ip == '{{SELF_ADDR}}')
assert(#modules.list() > 0)
-- Self-check timers
ev = event.recurrent(1 * sec, function (ev) return 1 end)
event.cancel(ev)
ev = event.after(0, function (ev) return 1 end)
; SPDX-License-Identifier: GPL-3.0-or-later
; config options
server:
target-fetch-policy: "0 0 0 0 0"
query-minimization: on
stub-zone:
name: "."
; target-fetch-policy: "0 0 0 0 0"
; name: "."
stub-addr: 193.0.14.129 # K.ROOT-SERVERS.NET.
do-ip6: no
CONFIG_END
SCENARIO_BEGIN Test basic query minimization www.example.com.
; K.ROOT-SERVERS.NET.
RANGE_BEGIN 0 100
ADDRESS 193.0.14.129
ENTRY_BEGIN
MATCH opcode qtype qname
ADJUST copy_id
REPLY QR NOERROR
SECTION QUESTION
. IN NS
SECTION ANSWER
. IN NS K.ROOT-SERVERS.NET.
SECTION ADDITIONAL
K.ROOT-SERVERS.NET. IN A 193.0.14.129
ENTRY_END
ENTRY_BEGIN
MATCH opcode qtype qname
ADJUST copy_id
REPLY QR NOERROR
SECTION QUESTION
com. IN NS
SECTION AUTHORITY
com. IN NS a.gtld-servers.net.
SECTION ADDITIONAL
a.gtld-servers.net. IN A 192.5.6.30
ENTRY_END
RANGE_END
SCENARIO_BEGIN Delegation explicitly added into cache must be followed
; a.gtld-servers.net.
; ns.example.com.
RANGE_BEGIN 0 100
ADDRESS 192.5.6.30
ADDRESS 1.2.3.4
ENTRY_BEGIN
MATCH opcode qtype qname
ADJUST copy_id
REPLY QR NOERROR
SECTION QUESTION
example.com. IN NS
SECTION AUTHORITY
SECTION ANSWER
example.com. IN NS ns.example.com.
SECTION ADDITIONAL
ns.example.com. IN A 1.2.3.4
ENTRY_END
RANGE_END
; ns.example.com.
RANGE_BEGIN 0 100
ADDRESS 1.2.3.4
ENTRY_BEGIN
MATCH opcode qtype qname
SECTION ADDITIONAL
ns.example.com. IN A 1.2.3.4
ENTRY_END
RANGE_END
STEP 1 QUERY
; recursion happens here.
STEP 10 CHECK_ANSWER
ENTRY_BEGIN
MATCH flags rcode question
REPLY QR RD RA NOERROR
SECTION QUESTION
www.example.com. IN A
; SPDX-License-Identifier: GPL-3.0-or-later
. 86400 SOA rootns. you.test. 2017071101 1800 900 604800 86400
. 86400 NS rootns.
rootns. 86400 A 198.41.0.4
subtree1. 86400 TXT "txt exists"
subtree1. 86400 A 192.0.2.1
b.subtree1. 86400 TXT "txt exists"
b.subtree1. 86400 A 192.0.2.2
a.b.subtree1. 86400 TXT "txt exists"
a.b.subtree1. 86400 A 192.0.2.3
a.b.subtree1. 86400 AAAA 2001:db8::
; subtree2. is empty non-terminal
1r.subtree2. 86400 AAAA 2001:db8::
2r.subtree2. 86400 AAAA 2001:db8::1
2r.subtree2. 86400 AAAA 2001:db8::2
3r.subtree2. 86400 AAAA 2001:db8::
4r.subtree2. 86400 A 192.0.2.1
5r.subtree2. 86400 A 192.0.2.1
6r.subtree2. 86400 A 192.0.2.1
7r.subtree2. 86400 A 192.0.2.1
8r.subtree2. 86400 A 192.0.2.1
9r.subtree2. 86400 A 192.0.2.1
10r.subtree2. 86400 A 192.0.2.1
11r.subtree2. 86400 A 192.0.2.1
12r.subtree2. 86400 A 192.0.2.1
13r.subtree2. 86400 A 192.0.2.1
14r.subtree2. 86400 A 192.0.2.1
15r.subtree2. 86400 A 192.0.2.1
16r.subtree2. 86400 A 192.0.2.1
17r.subtree2. 86400 A 192.0.2.1
18r.subtree2. 86400 A 192.0.2.1
19r.subtree2. 86400 A 192.0.2.1
20r.subtree2. 86400 A 192.0.2.1
21r.subtree2. 86400 A 192.0.2.1
22r.subtree2. 86400 A 192.0.2.1
23r.subtree2. 86400 A 192.0.2.1
24r.subtree2. 86400 A 192.0.2.1
25r.subtree2. 86400 A 192.0.2.1
26r.subtree2. 86400 A 192.0.2.1
27r.subtree2. 86400 A 192.0.2.1
28r.subtree2. 86400 A 192.0.2.1
29r.subtree2. 86400 A 192.0.2.1
30r.subtree2. 86400 A 192.0.2.1
31r.subtree2. 86400 A 192.0.2.1
32r.subtree2. 86400 A 192.0.2.1
33r.subtree2. 86400 A 192.0.2.1
34r.subtree2. 86400 A 192.0.2.1
35r.subtree2. 86400 A 192.0.2.1
36r.subtree2. 86400 A 192.0.2.1
37r.subtree2. 86400 A 192.0.2.1
38r.subtree2. 86400 A 192.0.2.1
39r.subtree2. 86400 A 192.0.2.1
40r.subtree2. 86400 A 192.0.2.1
41r.subtree2. 86400 A 192.0.2.1
42r.subtree2. 86400 A 192.0.2.1
43r.subtree2. 86400 A 192.0.2.1
44r.subtree2. 86400 A 192.0.2.1
45r.subtree2. 86400 A 192.0.2.1
46r.subtree2. 86400 A 192.0.2.1
47r.subtree2. 86400 A 192.0.2.1
48r.subtree2. 86400 A 192.0.2.1
49r.subtree2. 86400 A 192.0.2.1
50r.subtree2. 86400 A 192.0.2.1
51r.subtree2. 86400 A 192.0.2.1
52r.subtree2. 86400 A 192.0.2.1
53r.subtree2. 86400 A 192.0.2.1
54r.subtree2. 86400 A 192.0.2.1
55r.subtree2. 86400 A 192.0.2.1
56r.subtree2. 86400 A 192.0.2.1
57r.subtree2. 86400 A 192.0.2.1
58r.subtree2. 86400 A 192.0.2.1
59r.subtree2. 86400 A 192.0.2.1
60r.subtree2. 86400 A 192.0.2.1
61r.subtree2. 86400 A 192.0.2.1
62r.subtree2. 86400 A 192.0.2.1
63r.subtree2. 86400 A 192.0.2.1
64r.subtree2. 86400 A 192.0.2.1
65r.subtree2. 86400 A 192.0.2.1
66r.subtree2. 86400 A 192.0.2.1
67r.subtree2. 86400 A 192.0.2.1
68r.subtree2. 86400 A 192.0.2.1
69r.subtree2. 86400 A 192.0.2.1
70r.subtree2. 86400 A 192.0.2.1
71r.subtree2. 86400 A 192.0.2.1
72r.subtree2. 86400 A 192.0.2.1
73r.subtree2. 86400 A 192.0.2.1
74r.subtree2. 86400 A 192.0.2.1
75r.subtree2. 86400 A 192.0.2.1
76r.subtree2. 86400 A 192.0.2.1
77r.subtree2. 86400 A 192.0.2.1
78r.subtree2. 86400 A 192.0.2.1
79r.subtree2. 86400 A 192.0.2.1
80r.subtree2. 86400 A 192.0.2.1
81r.subtree2. 86400 A 192.0.2.1
82r.subtree2. 86400 A 192.0.2.1
83r.subtree2. 86400 A 192.0.2.1
84r.subtree2. 86400 A 192.0.2.1
85r.subtree2. 86400 A 192.0.2.1
86r.subtree2. 86400 A 192.0.2.1
87r.subtree2. 86400 A 192.0.2.1
88r.subtree2. 86400 A 192.0.2.1
89r.subtree2. 86400 A 192.0.2.1
90r.subtree2. 86400 A 192.0.2.1
91r.subtree2. 86400 A 192.0.2.1
92r.subtree2. 86400 A 192.0.2.1
93r.subtree2. 86400 A 192.0.2.1
94r.subtree2. 86400 A 192.0.2.1
95r.subtree2. 86400 A 192.0.2.1
96r.subtree2. 86400 A 192.0.2.1
97r.subtree2. 86400 A 192.0.2.1
98r.subtree2. 86400 A 192.0.2.1
99r.subtree2. 86400 A 192.0.2.1
100r.subtree2. 86400 A 192.0.2.1
101r.subtree2. 86400 A 192.0.2.1
102r.subtree2. 86400 A 192.0.2.1
103r.subtree2. 86400 A 192.0.2.1
104r.subtree2. 86400 A 192.0.2.1
105r.subtree2. 86400 A 192.0.2.1
106r.subtree2. 86400 A 192.0.2.1
107r.subtree2. 86400 A 192.0.2.1
108r.subtree2. 86400 A 192.0.2.1
109r.subtree2. 86400 A 192.0.2.1
110r.subtree2. 86400 A 192.0.2.1
111r.subtree2. 86400 A 192.0.2.1
112r.subtree2. 86400 A 192.0.2.1
113r.subtree2. 86400 A 192.0.2.1
114r.subtree2. 86400 A 192.0.2.1
115r.subtree2. 86400 A 192.0.2.1
116r.subtree2. 86400 A 192.0.2.1
117r.subtree2. 86400 A 192.0.2.1
118r.subtree2. 86400 A 192.0.2.1
119r.subtree2. 86400 A 192.0.2.1
120r.subtree2. 86400 A 192.0.2.1
121r.subtree2. 86400 A 192.0.2.1
122r.subtree2. 86400 A 192.0.2.1
123r.subtree2. 86400 A 192.0.2.1
124r.subtree2. 86400 A 192.0.2.1
125r.subtree2. 86400 A 192.0.2.1
126r.subtree2. 86400 A 192.0.2.1
127r.subtree2. 86400 A 192.0.2.1
128r.subtree2. 86400 A 192.0.2.1
129r.subtree2. 86400 A 192.0.2.1
130r.subtree2. 86400 A 192.0.2.1
131r.subtree2. 86400 A 192.0.2.1
132r.subtree2. 86400 A 192.0.2.1
133r.subtree2. 86400 A 192.0.2.1
134r.subtree2. 86400 A 192.0.2.1
135r.subtree2. 86400 A 192.0.2.1
136r.subtree2. 86400 A 192.0.2.1
137r.subtree2. 86400 A 192.0.2.1
138r.subtree2. 86400 A 192.0.2.1
139r.subtree2. 86400 A 192.0.2.1
140r.subtree2. 86400 A 192.0.2.1
141r.subtree2. 86400 A 192.0.2.1
142r.subtree2. 86400 A 192.0.2.1
143r.subtree2. 86400 A 192.0.2.1
144r.subtree2. 86400 A 192.0.2.1
145r.subtree2. 86400 A 192.0.2.1
146r.subtree2. 86400 A 192.0.2.1
147r.subtree2. 86400 A 192.0.2.1
148r.subtree2. 86400 A 192.0.2.1
149r.subtree2. 86400 A 192.0.2.1
150r.subtree2. 86400 A 192.0.2.1
151r.subtree2. 86400 A 192.0.2.1
152r.subtree2. 86400 A 192.0.2.1
153r.subtree2. 86400 A 192.0.2.1
154r.subtree2. 86400 A 192.0.2.1
155r.subtree2. 86400 A 192.0.2.1
156r.subtree2. 86400 A 192.0.2.1
157r.subtree2. 86400 A 192.0.2.1
158r.subtree2. 86400 A 192.0.2.1
159r.subtree2. 86400 A 192.0.2.1
160r.subtree2. 86400 A 192.0.2.1
161r.subtree2. 86400 A 192.0.2.1
162r.subtree2. 86400 A 192.0.2.1
163r.subtree2. 86400 A 192.0.2.1
164r.subtree2. 86400 A 192.0.2.1
165r.subtree2. 86400 A 192.0.2.1
166r.subtree2. 86400 A 192.0.2.1
167r.subtree2. 86400 A 192.0.2.1
168r.subtree2. 86400 A 192.0.2.1
169r.subtree2. 86400 A 192.0.2.1
170r.subtree2. 86400 A 192.0.2.1
171r.subtree2. 86400 A 192.0.2.1
172r.subtree2. 86400 A 192.0.2.1
173r.subtree2. 86400 A 192.0.2.1
174r.subtree2. 86400 A 192.0.2.1
175r.subtree2. 86400 A 192.0.2.1
176r.subtree2. 86400 A 192.0.2.1
177r.subtree2. 86400 A 192.0.2.1
178r.subtree2. 86400 A 192.0.2.1
179r.subtree2. 86400 A 192.0.2.1
180r.subtree2. 86400 A 192.0.2.1
181r.subtree2. 86400 A 192.0.2.1
182r.subtree2. 86400 A 192.0.2.1
183r.subtree2. 86400 A 192.0.2.1
184r.subtree2. 86400 A 192.0.2.1
185r.subtree2. 86400 A 192.0.2.1
186r.subtree2. 86400 A 192.0.2.1
187r.subtree2. 86400 A 192.0.2.1
188r.subtree2. 86400 A 192.0.2.1
189r.subtree2. 86400 A 192.0.2.1
190r.subtree2. 86400 A 192.0.2.1
191r.subtree2. 86400 A 192.0.2.1
192r.subtree2. 86400 A 192.0.2.1
193r.subtree2. 86400 A 192.0.2.1
194r.subtree2. 86400 A 192.0.2.1
195r.subtree2. 86400 A 192.0.2.1
196r.subtree2. 86400 A 192.0.2.1
197r.subtree2. 86400 A 192.0.2.1
198r.subtree2. 86400 A 192.0.2.1
199r.subtree2. 86400 A 192.0.2.1
200r.subtree2. 86400 A 192.0.2.1
201r.subtree2. 86400 A 192.0.2.1
kresd_EMBED := \
contrib/ccan/json/json.c \
contrib/ccan/asprintf/asprintf.c
kresd_SOURCES := \
$(kresd_EMBED) \
daemon/io.c \
daemon/network.c \
daemon/engine.c \
daemon/worker.c \
daemon/bindings.c \
daemon/ffimodule.c \
daemon/bindings/kres.c \
daemon/main.c
# Embed resources
daemon/engine.o: daemon/lua/sandbox.inc daemon/lua/config.inc
%.inc: %.lua
@$(call quiet,XXD,$<) $< > $@
# Dependencies
kresd_DEPEND := $(libkres)
kresd_LIBS := $(libkres_TARGET) $(libknot_LIBS) $(libuv_LIBS) $(lua_LIBS)
# Make binary
ifeq ($(HAS_lua)|$(HAS_libuv), yes|yes)
$(eval $(call make_bin,kresd,daemon))
endif
# Targets
daemon: $(kresd)
daemon-install: kresd-install
daemon-clean: kresd-clean
@$(RM) daemon/lua/*.inc
.PHONY: daemon daemon-install daemon-clean
/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#include <math.h>
#include <stdatomic.h>
#include "daemon/defer.h"
#include "daemon/session2.h"
#include "daemon/udp_queue.h"
#include "lib/kru.h"
#include "lib/mmapped.h"
#include "lib/resolve.h"
#include "lib/utils.h"
#define V4_PREFIXES (uint8_t[]) { 18, 20, 24, 32 }
#define V4_RATE_MULT (kru_price_t[]) { 768, 256, 32, 1 }
#define V4_SUBPRIO (uint8_t[]) { 0, 1, 3, 7 }
#define V6_PREFIXES (uint8_t[]) { 32, 48, 56, 64, 128 }
#define V6_RATE_MULT (kru_price_t[]) { 64, 4, 3, 2, 1 }
#define V6_SUBPRIO (uint8_t[]) { 2, 4, 5, 6, 7 }
#define SUBPRIO_CNT 8
#define V4_PREFIXES_CNT (sizeof(V4_PREFIXES) / sizeof(*V4_PREFIXES))
#define V6_PREFIXES_CNT (sizeof(V6_PREFIXES) / sizeof(*V6_PREFIXES))
#define MAX_PREFIXES_CNT ((V4_PREFIXES_CNT > V6_PREFIXES_CNT) ? V4_PREFIXES_CNT : V6_PREFIXES_CNT)
struct kru_conf {
uint8_t namespace;
size_t prefixes_cnt;
uint8_t *prefixes;
const kru_price_t *rate_mult;
const uint8_t *subprio;
} const
V4_CONF = {0, V4_PREFIXES_CNT, V4_PREFIXES, V4_RATE_MULT, V4_SUBPRIO},
V6_CONF = {1, V6_PREFIXES_CNT, V6_PREFIXES, V6_RATE_MULT, V6_SUBPRIO};
#define LOADS_THRESHOLDS (uint16_t[]) {1<<4, 1<<8, 1<<12, -1} // the last one should be UINT16_MAX
#define QUEUES_CNT ((sizeof(LOADS_THRESHOLDS) / sizeof(*LOADS_THRESHOLDS) - 1) * SUBPRIO_CNT + 2)
// priority 0 has no subpriorities, +1 for unverified
#define PRIORITY_UDP (QUEUES_CNT - 1) // last queue
#define Q0_INSTANT_LIMIT 1000000 // ns
#define KRU_CAPACITY (1<<19) // same as ratelimiting default
#define BASE_PRICE(nsec) ((uint64_t)KRU_LIMIT * LOADS_THRESHOLDS[0] / (1<<16) * (nsec) / Q0_INSTANT_LIMIT)
#define MAX_DECAY (BASE_PRICE(1000000) / 2) // max value at 50% utilization of single cpu
// see log written by defer_str_conf for details
#define REQ_TIMEOUT 1000000000 // ns (THREAD_CPUTIME), older deferred queries are dropped
#define IDLE_TIMEOUT 1000000 // ns (THREAD_CPUTIME); if exceeded, continue processing after next poll phase
#define PHASE_UDP_TIMEOUT 400000 // ns (THREAD_CPUTIME); switch between udp, non-udp phases
#define PHASE_NON_UDP_TIMEOUT 400000 // ns (THREAD_CPUTIME); after timeout or emptying queue
#define MAX_WAITING_REQS_SIZE (64l * 1024 * 1024) // bytes; if exceeded, some deferred requests are processed in poll phase
// single TCP allocates more than 64KiB wire buffer
// TODO check whether all important allocations are counted;
// different things are not counted: tasks and subsessions (not deferred after creation), uv handles, queues overhead, ...;
// payload is counted either as part of session wire buffer (for stream) or as part of iter ctx (for datagrams)
#define VERBOSE_LOG(...) kr_log_debug(DEFER, " | " __VA_ARGS__)
struct defer {
size_t capacity;
kru_price_t max_decay;
uint32_t log_period;
int cpus;
bool using_avx2;
_Atomic uint32_t log_time;
_Alignas(64) uint8_t kru[];
};
struct defer *defer = NULL;
bool defer_initialized = false;
uint64_t defer_uvtime_stamp = 0;
struct mmapped defer_mmapped = {0};
defer_sample_state_t defer_sample_state = {
.is_accounting = 0,
};
uv_idle_t idle_handle;
static void defer_queues_idle(uv_idle_t *handle);
protolayer_iter_ctx_queue_t queues[QUEUES_CNT];
int waiting_requests = 0;
ptrdiff_t waiting_requests_size = 0; // signed for non-negativeness asserts
int queue_ix = QUEUES_CNT; // MIN( last popped queue, first non-empty queue )
enum phase {
PHASE_NONE,
PHASE_UDP,
PHASE_NON_UDP
} phase = PHASE_NONE;
uint64_t phase_elapsed[3] = { 0 }; // ns; [PHASE_NONE] value is being incremented but never used
const uint64_t phase_limits[3] = {0, PHASE_UDP_TIMEOUT, PHASE_NON_UDP_TIMEOUT};
uint64_t phase_stamp = 0;
static inline bool phase_over_limit(enum phase p)
{
return phase_elapsed[p] >= phase_limits[p];
}
/// Reset elapsed times of phases and set phase to UDP, NON_UDP, or NONE.
static inline void phase_reset(enum phase p)
{
phase_elapsed[PHASE_UDP] = 0;
phase_elapsed[PHASE_NON_UDP] = 0;
phase_stamp = defer_sample_state.stamp;
phase = p;
}
/// Set phase to UDP or NON_UDP if it is not over limit or both are over limit (reset them).
static inline bool phase_try_set(enum phase p)
{
phase_elapsed[phase] += defer_sample_state.stamp - phase_stamp;
phase_stamp = defer_sample_state.stamp;
if (!phase_over_limit(p)) {
phase = p;
return true;
} else if (phase_over_limit(PHASE_UDP) && phase_over_limit(PHASE_NON_UDP)) {
phase_reset(p);
return true;
}
return false;
}
struct pl_defer_sess_data {
struct protolayer_data h;
protolayer_iter_ctx_queue_t queue; // properly ordered sequence of deferred packets, for stream only
// the first ctx in the queue is also in a defer queue
size_t size;
};
struct pl_defer_iter_data {
struct protolayer_data h;
uint64_t req_stamp; // time when request was received, uses get_stamp()
size_t size;
};
/// Return whether we're using optimized variant right now.
static bool using_avx2(void)
{
bool result = (KRU.initialize == KRU_AVX2.initialize);
kr_require(result || KRU.initialize == KRU_GENERIC.initialize);
return result;
}
/// Print configuration into desc array.
void defer_str_conf(char *desc, int desc_len)
{
int len = 0;
#define append(...) len += snprintf(desc + len, desc_len > len ? desc_len - len : 0, __VA_ARGS__)
#define append_time(prefix, ms, suffix) { \
if ((ms) < 1) append(prefix "%7.1f us" suffix, (ms) * 1000); \
else if ((ms) < 1000) append(prefix "%7.1f ms" suffix, (ms)); \
else append(prefix "%7.1f s " suffix, (ms) / 1000); }
append( " Expected cpus/procs: %5d\n", defer->cpus);
const size_t thresholds = sizeof(LOADS_THRESHOLDS) / sizeof(*LOADS_THRESHOLDS);
append( " Max waiting requests:%7.1f MiB\n", MAX_WAITING_REQS_SIZE / 1024.0 / 1024.0);
append_time(" Request timeout: ", REQ_TIMEOUT / 1000000.0, "\n");
append_time(" Idle: ", IDLE_TIMEOUT / 1000000.0, "\n");
append_time(" UDP phase: ", PHASE_UDP_TIMEOUT / 1000000.0, "\n");
append_time(" Non-UDP phase: ", PHASE_NON_UDP_TIMEOUT / 1000000.0, "\n");
append( " Priority levels: %5ld (%ld main levels, %d sublevels) + UDP\n", QUEUES_CNT - 1, thresholds, SUBPRIO_CNT);
size_t capacity_log = 0;
for (size_t c = defer->capacity - 1; c > 0; c >>= 1) capacity_log++;
size_t size = offsetof(struct defer, kru) + KRU.get_size(capacity_log);
append( " KRU capacity: %7.1f k (%0.1f MiB)\n", (1 << capacity_log) / 1000.0, size / 1000000.0);
bool uniform_thresholds = true;
for (int i = 1; i < thresholds - 1; i++)
uniform_thresholds &= (LOADS_THRESHOLDS[i] == LOADS_THRESHOLDS[i - 1] * LOADS_THRESHOLDS[0]);
uniform_thresholds &= ((1<<16) == (int)LOADS_THRESHOLDS[thresholds - 2] * LOADS_THRESHOLDS[0]);
append( " Decay: %7.3f %% per ms (32-bit max: %d)\n",
100.0 * defer->max_decay / KRU_LIMIT, defer->max_decay);
float half_life = -1.0 / log2f(1.0 - (float)defer->max_decay / KRU_LIMIT);
append_time(" Half-life: ", half_life, "\n");
if (uniform_thresholds)
append_time(" Priority rise in: ", half_life * 16 / thresholds, "\n");
append_time(" Counter reset in: ", half_life * 16, "\n");
append(" Rate limits for crossing priority levels as single CPU utilization:\n");
const struct kru_conf *kru_confs[] = {&V4_CONF, &V6_CONF};
const int version[] = {4, 6};
const kru_price_t base_price_ms = BASE_PRICE(1000000);
append("%15s", "");
for (int j = 0; j < 3; j++)
append("%14d", j+1);
append("%14s\n", "max");
for (int v = 0; v < 2; v++) {
for (int i = kru_confs[v]->prefixes_cnt - 1; i >= 0; i--) {
append("%9sv%d/%-3d: ", "", version[v], kru_confs[v]->prefixes[i]);
for (int j = 0; j < thresholds; j++) {
float needed_util = (float)defer->max_decay / (1<<16) * LOADS_THRESHOLDS[j] / base_price_ms * kru_confs[v]->rate_mult[i];
append("%12.3f %%", needed_util * 100);
}
append("\n");
}
}
append(" Instant limits for crossing priority levels as CPU time:\n");
append("%15s", "");
for (int j = 0; j < 3; j++)
append("%14d", j+1);
append("%14s\n", "max");
for (int v = 0; v < 2; v++) {
for (int i = kru_confs[v]->prefixes_cnt - 1; i >= 0; i--) {
append("%9sv%d/%-3d: ", "", version[v], kru_confs[v]->prefixes[i]);
for (int j = 0; j < thresholds; j++) {
float needed_time = (float)KRU_LIMIT / (1<<16) * LOADS_THRESHOLDS[j] / base_price_ms * kru_confs[v]->rate_mult[i];
if (needed_time < 1) {
append("%11.1f us", needed_time * 1000);
} else if (needed_time < 1000) {
append("%11.1f ms", needed_time);
} else {
append("%11.1f s ", needed_time / 1000);
}
}
append("\n");
}
}
append(" (values above max are indistinguishable)\n");
#undef append_time
#undef append
}
void defer_set_price_factor16(struct kr_request *req, uint32_t price_factor16)
{
req->qsource.price_factor16 = defer_sample_state.price_factor16 = price_factor16;
}
/// Call KRU, return priority and as params load and prefix.
static inline int kru_charge_classify(const struct kru_conf *kru_conf, uint8_t *key, kru_price_t *prices,
uint16_t *out_load, uint8_t *out_prefix)
{
uint16_t loads[kru_conf->prefixes_cnt];
KRU.load_multi_prefix((struct kru *)defer->kru, kr_now(),
kru_conf->namespace, key, kru_conf->prefixes, prices, kru_conf->prefixes_cnt, loads);
int priority = 0;
int prefix_index = kru_conf->prefixes_cnt - 1;
for (int i = kru_conf->prefixes_cnt - 1, j = 0; i >= 0; i--) {
for (; LOADS_THRESHOLDS[j] < loads[i]; j++) {
prefix_index = i;
priority = 1 + j * SUBPRIO_CNT + kru_conf->subprio[i];
}
}
*out_load = loads[prefix_index];
*out_prefix = kru_conf->prefixes[prefix_index];
return priority;
}
/// Increment KRU counters by given time.
void defer_charge(uint64_t nsec, union kr_sockaddr *addr, bool stream)
{
if (!stream) return; // UDP is not accounted in KRU; TODO remove !stream invocations?
// compute time adjusted by the price factor
uint64_t nsec_adj;
const uint32_t pf16 = defer_sample_state.price_factor16;
if (pf16 == 0) return; // whitelisted
if (nsec < (1ul<<32)) { // simple way with standard rounding
nsec_adj = (nsec * pf16 + (1<<15)) >> 16;
} else { // afraid of overflow, so we swap the order of the math
nsec_adj = ((nsec + (1<<15)) >> 16) * pf16;
}
_Alignas(16) uint8_t key[16] = {0, };
const struct kru_conf *kru_conf;
if (addr->ip.sa_family == AF_INET6) {
memcpy(key, &addr->ip6.sin6_addr, 16);
kru_conf = &V6_CONF;
} else if (addr->ip.sa_family == AF_INET) {
memcpy(key, &addr->ip4.sin_addr, 4);
kru_conf = &V4_CONF;
} else {
return;
}
uint64_t base_price = BASE_PRICE(nsec_adj);
kru_price_t prices[kru_conf->prefixes_cnt];
for (size_t i = 0; i < kru_conf->prefixes_cnt; i++) {
uint64_t price = base_price / kru_conf->rate_mult[i];
prices[i] = price > (kru_price_t)-1 ? -1 : price;
}
uint16_t load;
uint8_t prefix;
kru_charge_classify(kru_conf, key, prices, &load, &prefix);
VERBOSE_LOG(" %s ADD %4.3f ms * %.2f -> load: %d on /%d\n",
kr_straddr(&addr->ip), nsec / 1000000.0, pf16 / (float)(1<<16), load, prefix);
}
/// Determine priority of the request in [0, QUEUES_CNT - 1];
/// lower value has higher priority; plain UDP always gets PRIORITY_UDP.
static inline int classify(const union kr_sockaddr *addr, bool stream)
{
if (!stream) { // UDP
VERBOSE_LOG(" unverified address\n");
return PRIORITY_UDP;
}
_Alignas(16) uint8_t key[16] = {0, };
const struct kru_conf *kru_conf = NULL;
if (addr->ip.sa_family == AF_INET6) {
memcpy(key, &addr->ip6.sin6_addr, 16);
kru_conf = &V6_CONF;
} else if (addr->ip.sa_family == AF_INET) {
memcpy(key, &addr->ip4.sin_addr, 4);
kru_conf = &V4_CONF;
} else {
kr_assert(false);
return 0; // shouldn't happen anyway
}
uint16_t load;
uint8_t prefix;
int priority = kru_charge_classify(kru_conf, key, NULL, &load, &prefix);
VERBOSE_LOG(" load %d on /%d\n", load, prefix);
return priority;
}
/// Push query to a queue according to its priority.
static inline void push_query(struct protolayer_iter_ctx *ctx, int priority, bool to_head_end)
{
if (to_head_end) {
queue_push_head(queues[priority], ctx);
} else {
queue_push(queues[priority], ctx);
}
queue_ix = MIN(queue_ix, priority);
waiting_requests++;
}
/// Pop and return a query from the specified queue.
static inline struct protolayer_iter_ctx *pop_query_queue(int priority)
{
kr_assert(queue_len(queues[priority]) > 0);
struct protolayer_iter_ctx *ctx = queue_head(queues[priority]);
queue_pop(queues[priority]);
waiting_requests--;
kr_assert(waiting_requests >= 0);
return ctx;
}
/// Pop and return the query with the highest priority, UDP or non-UDP based on the current phase.
static inline struct protolayer_iter_ctx *pop_query(void)
{
const int waiting_udp = queue_len(queues[PRIORITY_UDP]);
const int waiting_non_udp = waiting_requests - waiting_udp;
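// Phase selection: prefer entering PHASE_NON_UDP when non-UDP queries wait,
// then PHASE_UDP; if neither try-set succeeds, force the phase to whichever
// kind actually has work.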
if (!((waiting_non_udp > 0) && phase_try_set(PHASE_NON_UDP)) &&
!((waiting_udp > 0) && phase_try_set(PHASE_UDP)))
phase_reset(waiting_non_udp > 0 ? PHASE_NON_UDP : PHASE_UDP);
int i;
if (phase == PHASE_NON_UDP) {
for (; queue_ix < QUEUES_CNT && queue_len(queues[queue_ix]) == 0; queue_ix++);
if (kr_fails_assert(queue_ix < PRIORITY_UDP))
return NULL;
i = queue_ix;
} else {
i = PRIORITY_UDP;
}
return pop_query_queue(i);
}
/// Break the given query; for streams, also break all follow-up queries and force-close the stream.
static inline void break_query(struct protolayer_iter_ctx *ctx, int err)
{
if (ctx->session->stream) {
struct session2 *s = ctx->session;
struct pl_defer_sess_data *sdata = protolayer_sess_data_get_current(ctx);
s->ref_count++; // keep session and sdata alive for a while
waiting_requests_size -= sdata->size;
if (!ctx->session->closing) {
session2_force_close(ctx->session);
}
kr_assert(ctx == queue_head(sdata->queue));
while (true) {
queue_pop(sdata->queue);
if (ctx) {
struct pl_defer_iter_data *idata = protolayer_iter_data_get_current(ctx);
waiting_requests_size -= idata->size;
protolayer_break(ctx, kr_error(err));
}
if (queue_len(sdata->queue) == 0) break;
ctx = queue_head(sdata->queue);
}
session2_unhandle(s); // decrease ref_count
} else {
struct pl_defer_iter_data *idata = protolayer_iter_data_get_current(ctx);
waiting_requests_size -= idata->size;
protolayer_break(ctx, kr_error(err));
}
kr_assert(waiting_requests ? waiting_requests_size > 0 : waiting_requests_size == 0);
}
/// Process a single deferred query (or defer again) if there is any.
/// Time accounting is expected to have just started; its stamp is used and the accounted address is set.
static inline void process_single_deferred(void)
{
struct protolayer_iter_ctx *ctx = pop_query();
if (kr_fails_assert(ctx)) return;
defer_sample_addr((const union kr_sockaddr *)ctx->comm->src_addr, ctx->session->stream);
struct pl_defer_iter_data *idata = protolayer_iter_data_get_current(ctx);
struct pl_defer_sess_data *sdata = protolayer_sess_data_get_current(ctx);
struct session2 *session = ctx->session;
uint64_t age_ns = defer_sample_state.stamp - idata->req_stamp;
VERBOSE_LOG(" %s POP from %d after %4.3f ms\n",
kr_straddr(ctx->comm->src_addr),
queue_ix,
age_ns / 1000000.0);
if (ctx->session->closing) {
VERBOSE_LOG(" BREAK (session is closing)\n");
break_query(ctx, ECANCELED);
return;
}
if (age_ns >= REQ_TIMEOUT) {
VERBOSE_LOG(" BREAK (timeout)\n");
// notice-level logging, rate-limited by log-period
const uint32_t time_now = kr_now();
uint32_t log_time_orig = atomic_load_explicit(&defer->log_time, memory_order_relaxed);
if (defer->log_period) {
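// The +1024 on both sides tolerates unsigned wrap-around: a time_now lagging
// the stored stamp by up to 1024 ms compares as "too soon" instead of huge.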
while (time_now - log_time_orig + 1024 >= defer->log_period + 1024) {
if (atomic_compare_exchange_weak_explicit(&defer->log_time, &log_time_orig, time_now,
memory_order_relaxed, memory_order_relaxed)) {
kr_log_notice(DEFER, "Data from %s too long in queue, dropping. (%0.3f MiB in queues)\n",
kr_straddr(ctx->comm->src_addr), waiting_requests_size / 1024.0 / 1024.0);
break;
}
}
}
break_query(ctx, ETIME);
return;
}
bool eof = false;
if (ctx->session->stream) {
int priority = classify((const union kr_sockaddr *)ctx->comm->src_addr, ctx->session->stream);
if (priority > queue_ix) { // priority dropped (got higher value)
VERBOSE_LOG(" PUSH to %d\n", priority);
push_query(ctx, priority, false);
return;
}
kr_assert(queue_head(sdata->queue) == ctx);
queue_pop(sdata->queue);
while ((queue_len(sdata->queue) > 0) && (queue_head(sdata->queue) == NULL)) { // EOF event
eof = true;
queue_pop(sdata->queue);
}
if (queue_len(sdata->queue) > 0) {
VERBOSE_LOG(" PUSH follow-up to head of %d\n", priority);
push_query(queue_head(sdata->queue), priority, true);
} else {
waiting_requests_size -= sdata->size;
}
}
waiting_requests_size -= idata->size;
kr_assert(waiting_requests ? waiting_requests_size > 0 : waiting_requests_size == 0);
if (eof) {
// Keep session alive even if it is somehow force-closed during continuation.
// TODO Is it possible?
session->ref_count++;
}
VERBOSE_LOG(" CONTINUE\n");
protolayer_continue(ctx);
if (eof) {
VERBOSE_LOG(" CONTINUE EOF event\n");
session2_event_after(session, PROTOLAYER_TYPE_DEFER, PROTOLAYER_EVENT_EOF, NULL);
session2_unhandle(session); // decrease ref_count
}
}
/// Process as many deferred requests as needed to bring memory consumption under the limit.
static inline void process_deferred_over_size_limit(void) {
if (waiting_requests_size > MAX_WAITING_REQS_SIZE) {
defer_sample_state_t prev_sample_state;
defer_sample_start(&prev_sample_state);
do {
process_single_deferred(); // possibly defers again without decreasing waiting_requests_size
// If the unwrapped query is to be processed here,
// it is the last iteration and the query is processed after returning.
defer_sample_restart();
} while (waiting_requests_size > MAX_WAITING_REQS_SIZE);
defer_sample_stop(&prev_sample_state, true);
}
}
/// Break expired requests at the beginning of queues; uses the current stamp.
static inline void cleanup_queues(void)
{
for (int i = 0; i < QUEUES_CNT; i++) {
int cnt = 0;
while (queue_len(queues[i]) > 0) {
struct protolayer_iter_ctx *ctx = queue_head(queues[i]);
struct pl_defer_iter_data *idata = protolayer_iter_data_get_current(ctx);
uint64_t age_ns = defer_sample_state.stamp - idata->req_stamp;
if (age_ns < REQ_TIMEOUT) break;
pop_query_queue(i);
break_query(ctx, ETIME);
cnt++;
}
if (cnt > 0) {
VERBOSE_LOG(" BREAK %d queries from %d\n", cnt, i);
}
}
}
/// Unwrap: defer or process the query synchronously.
/// Time accounting is expected to have been started; its stamp is used and the accounted address is set.
static enum protolayer_iter_cb_result pl_defer_unwrap(
void *sess_data, void *iter_data,
struct protolayer_iter_ctx *ctx)
{
if (!defer || ctx->session->outgoing)
return protolayer_continue(ctx);
defer_sample_addr((const union kr_sockaddr *)ctx->comm->src_addr, ctx->session->stream);
struct pl_defer_iter_data *idata = iter_data;
struct pl_defer_sess_data *sdata = sess_data;
idata->req_stamp = defer_sample_state.stamp;
VERBOSE_LOG(" %s UNWRAP\n",
kr_straddr(ctx->comm->src_addr));
uv_idle_start(&idle_handle, defer_queues_idle);
if (queue_len(sdata->queue) > 0) { // stream with preceding packet already deferred
queue_push(sdata->queue, ctx);
waiting_requests_size += idata->size = protolayer_iter_size_est(ctx, false);
// payload counted in session wire buffer
VERBOSE_LOG(" PUSH as follow-up\n");
process_deferred_over_size_limit();
return protolayer_async();
}
int priority = classify((const union kr_sockaddr *)ctx->comm->src_addr, ctx->session->stream);
// Process synchronously unless there may exist requests that have to be processed first
if (((priority == 0) || (priority == PRIORITY_UDP)) && (queue_len(queues[priority]) == 0) &&
phase_try_set(priority == PRIORITY_UDP ? PHASE_UDP : PHASE_NON_UDP)) {
VERBOSE_LOG(" CONTINUE\n");
return protolayer_continue(ctx);
}
VERBOSE_LOG(" PUSH to %d\n", priority);
if (ctx->session->stream) {
queue_push(sdata->queue, ctx);
waiting_requests_size += sdata->size = protolayer_sess_size_est(ctx->session);
}
push_query(ctx, priority, false);
waiting_requests_size += idata->size = protolayer_iter_size_est(ctx, !ctx->session->stream);
// for stream, payload is counted in session wire buffer
process_deferred_over_size_limit();
return protolayer_async();
}
/// Unwrap event: EOF event may be deferred here, other events pass synchronously.
static enum protolayer_event_cb_result pl_defer_event_unwrap(
enum protolayer_event_type event, void **baton,
struct session2 *session, void *sess_data)
{
if (!defer || !session->stream || session->outgoing)
return PROTOLAYER_EVENT_PROPAGATE;
defer_sample_addr((const union kr_sockaddr *)session->comm_storage.src_addr, session->stream);
struct pl_defer_sess_data *sdata = sess_data;
if ((event == PROTOLAYER_EVENT_EOF) && (queue_len(sdata->queue) > 0)) {
// defer the EOF event if unprocessed data remain; the baton, if any, is dropped
queue_push(sdata->queue, NULL);
VERBOSE_LOG(" %s event %s deferred\n",
session->comm_storage.src_addr ? kr_straddr(session->comm_storage.src_addr) : "(null)",
protolayer_event_name(event));
return PROTOLAYER_EVENT_CONSUME;
}
VERBOSE_LOG(" %s event %s passes through synchronously%s%s\n",
session->comm_storage.src_addr ? kr_straddr(session->comm_storage.src_addr) : "(null)",
protolayer_event_name(event),
queue_len(sdata->queue) > 0 ? " ahead of deferred data" : "",
*baton ? " (with baton)" : "");
return PROTOLAYER_EVENT_PROPAGATE;
}
/// Idle: continue processing deferred requests.
static void defer_queues_idle(uv_idle_t *handle)
{
VERBOSE_LOG("IDLE\n");
if (waiting_requests > 0) {
VERBOSE_LOG(" %d waiting\n", waiting_requests);
defer_sample_start(NULL);
uint64_t idle_stamp = defer_sample_state.stamp;
do {
process_single_deferred();
defer_sample_restart();
} while ((waiting_requests > 0) && (defer_sample_state.stamp < idle_stamp + IDLE_TIMEOUT));
defer_sample_stop(NULL, true);
cleanup_queues();
udp_queue_send_all();
}
if (waiting_requests > 0) {
VERBOSE_LOG(" %d waiting\n", waiting_requests);
} else {
phase_reset(PHASE_NONE);
VERBOSE_LOG(" deactivate idle\n");
uv_idle_stop(&idle_handle);
}
VERBOSE_LOG("POLL\n");
}
/// Initialize shared memory and queues. To be called from Lua.
int defer_init(const char *mmap_file, uint32_t log_period, int cpus) // TODO possibly remove cpus; not needed
{
defer_initialized = true;
if (mmap_file == NULL) {
// defer explicitly disabled
return 0;
}
int ret = 0;
if (cpus < 1) {
ret = EINVAL;
goto fail;
}
struct defer header = {
.capacity = KRU_CAPACITY,
.max_decay = MAX_DECAY,
.log_period = log_period,
.cpus = cpus,
.using_avx2 = using_avx2(),
};
size_t capacity_log = 0;
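// Compute ceil(log2(capacity)) by counting halvings of capacity-1.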
for (size_t c = header.capacity - 1; c > 0; c >>= 1) capacity_log++;
size_t size = offsetof(struct defer, kru) + KRU.get_size(capacity_log);
size_t header_size = offsetof(struct defer, using_avx2) + sizeof(header.using_avx2);
static_assert( // no padding up to .using_avx2
offsetof(struct defer, using_avx2) ==
sizeof(header.capacity) +
sizeof(header.max_decay) +
sizeof(header.log_period) +
sizeof(header.cpus),
"detected padding with undefined data inside mmapped header");
ret = mmapped_init(&defer_mmapped, mmap_file, size, &header, header_size);
if (ret == MMAPPED_WAS_FIRST) {
kr_log_info(DEFER, "Initializing defer...\n");
defer = defer_mmapped.mem;
bool succ = KRU.initialize((struct kru *)defer->kru, capacity_log, header.max_decay);
if (!succ) {
defer = NULL;
ret = kr_error(EINVAL);
goto fail;
}
defer->log_time = kr_now() - log_period;
ret = mmapped_init_continue(&defer_mmapped);
if (ret != 0) goto fail;
kr_log_info(DEFER, "Defer initialized (%s).\n", (defer->using_avx2 ? "AVX2" : "generic"));
// log current configuration
if (KR_LOG_LEVEL_IS(LOG_INFO) || kr_log_group_is_set(LOG_GRP_DEFER)) {
char desc[8000];
defer_str_conf(desc, sizeof(desc));
kr_log_info(DEFER, "Defer configuration:\n%s", desc);
}
} else if (ret == 0) {
defer = defer_mmapped.mem;
kr_log_info(DEFER, "Using existing defer data (%s).\n", (defer->using_avx2 ? "AVX2" : "generic"));
} else goto fail;
for (size_t i = 0; i < QUEUES_CNT; i++)
queue_init(queues[i]);
return 0;
fail:
kr_log_crit(DEFER, "Initialization of shared defer data failed.\n");
return ret;
}
/// Initialize idle.
int defer_init_idle(uv_loop_t *loop)
{
return uv_idle_init(loop, &idle_handle);
}
/// Initialize session queue
int pl_defer_sess_init(struct session2 *session, void *data, void *param)
{
struct pl_defer_sess_data *sdata = data;
queue_init(sdata->queue);
return 0;
}
/// Deinitialize shared memory.
void defer_deinit(void)
{
mmapped_deinit(&defer_mmapped);
defer = NULL;
}
/// Initialize protolayer.
__attribute__((constructor))
static void defer_protolayers_init(void)
{
protolayer_globals[PROTOLAYER_TYPE_DEFER] = (struct protolayer_globals){
.iter_size = sizeof(struct pl_defer_iter_data),
.sess_size = sizeof(struct pl_defer_sess_data),
.sess_init = pl_defer_sess_init,
.unwrap = pl_defer_unwrap,
.event_unwrap = pl_defer_event_unwrap,
};
}
/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#include <stdbool.h>
#include "lib/defines.h"
#include "lib/utils.h"
#include "lib/kru.h"
/// Initialize defer, incl. shared memory with KRU, excl. idle.
KR_EXPORT
int defer_init(const char *mmap_file, uint32_t log_period, int cpus);
/// Initialize idle.
int defer_init_idle(uv_loop_t *loop);
/// Deinitialize shared memory.
void defer_deinit(void);
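/* A minimal usage sketch (hypothetical path and parameters; the real callers
 * live in the daemon and the Lua bindings):
 *
 *   if (defer_init("defer.mmap", 0, 1) == 0)  // log_period 0 disables drop logging
 *           defer_init_idle(uv_default_loop());
 *   ... run the event loop ...
 *   defer_deinit();
 */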
/// Increment KRU counters by the given time.
void defer_charge(uint64_t nsec, union kr_sockaddr *addr, bool stream);
struct kr_request;
/// Set the price-factor; see struct kr_request::qsource.price_factor16
KR_EXPORT
void defer_set_price_factor16(struct kr_request *req, uint32_t price_factor16);
typedef struct {
bool is_accounting; /// whether currently accounting the time to someone
bool stream;
union kr_sockaddr addr; /// request source (to which we account) or AF_UNSPEC if unknown yet
uint32_t price_factor16; /// see struct kr_request::qsource.price_factor16
uint64_t stamp; /// monotonic nanoseconds, probably won't wrap
} defer_sample_state_t;
extern defer_sample_state_t defer_sample_state;
extern struct defer *defer; /// skip sampling/deferring if NULL
extern bool defer_initialized; /// defer_init was called, possibly keeping defer disabled
extern uint64_t defer_uvtime_stamp; /// stamp of the last uv time update
// TODO: reconsider `static inline` cases below
#include <time.h>
static inline uint64_t defer_get_stamp(void)
{
struct timespec now_ts = {0};
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now_ts);
uint64_t stamp = now_ts.tv_nsec + 1000*1000*1000 * (uint64_t)now_ts.tv_sec;
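// Refresh libuv's cached loop time at most once per 1 ms of this thread's CPU time.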
if (defer_uvtime_stamp + 1000*1000 < stamp) {
defer_uvtime_stamp = stamp;
uv_update_time(uv_default_loop());
}
return stamp;
}
/// Annotate the work currently being accounted by an IP address.
static inline void defer_sample_addr(const union kr_sockaddr *addr, bool stream)
{
if (!defer || kr_fails_assert(addr)) return;
if (!defer_sample_state.is_accounting) return;
if (defer_sample_state.addr.ip.sa_family != AF_UNSPEC) {
// TODO: this costs performance, so only in some debug mode?
if (kr_sockaddr_cmp(&addr->ip, &defer_sample_state.addr.ip) != kr_ok()) {
char defer_addr[KR_STRADDR_MAXLEN + 1] = { 0 };
strncpy(defer_addr, kr_straddr(&defer_sample_state.addr.ip), sizeof(defer_addr) - 1);
kr_log_warning(DEFER, "Sampling address mismatch: %s != %s\n",
kr_straddr(&addr->ip),
defer_addr);
return;
}
}
switch (addr->ip.sa_family) {
case AF_INET:
defer_sample_state.addr.ip4 = addr->ip4;
break;
case AF_INET6:
defer_sample_state.addr.ip6 = addr->ip6;
break;
default:
defer_sample_state.addr.ip.sa_family = AF_UNSPEC;
break;
}
defer_sample_state.stream = stream;
defer_sample_state.price_factor16 = 1 << 16; // meaning *1.0, until more information is known
// TODO set to the proper value on each invocation of defer_sample_addr
}
/// Internal; start accounting work at specified timestamp.
static inline void defer_sample_start_stamp(uint64_t stamp)
{
if (!defer) return;
kr_assert(!defer_sample_state.is_accounting);
defer_sample_state.is_accounting = true;
defer_sample_state.stamp = stamp;
defer_sample_state.addr.ip.sa_family = AF_UNSPEC;
}
/// Internal; stop accounting work at specified timestamp and charge the source if applicable.
static inline void defer_sample_stop_stamp(uint64_t stamp)
{
if (!defer) return;
kr_assert(defer_sample_state.is_accounting);
defer_sample_state.is_accounting = false;
if (defer_sample_state.addr.ip.sa_family == AF_UNSPEC) return;
const uint64_t elapsed = stamp - defer_sample_state.stamp;
if (elapsed == 0) return;
// TODO: some queries of internal origin have suspiciously high numbers.
// We won't be really accounting those, but it might suggest some other issue.
defer_charge(elapsed, &defer_sample_state.addr, defer_sample_state.stream);
}
static inline bool defer_sample_is_accounting(void)
{
return defer_sample_state.is_accounting;
}
/// Start accounting work; optionally save state of current accounting.
/// Current state can be saved only after having an address assigned.
static inline void defer_sample_start(defer_sample_state_t *prev_state_out) {
if (!defer) return;
uint64_t stamp = defer_get_stamp();
// suspend
if (prev_state_out) {
*prev_state_out = defer_sample_state; // TODO stamp is not needed
if (defer_sample_state.is_accounting)
defer_sample_stop_stamp(stamp);
}
// start
defer_sample_start_stamp(stamp);
}
/// Stop accounting and start it again.
static inline void defer_sample_restart(void) {
if (!defer) return;
uint64_t stamp = defer_get_stamp();
// stop
defer_sample_stop_stamp(stamp);
// start
defer_sample_start_stamp(stamp);
}
/// Stop accounting and charge the source if applicable; optionally resume previous accounting.
static inline void defer_sample_stop(defer_sample_state_t *prev_state, bool reuse_last_stamp) {
if (!defer) return;
uint64_t stamp = reuse_last_stamp ? defer_sample_state.stamp : defer_get_stamp();
// stop
defer_sample_stop_stamp(stamp);
// resume
if (prev_state) {
defer_sample_state = *prev_state;
defer_sample_state.stamp = stamp;
}
}
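/* Typical nesting pattern, mirroring process_deferred_over_size_limit():
 *
 *   defer_sample_state_t prev;
 *   defer_sample_start(&prev);        // suspend outer accounting, start inner
 *   ... work attributed to one source address ...
 *   defer_sample_restart();           // charge it, then start a fresh interval
 *   ... more work ...
 *   defer_sample_stop(&prev, false);  // charge and resume the outer accounting
 */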
/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#include <contrib/cleanup.h>
#include <ccan/json/json.h>
#include <ccan/asprintf/asprintf.h>
#include <dlfcn.h>
#include <uv.h>
#include <unistd.h>
#include <grp.h>
#include <pwd.h>
#include <sys/param.h>
#include <libzscanner/scanner.h>
#include <sys/un.h>
#include <lua.h>
#include <lualib.h>
#include <lauxlib.h>
#include "daemon/bindings/impl.h"
#include "kresconfig.h"
#include "daemon/engine.h"
#include "daemon/bindings.h"
#include "daemon/ffimodule.h"
#include "lib/nsrep.h"
#include "lib/cache.h"
#include "lib/selection.h"
#include "lib/cache/api.h"
#include "lib/defines.h"
#include "lib/cache/cdb_lmdb.h"
#include "lib/dnssec/ta.h"
#include "lib/log.h"
/** @internal Compatibility wrapper for Lua < 5.2 */
#if LUA_VERSION_NUM < 502
#define lua_rawlen(L, obj) lua_objlen((L), (obj))
#endif
/* Execute byte code */
#define l_dobytecode(L, arr, len, name) \
(luaL_loadbuffer((L), (arr), (len), (name)) || lua_pcall((L), 0, LUA_MULTRET, 0))
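/* Both luaL_loadbuffer() and lua_pcall() return 0 on success, so the || chain
 * evaluates to nonzero as soon as either step fails. */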
/*
* Global bindings.
*/
struct args *the_args;
static struct engine engine = {{0}};
struct engine *the_engine = NULL;
/** Print help and available commands. */
static int l_help(lua_State *L)
{
static const char *help_str =
"help()\n show this help\n"
"quit()\n quit\n"
"hostname()\n hostname\n"
"package_version()\n return package version\n"
"user(name[, group])\n change process user (and group)\n"
"log_level(level)\n logging level (crit, err, warning, notice, info or debug)\n"
"log_target(target)\n logging target (syslog, stderr, stdout)\n"
"log_groups(groups)\n turn on debug log for selected groups\n"
"option(opt[, new_val])\n get/set server option\n"
"mode(strict|normal|permissive)\n set resolver strictness level\n"
"reorder_RR([true|false])\n set/get reordering of RRs within RRsets\n"
"resolve(name, type[, class, flags, callback])\n resolve query, callback when it's finished\n"
"todname(name)\n convert name to wire format\n"
"tojson(val)\n convert value to JSON\n"
"net\n network configuration\n"
"cache\n network configuration\n"
"modules\n modules configuration\n"
"kres\n resolver services\n"
"trust_anchors\n configure trust anchors\n"
"debugging\n debugging configuration\n"
;
lua_pushstring(L, help_str);
return 1;
}
static bool update_privileges(int uid, int gid)
{
if ((gid_t)gid != getgid()) {
if (setregid(gid, gid) < 0) {
return false;
}
}
if ((uid_t)uid != getuid()) {
if (setreuid(uid, uid) < 0) {
return false;
}
}
return true;
}
/** Set process user/group. */
static int l_setuser(lua_State *L)
{
int n = lua_gettop(L);
if (n < 1 || !lua_isstring(L, 1))
lua_error_p(L, "user(user[, group])");
/* Fetch UID/GID based on string identifiers. */
struct passwd *user_pw = getpwnam(lua_tostring(L, 1));
if (!user_pw)
lua_error_p(L, "invalid user name");
int uid = user_pw->pw_uid;
int gid = getgid();
if (n > 1 && lua_isstring(L, 2)) {
struct group *group_pw = getgrnam(lua_tostring(L, 2));
if (!group_pw)
lua_error_p(L, "invalid group name");
gid = group_pw->gr_gid;
}
/* Drop privileges */
bool ret = update_privileges(uid, gid);
if (!ret) {
lua_error_maybe(L, errno);
}
lua_pushboolean(L, ret);
return 1;
}
/** Quit current executable. */
static int l_quit(lua_State *L)
{
/* Stop engine */
engine_stop();
/* No results */
return 0;
}
/** Toggle verbose mode. */
static int l_verbose(lua_State *L)
{
kr_log_deprecate(SYSTEM, "use log_level() instead of verbose()\n");
if (lua_isboolean(L, 1) || lua_isnumber(L, 1)) {
kr_log_level_set(lua_toboolean(L, 1) == true ? LOG_DEBUG : LOG_DEFAULT_LEVEL);
}
lua_pushboolean(L, kr_log_level == LOG_DEBUG);
return 1;
}
static int l_log_level(lua_State *L)
{
const int params = lua_gettop(L);
if (params > 1) {
goto bad_call;
} else if (params == 1) { // set
const char *lvl_str = lua_tostring(L, 1);
if (!lvl_str)
goto bad_call;
kr_log_level_t lvl = kr_log_name2level(lvl_str);
if (lvl < 0)
lua_error_p(L, "unknown log level '%s'", lvl_str);
kr_log_level_set(lvl);
}
// get
lua_pushstring(L, kr_log_level2name(kr_log_level));
return 1;
bad_call:
lua_error_p(L, "takes one string parameter or nothing");
}
static int l_log_target(lua_State *L)
{
const int params = lua_gettop(L);
if (params > 1)
goto bad_call;
// set
if (params == 1) {
const char *t_str = lua_tostring(L, 1);
if (!t_str)
goto bad_call;
kr_log_target_t t;
if (strcmp(t_str, "syslog") == 0) {
t = LOG_TARGET_SYSLOG;
} else if (strcmp(t_str, "stdout") == 0) {
t = LOG_TARGET_STDOUT;
} else if (strcmp(t_str, "stderr") == 0) {
t = LOG_TARGET_STDERR;
} else {
lua_error_p(L, "unknown log target '%s'", t_str);
}
kr_log_target_set(t);
}
// get
const char *t_str = NULL;
switch (kr_log_target) {
case LOG_TARGET_SYSLOG: t_str = "syslog"; break;
case LOG_TARGET_STDERR: t_str = "stderr"; break;
case LOG_TARGET_STDOUT: t_str = "stdout"; break;
} // -Wswitch-enum
lua_pushstring(L, t_str);
return 1;
bad_call:
lua_error_p(L, "takes one string parameter or nothing");
}
static int l_log_groups(lua_State *L)
{
const int params = lua_gettop(L);
if (params > 1)
goto bad_call;
if (params == 1) { // set
if (!lua_istable(L, 1))
goto bad_call;
kr_log_group_reset();
lua_pushnil(L);
while (lua_next(L, 1) != 0) {
const char *grp_str = lua_tostring(L, -1);
if (!grp_str)
goto bad_call;
enum kr_log_group grp = kr_log_name2grp(grp_str);
if (grp >= 0) {
kr_log_group_add(grp);
} else {
kr_log_warning(SYSTEM, "WARNING: unknown log group '%s'\n", lua_tostring(L, -1));
}
lua_pop(L, 1);
}
}
// get
lua_newtable(L);
int i = 1;
for (enum kr_log_group grp = LOG_GRP_SYSTEM; grp < LOG_GRP_REQDBG; grp++) {
const char *name = kr_log_grp2name(grp);
if (kr_fails_assert(name))
continue;
if (kr_log_group_is_set(grp)) {
lua_pushinteger(L, i);
lua_pushstring(L, name);
lua_settable(L, -3);
i++;
}
}
return 1;
bad_call:
lua_error_p(L, "takes a table of string groups as parameter or nothing");
}
char *engine_get_hostname(void) {
static char hostname_str[KNOT_DNAME_MAXLEN];
if (!the_engine->hostname) {
if (gethostname(hostname_str, sizeof(hostname_str)) != 0)
return NULL;
return hostname_str;
}
return the_engine->hostname;
}
int engine_set_hostname(const char *hostname) {
if (!hostname) {
return kr_error(EINVAL);
}
char *new_hostname = strdup(hostname);
if (!new_hostname) {
return kr_error(ENOMEM);
}
if (the_engine->hostname) {
free(the_engine->hostname);
}
the_engine->hostname = new_hostname;
network_new_hostname();
return 0;
}
/** Return hostname. */
static int l_hostname(lua_State *L)
{
if (lua_gettop(L) == 0) {
lua_pushstring(L, engine_get_hostname());
return 1;
}
if ((lua_gettop(L) != 1) || !lua_isstring(L, 1))
lua_error_p(L, "hostname takes at most one parameter: (\"fqdn\")");
if (engine_set_hostname(lua_tostring(L, 1)) != 0)
lua_error_p(L, "setting hostname failed");
lua_pushstring(L, engine_get_hostname());
return 1;
}
/** Return server package version. */
static int l_package_version(lua_State *L)
{
lua_pushliteral(L, PACKAGE_VERSION);
return 1;
}
/** Load root hints from zonefile. */
static int l_hint_root_file(lua_State *L)
{
const char *file = lua_tostring(L, 1);
const char *err = engine_hint_root_file(file);
if (err) {
if (!file) {
file = ROOTHINTS;
}
lua_error_p(L, "error when opening '%s': %s", file, err);
} else {
lua_pushboolean(L, true);
return 1;
}
}
/** @internal for engine_hint_root_file */
static void roothints_add(zs_scanner_t *zs)
{
struct kr_zonecut *hints = zs->process.data;
if (!hints) {
return;
}
if (zs->r_type == KNOT_RRTYPE_A || zs->r_type == KNOT_RRTYPE_AAAA) {
kr_zonecut_add(hints, zs->r_owner, zs->r_data, zs->r_data_length);
}
}
const char* engine_hint_root_file(const char *file)
{
if (!file) {
file = ROOTHINTS;
}
if (strlen(file) == 0) {
return "invalid parameters";
}
struct kr_zonecut *root_hints = &the_resolver->root_hints;
zs_scanner_t zs;
if (zs_init(&zs, ".", 1, 0) != 0) {
return "not enough memory";
}
if (zs_set_input_file(&zs, file) != 0) {
zs_deinit(&zs);
return "failed to open root hints file";
}
kr_zonecut_set(root_hints, (const uint8_t *)"");
zs_set_processing(&zs, roothints_add, NULL, root_hints);
zs_parse_all(&zs);
zs_deinit(&zs);
return NULL;
}
/** Unpack JSON object to table */
static void l_unpack_json(lua_State *L, JsonNode *table)
{
/* Unpack POD */
switch(table->tag) {
case JSON_STRING: lua_pushstring(L, table->string_); return;
case JSON_NUMBER: lua_pushnumber(L, table->number_); return;
case JSON_BOOL: lua_pushboolean(L, table->bool_); return;
default: break;
}
/* Unpack object or array into table */
lua_newtable(L);
JsonNode *node = NULL;
json_foreach(node, table) {
/* Push node value */
l_unpack_json(L, node);
/* Set table key */
if (node->key) {
lua_setfield(L, -2, node->key);
} else {
lua_rawseti(L, -2, lua_objlen(L, -2) + 1);
}
}
}
/** @internal Recursive Lua/JSON serialization. */
static JsonNode *l_pack_elem(lua_State *L, int top)
{
switch(lua_type(L, top)) {
case LUA_TSTRING: return json_mkstring(lua_tostring(L, top));
case LUA_TNUMBER: return json_mknumber(lua_tonumber(L, top));
case LUA_TBOOLEAN: return json_mkbool(lua_toboolean(L, top));
case LUA_TTABLE: break; /* Table, iterate it. */
default: return json_mknull();
}
/* Use absolute indexes here, as the table may be nested. */
JsonNode *node = NULL;
lua_pushnil(L);
while(lua_next(L, top) != 0) {
bool is_array = false;
if (!node) {
is_array = (lua_type(L, top + 1) == LUA_TNUMBER);
node = is_array ? json_mkarray() : json_mkobject();
if (!node) {
return NULL;
}
} else {
is_array = node->tag == JSON_ARRAY;
}
/* Insert to array/table. */
JsonNode *val = l_pack_elem(L, top + 2);
if (is_array) {
json_append_element(node, val);
} else {
const char *key = lua_tostring(L, top + 1);
json_append_member(node, key, val);
}
lua_pop(L, 1);
}
/* Return empty object for empty tables. */
return node ? node : json_mkobject();
}
/** @internal Serialize to string */
static char *l_pack_json(lua_State *L, int top)
{
JsonNode *root = l_pack_elem(L, top);
if (!root) {
return NULL;
}
/* Serialize to string */
char *result = json_encode(root);
json_delete(root);
return result;
}
static int l_tojson(lua_State *L)
{
auto_free char *json_str = l_pack_json(L, lua_gettop(L));
if (!json_str) {
return 0;
}
lua_pushstring(L, json_str);
return 1;
}
static int l_fromjson(lua_State *L)
{
if (lua_gettop(L) != 1 || !lua_isstring(L, 1))
lua_error_p(L, "a JSON string is required");
const char *json_str = lua_tostring(L, 1);
JsonNode *root_node = json_decode(json_str);
if (!root_node)
lua_error_p(L, "invalid JSON string");
l_unpack_json(L, root_node);
json_delete(root_node);
return 1;
}
/*
* Engine API.
*/
static int init_state(void)
{
/* Initialize Lua state */
the_engine->L = luaL_newstate();
if (the_engine->L == NULL) {
return kr_error(ENOMEM);
}
/* Initialize used libraries. */
luaL_openlibs(the_engine->L);
/* Global functions */
lua_pushcfunction(the_engine->L, l_help);
lua_setglobal(the_engine->L, "help");
lua_pushcfunction(the_engine->L, l_quit);
lua_setglobal(the_engine->L, "quit");
lua_pushcfunction(the_engine->L, l_hostname);
lua_setglobal(the_engine->L, "hostname");
lua_pushcfunction(the_engine->L, l_package_version);
lua_setglobal(the_engine->L, "package_version");
lua_pushcfunction(the_engine->L, l_verbose);
lua_setglobal(the_engine->L, "verbose");
lua_pushcfunction(the_engine->L, l_log_level);
lua_setglobal(the_engine->L, "log_level");
lua_pushcfunction(the_engine->L, l_log_target);
lua_setglobal(the_engine->L, "log_target");
lua_pushcfunction(the_engine->L, l_log_groups);
lua_setglobal(the_engine->L, "log_groups");
lua_pushcfunction(the_engine->L, l_setuser);
lua_setglobal(the_engine->L, "user");
lua_pushcfunction(the_engine->L, l_hint_root_file);
lua_setglobal(the_engine->L, "_hint_root_file");
lua_pushliteral(the_engine->L, libknot_SONAME);
lua_setglobal(the_engine->L, "libknot_SONAME");
lua_pushliteral(the_engine->L, libzscanner_SONAME);
lua_setglobal(the_engine->L, "libzscanner_SONAME");
lua_pushcfunction(the_engine->L, l_tojson);
lua_setglobal(the_engine->L, "tojson");
lua_pushcfunction(the_engine->L, l_fromjson);
lua_setglobal(the_engine->L, "fromjson");
/* Random number generator */
lua_getfield(the_engine->L, LUA_GLOBALSINDEX, "math");
lua_getfield(the_engine->L, -1, "randomseed");
lua_remove(the_engine->L, -2);
lua_Number seed = kr_rand_bytes(sizeof(lua_Number));
lua_pushnumber(the_engine->L, seed);
lua_call(the_engine->L, 1, 0);
return kr_ok();
}
/**
* Start luacov measurement and store results to file specified by
* KRESD_COVERAGE_STATS environment variable.
* Do nothing if the variable is not set.
*/
static void init_measurement(void)
{
const char * const statspath = getenv("KRESD_COVERAGE_STATS");
if (!statspath)
return;
char * snippet = NULL;
int ret = asprintf(&snippet,
"_luacov_runner = require('luacov.runner')\n"
"_luacov_runner.init({\n"
" statsfile = '%s',\n"
" exclude = {'test', 'tapered', 'lua/5.1'},\n"
"})\n"
"jit.off()\n", statspath
);
if (kr_fails_assert(ret > 0))
return;
ret = luaL_loadstring(the_engine->L, snippet);
if (kr_fails_assert(ret == 0)) {
free(snippet);
return;
}
lua_call(the_engine->L, 0, 0);
free(snippet);
}
int init_lua(void) {
/* Use libdir path for including Lua scripts */
char l_paths[MAXPATHLEN] = { 0 };
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat" /* %1$ is not in C standard */
/* Save original package.path to package._path */
(void)snprintf(l_paths, MAXPATHLEN - 1,
"if package._path == nil then package._path = package.path end\n"
"package.path = '%1$s/?.lua;%1$s/?/init.lua;'..package._path\n"
"if package._cpath == nil then package._cpath = package.cpath end\n"
"package.cpath = '%1$s/?%2$s;'..package._cpath\n",
LIBDIR, LIBEXT);
#pragma GCC diagnostic pop
int ret = l_dobytecode(the_engine->L, l_paths, strlen(l_paths), "");
if (ret != 0) {
lua_pop(the_engine->L, 1);
return ret;
}
return 0;
}
int engine_init(void)
{
kr_require(!the_engine);
the_engine = &engine;
mm_ctx_mempool(&the_engine->pool, MM_DEFAULT_BLKSIZE);
/* Initialize state */
int ret = init_state();
if (ret != 0) {
engine_deinit();
return ret;
}
init_measurement();
/* Load basic modules */
engine_register("iterate", NULL, NULL);
engine_register("validate", NULL, NULL);
engine_register("cache", NULL, NULL);
ret = array_push(the_engine->backends, kr_cdb_lmdb());
if (ret != 0) {
engine_deinit();
return ret;
}
/* Initialize lua */
ret = init_lua();
if (ret != 0) {
engine_deinit();
return ret;
}
return ret;
}
/** Unregister a (found) module */
static void engine_unload(struct kr_module *module)
{
auto_free char *name = module->name ? strdup(module->name) : NULL;
kr_module_unload(module); /* beware: lua/C mix, could be confusing */
/* Clear in Lua world, but not for embedded modules ('cache' in particular). */
if (name && !kr_module_get_embedded(name)) {
lua_pushnil(the_engine->L);
lua_setglobal(the_engine->L, name);
}
free(module);
}
void engine_deinit(void)
{
if (kr_fails_assert(the_engine->L))
return;
/* Network deinit is split up. We first need to stop listening,
* then we can unload modules during which we still want
* e.g. the endpoint kind registry to work (inside ->net),
* and this registry deinitialization uses the lua state. */
for (size_t i = 0; i < the_engine->modules.len; ++i) {
engine_unload(the_engine->modules.at[i]);
}
ffimodule_deinit(the_engine->L);
lua_close(the_engine->L);
/* Free data structures */
array_clear(the_engine->modules);
array_clear(the_engine->backends);
free(the_engine->hostname);
mp_delete(the_engine->pool.ctx);
the_engine = NULL;
}
int engine_pcall(lua_State *L, int argc)
{
#if LUA_VERSION_NUM >= 502
lua_getglobal(L, "_SANDBOX");
lua_setupvalue(L, -(2 + argc), 1);
#endif
return lua_pcall(L, argc, LUA_MULTRET, 0);
}
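/* On Lua >= 5.2 a chunk's first upvalue is its _ENV, so substituting the
 * _SANDBOX table makes the chunk run inside the sandbox; on 5.1/LuaJIT the
 * sandboxing is presumably applied on the Lua side instead. */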
const char *engine_eval_mode_str(enum engine_eval_mode mode)
{
switch (mode) {
#define XX(cid) case ENGINE_EVAL_MODE_##cid: return #cid;
ENGINE_EVAL_MODE_MAP(XX)
#undef XX
}
return "(invalid)";
}
int engine_cmd(struct lua_State *L, const char *str, enum engine_eval_mode mode)
{
if (L == NULL) {
return kr_error(ENOEXEC);
}
/* Evaluate results */
lua_getglobal(engine->L, "eval_cmd");
lua_pushstring(engine->L, str);
lua_getglobal(L, "eval_cmd");
lua_pushstring(L, str);
lua_pushstring(L, engine_eval_mode_str(mode));
/* Check result. */
return engine_pcall(L, 2);
}
int engine_load_sandbox(void)
{
int ret = luaL_dofile(the_engine->L, LIBDIR "/sandbox.lua");
if (ret != 0) {
kr_log_error(SYSTEM, "error %s\n", lua_tostring(the_engine->L, -1));
lua_pop(the_engine->L, 1);
return kr_error(ENOEXEC);
}
ret = ffimodule_init(the_engine->L);
return ret;
}
int engine_loadconf(const char *config_path)
{
if (kr_fails_assert(config_path))
return kr_error(EINVAL);
/* Evaluate */
char cwd[PATH_MAX];
get_workdir(cwd, sizeof(cwd));
kr_log_debug(SYSTEM, "loading config '%s' (workdir '%s')\n", config_path, cwd);
int ret = luaL_dofile(the_engine->L, config_path);
if (ret != 0) {
kr_log_error(SYSTEM, "error while loading config: "
"%s (workdir '%s')\n", lua_tostring(the_engine->L, -1), cwd);
lua_pop(the_engine->L, 1);
}
return ret;
}
int engine_start(void)
{
/* Clean up stack */
lua_settop(the_engine->L, 0);
return kr_ok();
}
void engine_stop(void)
{
uv_stop(uv_default_loop());
}
/** @internal Find matching module */
static size_t module_find(module_array_t *mod_list, const char *name)
{
size_t found = mod_list->len;
for (size_t i = 0; i < mod_list->len; ++i) {
struct kr_module *mod = mod_list->at[i];
if (strcmp(mod->name, name) == 0) {
found = i;
break;
}
}
return found;
}
int engine_register(const char *name, const char *precedence, const char* ref)
{
if (kr_fails_assert(name))
return kr_error(EINVAL);
/* Make sure module is unloaded */
(void) engine_unregister(name);
/* Find the index of referenced module. */
module_array_t *mod_list = &the_engine->modules;
size_t ref_pos = mod_list->len;
if (precedence && ref) {
ref_pos = module_find(mod_list, ref);
if (ref_pos >= mod_list->len) {
return kr_error(EIDRM);
}
}
/* Attempt to load binary module */
struct kr_module *module = malloc(sizeof(*module));
if (!module) {
return kr_error(ENOMEM);
}
int ret = kr_module_load(module, name, LIBDIR "/kres_modules");
if (ret == 0) {
/* We have a C module, loaded and init() was called.
* Now we need to prepare the lua side. */
lua_State *L = the_engine->L;
lua_getglobal(L, "modules_create_table_for_c");
lua_pushpointer(L, module);
if (lua_isnil(L, -2)) {
/* When loading the three embedded modules, we don't
* have the "modules_*" lua function yet, but fortunately
* we don't need it there. Let's just check they're embedded.
* TODO: solve this better *without* breaking stuff. */
lua_pop(L, 2);
if (module->lib != RTLD_DEFAULT) {
ret = kr_error(1);
lua_pushliteral(L, "missing modules_create_table_for_c()");
}
} else {
ret = engine_pcall(L, 1);
}
if (kr_fails_assert(ret == 0)) { /* probably not critical, but weird */
kr_log_error(SYSTEM, "internal error when loading C module %s: %s\n",
module->name, lua_tostring(L, -1));
lua_pop(L, 1);
}
} else if (ret == kr_error(ENOENT)) {
/* No luck with C module, so try to load and .init() lua module. */
ret = ffimodule_register_lua(module, name);
if (ret != 0) {
kr_log_error(SYSTEM, "failed to load module '%s'\n", name);
}
} else if (ret == kr_error(ENOTSUP)) {
/* Print a more helpful message when module is linked against an old resolver ABI. */
kr_log_error(SYSTEM, "module '%s' links to unsupported ABI, please rebuild it\n", name);
}
if (ret != 0) {
engine_unload(module);
return ret;
}
/* Push to the right place in the_engine->modules */
if (array_push(the_engine->modules, module) < 0) {
engine_unload(module);
return kr_error(ENOMEM);
}
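/* '>' inserts the module right after the referenced one, '<' right before it;
 * without a matching branch it stays at the tail where array_push() put it. */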
if (precedence) {
struct kr_module **arr = mod_list->at;
size_t emplacement = mod_list->len;
if (strcasecmp(precedence, ">") == 0) {
if (ref_pos + 1 < mod_list->len)
emplacement = ref_pos + 1; /* Insert after target */
}
if (strcasecmp(precedence, "<") == 0) {
emplacement = ref_pos; /* Insert at target */
}
/* Move the tail if it has some elements. */
if (emplacement + 1 < mod_list->len) {
memmove(&arr[emplacement + 1], &arr[emplacement], sizeof(*arr) * (mod_list->len - (emplacement + 1)));
arr[emplacement] = module;
}
}
return kr_ok();
}
int engine_unregister(const char *name)
{
module_array_t *mod_list = &the_engine->modules;
size_t found = module_find(mod_list, name);
if (found < mod_list->len) {
engine_unload(mod_list->at[found]);
array_del(*mod_list, found);
return kr_ok();
}
return kr_error(ENOENT);
}
module_array_t *engine_modules(void)
{
lua_getglobal(L, "__engine");
struct engine *engine = lua_touserdata(L, -1);
lua_pop(engine->L, 1);
return engine;
return &the_engine->modules;
}
/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#pragma once
/* Magic defaults */
#ifndef LRU_RTT_SIZE
#define LRU_RTT_SIZE 4096 /**< NS RTT cache size */
#endif
#ifndef LRU_REP_SIZE
#define LRU_REP_SIZE (LRU_RTT_SIZE / 2) /**< NS reputation cache size */
#endif
#ifndef MP_FREELIST_SIZE
#define MP_FREELIST_SIZE 32 /**< Maximum length of the worker mempool freelist */
#endif
#ifndef RECVMMSG_BATCH
#define RECVMMSG_BATCH 5
#endif
/*
* @internal These are forward decls to allow building modules with engine but without Lua.
*/
struct lua_State;
#include "lib/utils.h"
#include "lib/resolve.h"
#include "daemon/network.h"
struct engine {
module_array_t modules;
array_t(const struct kr_cdb_api *) backends;
knot_mm_t pool;
char *hostname;
struct lua_State *L;
};
/** Pointer to the singleton engine state. NULL if not initialized. */
KR_EXPORT extern struct engine *the_engine;
/** Initializes the engine. */
int engine_init(void);
/* Deinitializes the engine. `network_unregister` should be called before
* this and before `network_deinit`. */
void engine_deinit(void);
#define ENGINE_EVAL_MODE_MAP(XX) \
XX(LUA_TABLE) \
XX(RAW) \
XX(JSON) \
//
enum engine_eval_mode {
#define XX(cid) ENGINE_EVAL_MODE_##cid,
ENGINE_EVAL_MODE_MAP(XX)
#undef XX
};
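/* ENGINE_EVAL_MODE_MAP is an X-macro list: expanding it with different XX
 * definitions keeps this enum and engine_eval_mode_str() in sync from a single
 * source of truth. */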
const char *engine_eval_mode_str(enum engine_eval_mode mode);
/** Perform a lua command within the sandbox.
*
* @return zero on success.
* The result will be returned on the lua stack - an error message in case of failure.
* http://www.lua.org/manual/5.1/manual.html#lua_pcall */
int engine_cmd(struct lua_State *L, const char *str, enum engine_eval_mode mode);
/** Execute current chunk in the sandbox */
int engine_pcall(struct lua_State *L, int argc);
/** Return engine light userdata. */
struct engine *engine_luaget(struct lua_State *L);
int engine_load_sandbox(void);
int engine_loadconf(const char *config_path);
/** Start the lua engine and execute the config. */
int engine_start(void);
void engine_stop(void);
int engine_register(const char *name, const char *precedence, const char* ref);
int engine_unregister(const char *name);
/** Gets the list of the engine's registered modules. */
module_array_t *engine_modules(void);
/** Set/get the per engine hostname */
char *engine_get_hostname(void);
int engine_set_hostname(const char *hostname);
/** Load root hints from a zonefile (or config-time default if NULL).
*
* @return error message or NULL (statically allocated)
* @note exported to be usable from the hints module.
*/
KR_EXPORT
const char* engine_hint_root_file(const char *file);
/* @internal Array of ip address shorthand. */
typedef array_t(char*) addr_array_t;
typedef array_t(const char*) config_array_t;
typedef struct {
int fd;
endpoint_flags_t flags; /**< .sock_type isn't meaningful here */
} flagged_fd_t;
typedef array_t(flagged_fd_t) flagged_fd_array_t;
struct args {
addr_array_t addrs, addrs_tls;
flagged_fd_array_t fds;
int control_fd;
config_array_t config;
const char *rundir;
bool interactive;
bool quiet;
bool tty_binary_output;
};
/** Pointer to kresd arguments. */
KR_EXPORT extern struct args *the_args;
/* Copyright (C) CZ.NIC, z.s.p.o. <knot-resolver@labs.nic.cz>
* SPDX-License-Identifier: GPL-3.0-or-later
*/
#include <uv.h>
#include <lua.h>
#include <lauxlib.h>
#include "daemon/bindings/impl.h"
#include "daemon/engine.h"
#include "daemon/ffimodule.h"
#include "daemon/bindings.h"
#include "daemon/bindings/kres.h"
#include "daemon/worker.h"
#include "lib/module.h"
#include "lib/layer.h"
#if LUA_VERSION_NUM >= 502
#define l_resume(L, argc) lua_resume((L), NULL, (argc))
#else
#define l_resume(L, argc) lua_resume((L), (argc))
#endif
/** @internal Slots for layer callbacks.
* Each slot ID corresponds to Lua reference in module API. */
enum {
SLOT_begin = 0,
SLOT_reset,
SLOT_finish,
SLOT_consume,
SLOT_produce,
SLOT_checkout,
SLOT_answer_finalize,
SLOT_count /* dummy, must be the last */
};
#define SLOT_size sizeof(int)
/** @internal Set metatable on the object on stack. */
static void set_metatable(lua_State *L, const char *tname)
{
luaL_getmetatable(L, tname);
lua_setmetatable(L, -2);
}
/** Lua registry indices for functions that wrap layer callbacks (shared by all lua modules). */
static int l_ffi_wrap_slots[SLOT_count] = { 0 };
/** @internal Continue with coroutine. */
static void l_ffi_resume_cb(uv_idle_t *check)
{
lua_State *L = check->data;
int status = lua_resume(L, 0);
if (status != LUA_YIELD) {
uv_idle_stop(check); /* Stop coroutine */
uv_close((uv_handle_t *)check, (uv_close_cb)free);
}
}
/** @internal Schedule deferred continuation. */
static int l_ffi_defer(lua_State *L)
{
uv_idle_t *check = malloc(sizeof(*check));
if (!check) {
return kr_error(ENOMEM);
}
check->data = L;
uv_idle_init(uv_default_loop(), check);
return uv_idle_start(check, l_ffi_resume_cb);
}
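/* Design note: deferring through a uv_idle handle keeps the coroutine off
 * the current C stack; l_ffi_resume_cb() re-enters it once per loop
 * iteration until it stops yielding, at which point the idle handle is
 * stopped, closed and freed. */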
/** Common part of calling modname.(de)init in lua.
* The function to call should be on top of the stack and it gets popped. */
static int l_ffi_modcb(lua_State *L, struct kr_module *module)
{
	if (lua_isnil(L, -1)) {
		lua_pop(L, 1); /* .(de)init == nil, maybe even the module table doesn't exist */
		return kr_ok();
	}
lua_getglobal(L, "modules_ffi_wrap_modcb");
lua_insert(L, -2); /* swap with .(de)init */
lua_pushpointer(L, module);
if (lua_pcall(L, 2, 0, 0) == 0)
return kr_ok();
kr_log_error(SYSTEM, "error: %s\n", lua_tostring(L, -1));
	lua_pop(L, 1);
	return kr_error(1);
}
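/* Assumed Lua-side contract (a sketch only; the real wrapper is defined in
 * the daemon's Lua sandbox): modules_ffi_wrap_modcb receives the .(de)init
 * function and a pointer to the kr_module, roughly as in
 *
 *	modules_ffi_wrap_modcb = function (cb, kr_module_ptr)
 *		return cb()
 *	end
 *
 * Any error raised inside is caught by the lua_pcall() above and logged. */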
static int l_ffi_deinit(struct kr_module *module)
{
/* Call .deinit(), if it exists. */
lua_State *L = the_engine->L;
lua_getglobal(L, module->name);
lua_getfield(L, -1, "deinit");
const int ret = l_ffi_modcb(L, module);
lua_pop(L, 1); /* the module's table */
const kr_layer_api_t *api = module->layer;
if (!api) {
return ret;
}
/* Unregister layer callback references from registry. */
for (int si = 0; si < SLOT_count; ++si) {
if (api->cb_slots[si] > 0) {
luaL_unref(L, LUA_REGISTRYINDEX, api->cb_slots[si]);
}
}
free_const(api);
return ret;
}
kr_layer_t kr_layer_t_static;

/** @internal Helper for calling a layer Lua function by e.g. SLOT_begin. */
static int l_ffi_call_layer(kr_layer_t *ctx, int slot_ix)
{
const int wrap_slot = l_ffi_wrap_slots[slot_ix];
const int cb_slot = ctx->api->cb_slots[slot_ix];
kr_require(wrap_slot > 0 && cb_slot > 0);
lua_State *L = the_engine->L;
lua_rawgeti(L, LUA_REGISTRYINDEX, wrap_slot);
lua_rawgeti(L, LUA_REGISTRYINDEX, cb_slot);
/* We pass the content of *ctx via a global structure to avoid
* lua (full) userdata, as that's relatively expensive (GC-allocated).
* Performance: copying isn't ideal, but it's not visible in profiles. */
memcpy(&kr_layer_t_static, ctx, sizeof(*ctx));
int ret = lua_pcall(L, 1, 1, 0);
/* Handle result of the pcall.
* Default state: ctx->req->state seems safer than ctx->state,
* in case the pcall touched req->state. */
int state = ctx->req->state;
if (ret) { /* Exception or another lua problem. */
state = KR_STATE_FAIL;
kr_log_error(SYSTEM, "error: %s\n", lua_tostring(L, -1));
} else if (lua_isnumber(L, -1)) { /* Explicitly returned state. */
state = lua_tointeger(L, -1);
if (!kr_state_consistent(state)) {
kr_log_error(SYSTEM, "error: nonsense state returned from lua module layer: %d\n",
state);
state = KR_STATE_FAIL;
}
} else if (lua_isnil(L, -1)) { /* Don't change state. */
} else if (kr_fails_assert(!lua_isthread(L, -1))) { /* Continuations */
/* TODO: unused, possibly in a bad shape. Meant KR_STATE_YIELD? */
if (l_ffi_defer(lua_tothread(L, -1)) != 0)
state = KR_STATE_FAIL;
} else { /* Nonsense returned. */
state = KR_STATE_FAIL;
kr_log_error(SYSTEM, "error: nonsense returned from lua module layer: %s\n",
lua_tostring(L, -1));
/* Unfortunately we can't easily get name of the module/function here. */
}
	lua_pop(L, 1);
	return state;
}
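/* State contract example: a Lua layer callback may return one of the kres.*
 * state constants (e.g. `return kres.FAIL`), return nil to keep the current
 * state, or return garbage, which the code above turns into KR_STATE_FAIL
 * plus a log message. */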
static int l_ffi_layer_begin(kr_layer_t *ctx)
{
return l_ffi_call_layer(ctx, SLOT_begin);
}
static int l_ffi_layer_reset(kr_layer_t *ctx)
{
return l_ffi_call_layer(ctx, SLOT_reset);
}
static int l_ffi_layer_finish(kr_layer_t *ctx)
{
	ctx->pkt = ctx->req->answer;
	return l_ffi_call_layer(ctx, SLOT_finish);
}
static int l_ffi_layer_consume(kr_layer_t *ctx, knot_pkt_t *pkt)
{
	if (ctx->state & KR_STATE_FAIL) {
		return ctx->state; /* Already failed, skip */
	}
	ctx->pkt = pkt;
	return l_ffi_call_layer(ctx, SLOT_consume);
}
static int l_ffi_layer_produce(kr_layer_t *ctx, knot_pkt_t *pkt)
{
	if (ctx->state & KR_STATE_FAIL) {
		return ctx->state; /* Already failed, skip */
	}
	ctx->pkt = pkt;
	return l_ffi_call_layer(ctx, SLOT_produce);
}
static int l_ffi_layer_checkout(kr_layer_t *ctx, knot_pkt_t *pkt,
				struct sockaddr *dst, int type)
{
	if (ctx->state & KR_STATE_FAIL) {
		return ctx->state; /* Already failed, skip */
	}
	ctx->pkt = pkt;
	ctx->dst = dst;
	ctx->is_stream = (type == SOCK_STREAM);
	return l_ffi_call_layer(ctx, SLOT_checkout);
}
static int l_ffi_layer_answer_finalize(kr_layer_t *ctx)
{
return l_ffi_call_layer(ctx, SLOT_answer_finalize);
}
int ffimodule_init(lua_State *L)
{
/* Wrappers defined in ./lua/sandbox.lua */
/* for API: (int state, kr_request_t *req) */
lua_getglobal(L, "modules_ffi_layer_wrap1");
const int wrap1 = luaL_ref(L, LUA_REGISTRYINDEX);
/* for API: (int state, kr_request_t *req, knot_pkt_t *) */
lua_getglobal(L, "modules_ffi_layer_wrap2");
const int wrap2 = luaL_ref(L, LUA_REGISTRYINDEX);
lua_getglobal(L, "modules_ffi_layer_wrap_checkout");
const int wrap_checkout = luaL_ref(L, LUA_REGISTRYINDEX);
if (wrap1 == LUA_REFNIL || wrap2 == LUA_REFNIL || wrap_checkout == LUA_REFNIL) {
return kr_error(ENOENT);
}
const int slots[SLOT_count] = {
[SLOT_begin] = wrap1,
[SLOT_reset] = wrap1,
[SLOT_finish] = wrap2,
[SLOT_consume] = wrap2,
[SLOT_produce] = wrap2,
[SLOT_checkout] = wrap_checkout,
[SLOT_answer_finalize] = wrap1,
};
memcpy(l_ffi_wrap_slots, slots, sizeof(l_ffi_wrap_slots));
return kr_ok();
}
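/* Note: the three wrapper globals looked up above must exist before
 * ffimodule_init() runs; they are assumed to adapt the single pcall argument
 * (the callback reference) plus the contents of kr_layer_t_static into the
 * (state, req[, pkt, ...]) signature that Lua layer callbacks receive. */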
void ffimodule_deinit(lua_State *L)
{
/* Unref each wrapper function from lua.
* It's probably useless, as we're about to destroy lua_State, but... */
const int wrapsIndices[] = {
SLOT_begin,
SLOT_consume,
SLOT_checkout,
};
for (int i = 0; i < sizeof(wrapsIndices) / sizeof(wrapsIndices[0]); ++i) {
luaL_unref(L, LUA_REGISTRYINDEX, l_ffi_wrap_slots[wrapsIndices[i]]);
}
}
/** @internal Conditionally register layer trampoline
* @warning Expects 'module.layer' to be on top of Lua stack. */
#define LAYER_REGISTER(L, api, name) do { \
	int *cb_slot = (api)->cb_slots + SLOT_ ## name; \
lua_getfield((L), -1, #name); \
if (!lua_isnil((L), -1)) { \
(api)->name = l_ffi_layer_ ## name; \
		*cb_slot = luaL_ref((L), LUA_REGISTRYINDEX); \
} else { \
lua_pop((L), 1); \
} \
} while(0)
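/* For clarity, LAYER_REGISTER(L, api, consume) expands roughly to:
 *
 *	int *cb_slot = (api)->cb_slots + SLOT_consume;
 *	lua_getfield((L), -1, "consume");
 *	if (!lua_isnil((L), -1)) {
 *		(api)->consume = l_ffi_layer_consume;
 *		*cb_slot = luaL_ref((L), LUA_REGISTRYINDEX);
 *	} else {
 *		lua_pop((L), 1);
 *	}
 *
 * i.e. the C trampoline is installed only for hooks the module defines. */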
/** @internal Create C layer api wrapper. */
static kr_layer_api_t *l_ffi_layer_create(lua_State *L, struct kr_module *module)
{
	/* Fabricate a layer API wrapping the Lua functions;
	 * reserve slots after it for references to the Lua callbacks. */
	const size_t api_length = offsetof(kr_layer_api_t, cb_slots)
			+ (SLOT_count * sizeof(module->layer->cb_slots[0]));
	kr_layer_api_t *api = calloc(1, api_length);
	if (api) {
LAYER_REGISTER(L, api, begin);
LAYER_REGISTER(L, api, finish);
LAYER_REGISTER(L, api, consume);
LAYER_REGISTER(L, api, produce);
LAYER_REGISTER(L, api, checkout);
LAYER_REGISTER(L, api, answer_finalize);
LAYER_REGISTER(L, api, reset);
}
return api;
}
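/* Layout sketch of the allocation above (flexible array member):
 *
 *	+------------------------------+ <- api
 *	| kr_layer_api_t hook pointers |
 *	+------------------------------+ <- api->cb_slots
 *	| int cb_slots[SLOT_count]     |   (Lua registry refs, 0 = unset)
 *	+------------------------------+
 *
 * calloc() leaves every hook NULL and every slot 0 until LAYER_REGISTER
 * fills in the ones the module defines. */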
#undef LAYER_REGISTER
int ffimodule_register_lua(struct kr_module *module, const char *name)
{
	/* Register module in Lua */
	lua_State *L = the_engine->L;
lua_getglobal(L, "require");
	lua_pushfstring(L, "kres_modules.%s", name);
if (lua_pcall(L, 1, LUA_MULTRET, 0) != 0) {
fprintf(stderr, "error: %s\n", lua_tostring(L, -1));
kr_log_error(SYSTEM, "error: %s\n", lua_tostring(L, -1));
lua_pop(L, 1);
return kr_error(ENOENT);
}
	lua_setglobal(L, name);
	lua_getglobal(L, name);
/* Create FFI module with trampolined functions. */
memset(module, 0, sizeof(*module));
module->name = strdup(name);
module->deinit = &l_ffi_deinit;
/* Bake layer API if defined in module */
lua_getfield(L, -1, "layer");
if (!lua_isnil(L, -1)) {
		module->layer = l_ffi_layer_create(L, module);
	}
lua_pop(L, 1); /* .layer table */
/* Now call .init(), if it exists. */
lua_getfield(L, -1, "init");
const int ret = l_ffi_modcb(L, module);
lua_pop(L, 1); /* the module's table */
return ret;
}
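/* Minimal usage sketch (illustrative; real call sites live in the engine's
 * module-loading code, and "mymod" is a hypothetical Lua module shipped as
 * kres_modules/mymod.lua):
 *
 *	struct kr_module *mod = calloc(1, sizeof(*mod));
 *	int ret = ffimodule_register_lua(mod, "mymod");
 *	// on success the module table is a Lua global, its .init() has run,
 *	// and mod->deinit will later call .deinit() and free the layer wrapper
 */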