diff --git a/daemon/daemon.mk b/daemon/daemon.mk index c3c15075ef8dfc552a129d844671367a302969c1..eef6e8e5b2da092e008e66f7ec4cfab594881894 100644 --- a/daemon/daemon.mk +++ b/daemon/daemon.mk @@ -9,6 +9,7 @@ kresd_SOURCES := \ daemon/tls_ephemeral_credentials.c \ daemon/tls_session_ticket-srv.c \ daemon/zimport.c \ + daemon/session.c \ daemon/main.c kresd_DIST := daemon/lua/kres.lua daemon/lua/kres-gen.lua \ diff --git a/daemon/engine.c b/daemon/engine.c index c99ef63f283acfed6eb1fc3cb2669bc16fcee19b..6fea74545170c36be38a40f83d7e875a685564cc 100644 --- a/daemon/engine.c +++ b/daemon/engine.c @@ -33,6 +33,21 @@ #include "lib/cache/cdb_lmdb.h" #include "lib/dnssec/ta.h" +/* Magic defaults for the engine. */ +#ifndef LRU_RTT_SIZE +#define LRU_RTT_SIZE 65536 /**< NS RTT cache size */ +#endif +#ifndef LRU_REP_SIZE +#define LRU_REP_SIZE (LRU_RTT_SIZE / 4) /**< NS reputation cache size */ +#endif +#ifndef LRU_COOKIES_SIZE + #ifdef ENABLE_COOKIES + #define LRU_COOKIES_SIZE LRU_RTT_SIZE /**< DNS cookies cache size. */ + #else + #define LRU_COOKIES_SIZE LRU_ASSOC /* simpler than guards everywhere */ + #endif +#endif + /** @internal Compatibility wrapper for Lua < 5.2 */ #if LUA_VERSION_NUM < 502 #define lua_rawlen(L, obj) lua_objlen((L), (obj)) @@ -608,6 +623,7 @@ static int l_trampoline(lua_State *L) static int init_resolver(struct engine *engine) { + /* Note: it had been zored by engine_init(). 
*/ /* Open resolution context */ engine->resolver.trust_anchors = map_make(NULL); engine->resolver.negative_anchors = map_make(NULL); diff --git a/daemon/engine.h b/daemon/engine.h index 6d0a73b7042c07ddc08eb50a640c078f02dcbec7..b79991a8cadc290ce1565cd50ce5b8a0068309b2 100644 --- a/daemon/engine.h +++ b/daemon/engine.h @@ -16,33 +16,6 @@ #pragma once -/* Magic defaults */ -#ifndef LRU_RTT_SIZE -#define LRU_RTT_SIZE 65536 /**< NS RTT cache size */ -#endif -#ifndef LRU_REP_SIZE -#define LRU_REP_SIZE (LRU_RTT_SIZE / 4) /**< NS reputation cache size */ -#endif -#ifndef LRU_COOKIES_SIZE -#define LRU_COOKIES_SIZE LRU_RTT_SIZE /**< DNS cookies cache size. */ -#endif -#ifndef MP_FREELIST_SIZE -# ifdef __clang_analyzer__ -# define MP_FREELIST_SIZE 0 -# else -# define MP_FREELIST_SIZE 64 /**< Maximum length of the worker mempool freelist */ -# endif -#endif -#ifndef RECVMMSG_BATCH -#define RECVMMSG_BATCH 4 -#endif -#ifndef QUERY_RATE_THRESHOLD -#define QUERY_RATE_THRESHOLD (2 * MP_FREELIST_SIZE) /**< Nr of parallel queries considered as high rate */ -#endif -#ifndef MAX_PIPELINED -#define MAX_PIPELINED 100 -#endif - /* * @internal These are forward decls to allow building modules with engine but without Lua. */ diff --git a/daemon/io.c b/daemon/io.c index e5b8a139a81566b464ee9863d9fa503109e92e44..30d3723ce7076d80b29d6cc7651cc90c8f0a1a5e 100644 --- a/daemon/io.c +++ b/daemon/io.c @@ -24,6 +24,7 @@ #include "daemon/network.h" #include "daemon/worker.h" #include "daemon/tls.h" +#include "daemon/session.h" #define negotiate_bufsize(func, handle, bufsize_want) do { \ int bufsize = 0; func(handle, &bufsize); \ @@ -33,101 +34,38 @@ } \ } while (0) -void io_release(uv_handle_t *handle); - static void check_bufsize(uv_handle_t* handle) { + return; /* TODO: resurrect after https://github.com/libuv/libuv/issues/419 */ /* We want to buffer at least N waves in advance. * This is magic presuming we can pull in a whole recvmmsg width in one wave. 
* Linux will double this the bufsize wanted. */ - const int bufsize_want = RECVMMSG_BATCH * 65535 * 2; + const int bufsize_want = 2 * sizeof( ((struct worker_ctx *)NULL)->wire_buf ) ; negotiate_bufsize(uv_recv_buffer_size, handle, bufsize_want); negotiate_bufsize(uv_send_buffer_size, handle, bufsize_want); } #undef negotiate_bufsize -static void session_clear(struct session *s) -{ - assert(s->tasks.len == 0 && s->waiting.len == 0); - array_clear(s->tasks); - array_clear(s->waiting); - tls_free(s->tls_ctx); - tls_client_ctx_free(s->tls_client_ctx); - memset(s, 0, sizeof(*s)); -} - -void session_free(struct session *s) -{ - if (s) { - assert(s->tasks.len == 0 && s->waiting.len == 0); - session_clear(s); - free(s); - } -} - -struct session *session_new(void) -{ - return calloc(1, sizeof(struct session)); -} - -static struct session *session_borrow(struct worker_ctx *worker) -{ - struct session *s = NULL; - if (worker->pool_sessions.len > 0) { - s = array_tail(worker->pool_sessions); - array_pop(worker->pool_sessions); - kr_asan_unpoison(s, sizeof(*s)); - } else { - s = session_new(); - } - return s; -} - -static void session_release(struct worker_ctx *worker, uv_handle_t *handle) -{ - if (!worker || !handle) { - return; - } - struct session *s = handle->data; - if (!s) { - return; - } - assert(s->waiting.len == 0 && s->tasks.len == 0); - assert(s->buffering == NULL); - if (!s->outgoing && handle->type == UV_TCP) { - worker_end_tcp(worker, handle); /* to free the buffering task */ - } - if (worker->pool_sessions.len < MP_FREELIST_SIZE) { - session_clear(s); - array_push(worker->pool_sessions, s); - kr_asan_poison(s, sizeof(*s)); - } else { - session_free(s); - } -} - static void handle_getbuf(uv_handle_t* handle, size_t suggested_size, uv_buf_t* buf) { - /* Worker has single buffer which is reused for all incoming - * datagrams / stream reads, the content of the buffer is + /* UDP sessions use worker buffer for wire data, + * TCP sessions use session buffer for wire 
data + * (see session_set_handle()). + * TLS sessions use buffer from TLS context. + * The content of the worker buffer is * guaranteed to be unchanged only for the duration of * udp_read() and tcp_read(). */ - struct session *session = handle->data; - uv_loop_t *loop = handle->loop; - struct worker_ctx *worker = loop->data; - buf->base = (char *)worker->wire_buf; - /* Limit TCP stream buffer size to 4K for granularity in batches of incoming queries. */ - if (handle->type == UV_TCP) { - buf->len = MIN(suggested_size, 4096); - /* Regular buffer size for subrequests. */ - } else if (session->outgoing) { - buf->len = suggested_size; - /* Use recvmmsg() on master sockets if possible. */ + struct session *s = handle->data; + if (!session_flags(s)->has_tls) { + buf->base = (char *) session_wirebuf_get_free_start(s); + buf->len = session_wirebuf_get_free_size(s); } else { - buf->len = sizeof(worker->wire_buf); + struct tls_common_ctx *ctx = session_tls_get_common_ctx(s); + buf->base = (char *) ctx->recv_buf; + buf->len = sizeof(ctx->recv_buf); } } @@ -137,29 +75,32 @@ void udp_recv(uv_udp_t *handle, ssize_t nread, const uv_buf_t *buf, uv_loop_t *loop = handle->loop; struct worker_ctx *worker = loop->data; struct session *s = handle->data; - if (s->closing) { + if (session_flags(s)->closing) { return; } if (nread <= 0) { if (nread < 0) { /* Error response, notify resolver */ - worker_submit(worker, (uv_handle_t *)handle, NULL, addr); + worker_submit(s, NULL); } /* nread == 0 is for freeing buffers, we don't need to do this */ return; } if (addr->sa_family == AF_UNSPEC) { return; } - if (s->outgoing) { - assert(s->peer.ip.sa_family != AF_UNSPEC); - if (kr_sockaddr_cmp(&s->peer.ip, addr) != 0) { + struct sockaddr *peer = session_get_peer(s); + if (session_flags(s)->outgoing) { + assert(peer->sa_family != AF_UNSPEC); + if (kr_sockaddr_cmp(peer, addr) != 0) { return; } + } else { + memcpy(peer, addr, kr_sockaddr_len(addr)); } - knot_pkt_t *query = knot_pkt_new(buf->base, 
nread, &worker->pkt_pool); - if (query) { - query->max_size = KNOT_WIRE_MAX_PKTSIZE; - worker_submit(worker, (uv_handle_t *)handle, query, addr); - } + ssize_t consumed = session_wirebuf_consume(s, (const uint8_t *)buf->base, + nread); + assert(consumed == nread); (void)consumed; + session_wirebuf_process(s); + session_wirebuf_discard(s); mp_flush(worker->pkt_pool.ctx); } @@ -167,11 +108,9 @@ static int udp_bind_finalize(uv_handle_t *handle) { check_bufsize(handle); /* Handle is already created, just create context. */ - struct session *session = session_new(); - assert(session); - session->outgoing = false; - session->handle = handle; - handle->data = session; + struct session *s = session_new(handle); + assert(s); + session_flags(s)->outgoing = false; return io_start_read(handle); } @@ -201,16 +140,45 @@ int udp_bindfd(uv_udp_t *handle, int fd) return udp_bind_finalize((uv_handle_t *)handle); } -static void tcp_timeout_trigger(uv_timer_t *timer) +void tcp_timeout_trigger(uv_timer_t *timer) { - struct session *session = timer->data; + struct session *s = timer->data; + + assert(!session_flags(s)->closing); + assert(session_waitinglist_is_empty(s)); + + struct worker_ctx *worker = timer->loop->data; - assert(session->outgoing == false); - if (session->tasks.len > 0) { - uv_timer_again(timer); - } else if (!session->closing) { + if (!session_tasklist_is_empty(s)) { + int finalized = session_tasklist_finalize_expired(s); + worker->stats.timeout += finalized; + /* session_tasklist_finalize_expired() may call worker_task_finalize(). + * If session is a source session and there were IO errors, + * worker_task_finalize() can filnalize all tasks and close session. 
*/ + if (session_flags(s)->closing) { + return; + } + + } + if (!session_tasklist_is_empty(s)) { uv_timer_stop(timer); - worker_session_close(session); + session_timer_start(s, tcp_timeout_trigger, + KR_RESOLVE_TIME_LIMIT / 2, + KR_RESOLVE_TIME_LIMIT / 2); + } else { + const struct engine *engine = worker->engine; + const struct network *net = &engine->net; + uint64_t idle_in_timeout = net->tcp.in_idle_timeout; + uint64_t last_activity = session_last_input_activity(s); + uint64_t idle_time = kr_now() - last_activity; + if (idle_time < idle_in_timeout) { + idle_in_timeout -= idle_time; + uv_timer_stop(timer); + session_timer_start(s, tcp_timeout_trigger, + idle_in_timeout, idle_in_timeout); + } else { + session_close(s); + } } } @@ -218,59 +186,71 @@ static void tcp_recv(uv_stream_t *handle, ssize_t nread, const uv_buf_t *buf) { uv_loop_t *loop = handle->loop; struct session *s = handle->data; - if (s->closing) { + + assert(s && session_get_handle(s) == (uv_handle_t *)handle && + handle->type == UV_TCP); + + if (session_flags(s)->closing) { return; } + /* nread might be 0, which does not indicate an error or EOF. * This is equivalent to EAGAIN or EWOULDBLOCK under read(2). 
*/ if (nread == 0) { return; } - if (nread == UV_EOF) { - nread = 0; - } + struct worker_ctx *worker = loop->data; - /* TCP pipelining is rather complicated and requires cooperation from the worker - * so the whole message reassembly and demuxing logic is inside worker */ - int ret = 0; - if (s->has_tls) { - ret = tls_process(worker, handle, (const uint8_t *)buf->base, nread); - } else { - ret = worker_process_tcp(worker, handle, (const uint8_t *)buf->base, nread); + + if (nread < 0 || !buf->base) { + if (kr_verbose_status) { + struct sockaddr *peer = session_get_peer(s); + char peer_str[INET6_ADDRSTRLEN]; + inet_ntop(peer->sa_family, kr_inaddr(peer), + peer_str, sizeof(peer_str)); + kr_log_verbose("[io] => connection to '%s' closed by peer (%s)\n", peer_str, + uv_strerror(nread)); + } + worker_end_tcp(s); + return; + } + + ssize_t consumed = 0; + const uint8_t *data = (const uint8_t *)buf->base; + ssize_t data_len = nread; + if (session_flags(s)->has_tls) { + /* buf->base points to start of the tls receive buffer. + Decode data free space in session wire buffer. */ + consumed = tls_process_input_data(s, (const uint8_t *)buf->base, nread); + data = session_wirebuf_get_free_start(s); + data_len = consumed; } + + /* data points to start of the free space in session wire buffer. + Simple increase internal counter. */ + consumed = session_wirebuf_consume(s, data, data_len); + assert(consumed == data_len); + + int ret = session_wirebuf_process(s); if (ret < 0) { - worker_end_tcp(worker, (uv_handle_t *)handle); - /* Exceeded per-connection quota for outstanding requests - * stop reading from stream and close after last message is processed. */ - if (!s->outgoing && !uv_is_closing((uv_handle_t *)&s->timeout)) { - uv_timer_stop(&s->timeout); - if (s->tasks.len == 0) { - worker_session_close(s); - } else { /* If there are tasks running, defer until they finish. 
*/ - uv_timer_start(&s->timeout, tcp_timeout_trigger, - MAX_TCP_INACTIVITY, MAX_TCP_INACTIVITY); - } - } - /* Connection spawned at least one request, reset its deadline for next query. - * https://tools.ietf.org/html/rfc7766#section-6.2.3 */ - } else if (ret > 0 && !s->outgoing && !s->closing) { - uv_timer_again(&s->timeout); + /* An error has occurred, close the session. */ + worker_end_tcp(s); } + session_wirebuf_compress(s); mp_flush(worker->pkt_pool.ctx); } static void _tcp_accept(uv_stream_t *master, int status, bool tls) { - if (status != 0) { + if (status != 0) { return; } struct worker_ctx *worker = (struct worker_ctx *)master->loop->data; - uv_stream_t *client = worker_iohandle_borrow(worker); + uv_tcp_t *client = malloc(sizeof(uv_tcp_t)); if (!client) { return; } - memset(client, 0, sizeof(*client)); int res = io_create(master->loop, (uv_handle_t *)client, SOCK_STREAM, AF_UNSPEC); if (res) { if (res == UV_EMFILE) { @@ -280,32 +260,32 @@ static void _tcp_accept(uv_stream_t *master, int status, bool tls) /* Since res isn't OK struct session wasn't allocated \ borrowed. * We must release client handle only. */ - worker_iohandle_release(worker, client); + free(client); return; } /* struct session was allocated \ borrowed from memory pool. */ struct session *session = client->data; - assert(session->outgoing == false); + assert(session_flags(session)->outgoing == false); - if (uv_accept(master, client) != 0) { + if (uv_accept(master, (uv_stream_t *)client) != 0) { /* close session, close underlying uv handles and * deallocate (or return to memory pool) memory. */ - worker_session_close(session); + session_close(session); return; } /* Set deadlines for TCP connection and start reading. * It will re-check every half of a request time limit if the connection * is idle and should be terminated, this is an educated guess. 
*/ - - struct sockaddr *addr = &(session->peer.ip); - int addr_len = sizeof(union inaddr); - int ret = uv_tcp_getpeername((uv_tcp_t *)client, addr, &addr_len); - if (ret || addr->sa_family == AF_UNSPEC) { - /* close session, close underlying uv handles and - * deallocate (or return to memory pool) memory. */ - worker_session_close(session); + struct session *s = client->data; + assert(session_flags(s)->outgoing == false); + + struct sockaddr *peer = session_get_peer(s); + int peer_len = sizeof(union inaddr); + int ret = uv_tcp_getpeername(client, peer, &peer_len); + if (ret || peer->sa_family == AF_UNSPEC) { + session_close(s); return; } @@ -314,21 +294,22 @@ static void _tcp_accept(uv_stream_t *master, int status, bool tls) uint64_t idle_in_timeout = net->tcp.in_idle_timeout; uint64_t timeout = KR_CONN_RTT_MAX / 2; - session->has_tls = tls; + session_flags(s)->has_tls = tls; if (tls) { timeout += TLS_MAX_HANDSHAKE_TIME; - if (!session->tls_ctx) { - session->tls_ctx = tls_new(master->loop->data); - if (!session->tls_ctx) { - worker_session_close(session); + struct tls_ctx_t *ctx = session_tls_get_server_ctx(s); + if (!ctx) { + ctx = tls_new(worker); + if (!ctx) { + session_close(s); return; } - session->tls_ctx->c.session = session; - session->tls_ctx->c.handshake_state = TLS_HS_IN_PROGRESS; + ctx->c.session = s; + ctx->c.handshake_state = TLS_HS_IN_PROGRESS; + session_tls_set_server_ctx(s, ctx); } } - uv_timer_t *timer = &session->timeout; - uv_timer_start(timer, tcp_timeout_trigger, timeout, idle_in_timeout); + session_timer_start(s, tcp_timeout_trigger, timeout, idle_in_timeout); io_start_read((uv_handle_t *)client); } @@ -443,13 +424,10 @@ int io_create(uv_loop_t *loop, uv_handle_t *handle, int type, unsigned family) if (ret != 0) { return ret; } - struct worker_ctx *worker = loop->data; - struct session *session = session_borrow(worker); - assert(session); - session->handle = handle; - handle->data = session; - session->timeout.data = session; - 
uv_timer_init(worker->loop, &session->timeout); + struct session *s = session_new(handle); + if (s == NULL) { + ret = -1; + } return ret; } @@ -458,36 +436,16 @@ void io_deinit(uv_handle_t *handle) if (!handle) { return; } - uv_loop_t *loop = handle->loop; - if (loop && loop->data) { - struct worker_ctx *worker = loop->data; - session_release(worker, handle); - } else { - session_free(handle->data); - } + session_free(handle->data); handle->data = NULL; } void io_free(uv_handle_t *handle) { - if (!handle) { - return; - } io_deinit(handle); free(handle); } -void io_release(uv_handle_t *handle) -{ - if (!handle) { - return; - } - uv_loop_t *loop = handle->loop; - struct worker_ctx *worker = loop->data; - io_deinit(handle); - worker_iohandle_release(worker, handle); -} - int io_start_read(uv_handle_t *handle) { switch (handle->type) { diff --git a/daemon/io.h b/daemon/io.h index 428cc62a392bb3c7d866bcf6f86a7141e615d77d..c81b1c996fba295d6bdcef3abd4cfd249a9e3913 100644 --- a/daemon/io.h +++ b/daemon/io.h @@ -25,39 +25,13 @@ struct tls_ctx_t; struct tls_client_ctx_t; -/* Per-session (TCP or UDP) persistent structure, - * that exists between remote counterpart and a local socket. - */ -struct session { - bool outgoing; /**< True: to upstream; false: from a client. */ - bool throttled; - bool has_tls; - bool connected; - bool closing; - union inaddr peer; - uv_handle_t *handle; - uv_timer_t timeout; - struct qr_task *buffering; /**< Worker buffers the incomplete TCP query here. */ - struct tls_ctx_t *tls_ctx; - struct tls_client_ctx_t *tls_client_ctx; - - uint8_t msg_hdr[4]; /**< Buffer for DNS message header. */ - ssize_t msg_hdr_idx; /**< The number of bytes in msg_hdr filled so far. 
*/ - - qr_tasklist_t tasks; - qr_tasklist_t waiting; - ssize_t bytes_to_skip; -}; - -void session_free(struct session *s); -struct session *session_new(void); - int udp_bind(uv_udp_t *handle, struct sockaddr *addr); int udp_bindfd(uv_udp_t *handle, int fd); int tcp_bind(uv_tcp_t *handle, struct sockaddr *addr); int tcp_bind_tls(uv_tcp_t *handle, struct sockaddr *addr); int tcp_bindfd(uv_tcp_t *handle, int fd); int tcp_bindfd_tls(uv_tcp_t *handle, int fd); +void tcp_timeout_trigger(uv_timer_t *timer); /** Initialize the handle, incl. ->data = struct session * instance. * \param type = SOCK_* diff --git a/daemon/lua/kres-gen.lua b/daemon/lua/kres-gen.lua index 7d06d2003f9208aa159c0cde6724210f692d2a0b..ef12802c65054bfb31297a98622e0f8c897762c7 100644 --- a/daemon/lua/kres-gen.lua +++ b/daemon/lua/kres-gen.lua @@ -195,6 +195,7 @@ struct kr_request { trace_callback_f trace_finish; int vars_ref; knot_mm_t pool; + unsigned int uid; }; enum kr_rank {KR_RANK_INITIAL, KR_RANK_OMIT, KR_RANK_TRY, KR_RANK_INDET = 4, KR_RANK_BOGUS, KR_RANK_MISMATCH, KR_RANK_MISSING, KR_RANK_INSECURE, KR_RANK_AUTH = 16, KR_RANK_SECURE = 32}; struct kr_cache { diff --git a/daemon/main.c b/daemon/main.c index 3981c8c9b93864626ffb23da5a90c8c42634ce32..7bced7bcae216e47d1f3e71091554efb8eed1b47 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -727,20 +727,11 @@ int main(int argc, char **argv) return EXIT_FAILURE; } - uv_loop_t *loop = NULL; - /* Bind to passed fds and sockets*/ - if (bind_fds(&engine.net, &args.fd_set, false) != 0 || - bind_fds(&engine.net, &args.tls_fd_set, true) != 0 || - bind_sockets(&engine.net, &args.addr_set, false) != 0 || - bind_sockets(&engine.net, &args.tls_set, true) != 0 - ) { - ret = EXIT_FAILURE; - goto cleanup; - } + uv_loop_t *loop = uv_default_loop(); + worker->loop = loop; + loop->data = worker; /* Catch some signals. 
*/ - - loop = uv_default_loop(); uv_signal_t sigint, sigterm; if (true) ret = uv_signal_init(loop, &sigint); if (!ret) ret = uv_signal_init(loop, &sigterm); @@ -766,10 +757,18 @@ int main(int argc, char **argv) goto cleanup; } + /* Bind to passed fds and sockets*/ + if (bind_fds(&engine.net, &args.fd_set, false) != 0 || + bind_fds(&engine.net, &args.tls_fd_set, true) != 0 || + bind_sockets(&engine.net, &args.addr_set, false) != 0 || + bind_sockets(&engine.net, &args.tls_set, true) != 0 + ) { + ret = EXIT_FAILURE; + goto cleanup; + } + /* Start the scripting engine */ engine_set_moduledir(&engine, args.moduledir); - worker->loop = loop; - loop->data = worker; if (engine_load_sandbox(&engine) != 0) { ret = EXIT_FAILURE; diff --git a/daemon/session.c b/daemon/session.c new file mode 100644 index 0000000000000000000000000000000000000000..1756b630d1509ec29deda52e54265da16bd17524 --- /dev/null +++ b/daemon/session.c @@ -0,0 +1,755 @@ +/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */ + +#include <assert.h> + +#include <libknot/packet/pkt.h> + +#include "lib/defines.h" +#include "daemon/session.h" +#include "daemon/engine.h" +#include "daemon/tls.h" +#include "daemon/worker.h" +#include "daemon/io.h" +#include "lib/generic/queue.h" + +/* Per-session (TCP or UDP) persistent structure, + * that exists between remote counterpart and a local socket. + */ +struct session { + struct session_flags sflags; /**< miscellaneous flags. */ + union inaddr peer; /**< address of peer; is not set for client's UDP sessions. */ + uv_handle_t *handle; /**< libuv handle for IO operations. */ + uv_timer_t timeout; /**< libuv handle for timer. */ + + struct tls_ctx_t *tls_ctx; /**< server side tls-related data. */ + struct tls_client_ctx_t *tls_client_ctx; /**< client side tls-related data. */ + + trie_t *tasks; /**< list of tasks assotiated with given session. */ + queue_t(struct qr_task *) waiting; /**< list of tasks waiting for sending to upstream. */ + + uint8_t *wire_buf; /**< Buffer for DNS message. */ + ssize_t wire_buf_size; /**< Buffer size. */ + ssize_t wire_buf_start_idx; /**< Data start offset in wire_buf. */ + ssize_t wire_buf_end_idx; /**< Data end offset in wire_buf. 
*/ + uint64_t last_input_activity; /**< Either creatoion time or time of peer's last activity */ +}; + +static void on_session_close(uv_handle_t *handle) +{ + struct session *session = handle->data; + assert(session->handle == handle); (void)session; + io_free(handle); +} + +static void on_session_timer_close(uv_handle_t *timer) +{ + struct session *session = timer->data; + uv_handle_t *handle = session->handle; + assert(handle && handle->data == session); + assert (session->sflags.outgoing || handle->type == UV_TCP); + if (!uv_is_closing(handle)) { + uv_close(handle, on_session_close); + } +} + +void session_free(struct session *session) +{ + if (session) { + assert(session_is_empty(session)); + session_clear(session); + free(session); + } +} + +void session_clear(struct session *session) +{ + assert(session_is_empty(session)); + if (session->handle && session->handle->type == UV_TCP) { + free(session->wire_buf); + } + trie_clear(session->tasks); + trie_free(session->tasks); + queue_deinit(session->waiting); + tls_free(session->tls_ctx); + tls_client_ctx_free(session->tls_client_ctx); + memset(session, 0, sizeof(*session)); +} + +void session_close(struct session *session) +{ + assert(session_is_empty(session)); + if (session->sflags.closing) { + return; + } + + uv_handle_t *handle = session->handle; + io_stop_read(handle); + session->sflags.closing = true; + if (session->peer.ip.sa_family != AF_UNSPEC) { + struct worker_ctx *worker = handle->loop->data; + struct sockaddr *peer = &session->peer.ip; + worker_del_tcp_connected(worker, peer); + session->sflags.connected = false; + } + + if (!uv_is_closing((uv_handle_t *)&session->timeout)) { + uv_timer_stop(&session->timeout); + if (session->tls_client_ctx) { + tls_close(&session->tls_client_ctx->c); + } + if (session->tls_ctx) { + tls_close(&session->tls_ctx->c); + } + + session->timeout.data = session; + uv_close((uv_handle_t *)&session->timeout, on_session_timer_close); + } +} + +int session_start_read(struct 
session *session) +{ + return io_start_read(session->handle); +} + +int session_stop_read(struct session *session) +{ + return io_stop_read(session->handle); +} + +int session_waitinglist_push(struct session *session, struct qr_task *task) +{ + queue_push(session->waiting, task); + worker_task_ref(task); + return kr_ok(); +} + +struct qr_task *session_waitinglist_get(const struct session *session) +{ + return queue_head(session->waiting); +} + +struct qr_task *session_waitinglist_pop(struct session *session, bool deref) +{ + struct qr_task *t = session_waitinglist_get(session); + queue_pop(session->waiting); + if (deref) { + worker_task_unref(t); + } + return t; +} + +int session_tasklist_add(struct session *session, struct qr_task *task) +{ + trie_t *t = session->tasks; + uint16_t task_msg_id = 0; + const char *key = NULL; + size_t key_len = 0; + if (session->sflags.outgoing) { + knot_pkt_t *pktbuf = worker_task_get_pktbuf(task); + task_msg_id = knot_wire_get_id(pktbuf->wire); + key = (const char *)&task_msg_id; + key_len = sizeof(task_msg_id); + } else { + key = (const char *)&task; + key_len = sizeof(char *); + } + trie_val_t *v = trie_get_ins(t, key, key_len); + if (unlikely(!v)) { + assert(false); + return kr_error(ENOMEM); + } + if (*v == NULL) { + *v = task; + worker_task_ref(task); + } else if (*v != task) { + assert(false); + return kr_error(EINVAL); + } + return kr_ok(); +} + +int session_tasklist_del(struct session *session, struct qr_task *task) +{ + trie_t *t = session->tasks; + uint16_t task_msg_id = 0; + const char *key = NULL; + size_t key_len = 0; + trie_val_t val; + if (session->sflags.outgoing) { + knot_pkt_t *pktbuf = worker_task_get_pktbuf(task); + task_msg_id = knot_wire_get_id(pktbuf->wire); + key = (const char *)&task_msg_id; + key_len = sizeof(task_msg_id); + } else { + key = (const char *)&task; + key_len = sizeof(char *); + } + int ret = trie_del(t, key, key_len, &val); + if (ret == kr_ok()) { + assert(val == task); + 
worker_task_unref(val); + } + return ret; +} + +struct qr_task *session_tasklist_get_first(struct session *session) +{ + trie_val_t *val = trie_get_first(session->tasks, NULL, NULL); + return val ? (struct qr_task *) *val : NULL; +} + +struct qr_task *session_tasklist_del_first(struct session *session, bool deref) +{ + trie_val_t val = NULL; + int res = trie_del_first(session->tasks, NULL, NULL, &val); + if (res != kr_ok()) { + val = NULL; + } else if (deref) { + worker_task_unref(val); + } + return (struct qr_task *)val; +} +struct qr_task* session_tasklist_del_msgid(const struct session *session, uint16_t msg_id) +{ + trie_t *t = session->tasks; + assert(session->sflags.outgoing); + struct qr_task *ret = NULL; + const char *key = (const char *)&msg_id; + size_t key_len = sizeof(msg_id); + trie_val_t val; + int res = trie_del(t, key, key_len, &val); + if (res == kr_ok()) { + if (worker_task_numrefs(val) > 1) { + ret = val; + } + worker_task_unref(val); + } + return ret; +} + +struct qr_task* session_tasklist_find_msgid(const struct session *session, uint16_t msg_id) +{ + trie_t *t = session->tasks; + assert(session->sflags.outgoing); + struct qr_task *ret = NULL; + trie_val_t *val = trie_get_try(t, (char *)&msg_id, sizeof(msg_id)); + if (val) { + ret = *val; + } + return ret; +} + +struct session_flags *session_flags(struct session *session) +{ + return &session->sflags; +} + +struct sockaddr *session_get_peer(struct session *session) +{ + return &session->peer.ip; +} + +struct tls_ctx_t *session_tls_get_server_ctx(const struct session *session) +{ + return session->tls_ctx; +} + +void session_tls_set_server_ctx(struct session *session, struct tls_ctx_t *ctx) +{ + session->tls_ctx = ctx; +} + +struct tls_client_ctx_t *session_tls_get_client_ctx(const struct session *session) +{ + return session->tls_client_ctx; +} + +void session_tls_set_client_ctx(struct session *session, struct tls_client_ctx_t *ctx) +{ + session->tls_client_ctx = ctx; +} + +struct 
tls_common_ctx *session_tls_get_common_ctx(const struct session *session) +{ + struct tls_common_ctx *tls_ctx = session->sflags.outgoing ? &session->tls_client_ctx->c : + &session->tls_ctx->c; + return tls_ctx; +} + +uv_handle_t *session_get_handle(struct session *session) +{ + return session->handle; +} + +struct session *session_get(uv_handle_t *h) +{ + return h->data; +} + +struct session *session_new(uv_handle_t *handle) +{ + if (!handle) { + return NULL; + } + struct session *session = calloc(1, sizeof(struct session)); + if (!session) { + return NULL; + } + + queue_init(session->waiting); + session->tasks = trie_create(NULL); + if (handle->type == UV_TCP) { + uint8_t *wire_buf = malloc(KNOT_WIRE_MAX_PKTSIZE); + if (!wire_buf) { + free(session); + return NULL; + } + session->wire_buf = wire_buf; + session->wire_buf_size = KNOT_WIRE_MAX_PKTSIZE; + } else if (handle->type == UV_UDP) { + /* We use the singleton buffer from worker for all UDP (!) + * libuv documentation doesn't really guarantee this is OK, + * but the implementation for unix systems does not hold + * the buffer (both UDP and TCP) - always makes a NON-blocking + * syscall that fills the buffer and immediately calls + * the callback, whatever the result of the operation. + * We still need to keep in mind to only touch the buffer + * in this callback... 
*/ + assert(handle->loop->data); + struct worker_ctx *worker = handle->loop->data; + session->wire_buf = worker->wire_buf; + session->wire_buf_size = sizeof(worker->wire_buf); + } + + uv_timer_init(handle->loop, &session->timeout); + + session->handle = handle; + handle->data = session; + session->timeout.data = session; + session_touch(session); + + return session; +} + +size_t session_tasklist_get_len(const struct session *session) +{ + return trie_weight(session->tasks); +} + +size_t session_waitinglist_get_len(const struct session *session) +{ + return queue_len(session->waiting); +} + +bool session_tasklist_is_empty(const struct session *session) +{ + return session_tasklist_get_len(session) == 0; +} + +bool session_waitinglist_is_empty(const struct session *session) +{ + return session_waitinglist_get_len(session) == 0; +} + +bool session_is_empty(const struct session *session) +{ + return session_tasklist_is_empty(session) && + session_waitinglist_is_empty(session); +} + +bool session_has_tls(const struct session *session) +{ + return session->sflags.has_tls; +} + +void session_set_has_tls(struct session *session, bool has_tls) +{ + session->sflags.has_tls = has_tls; +} + +void session_waitinglist_retry(struct session *session, bool increase_timeout_cnt) +{ + while (!session_waitinglist_is_empty(session)) { + struct qr_task *task = session_waitinglist_pop(session, false); + if (increase_timeout_cnt) { + worker_task_timeout_inc(task); + } + worker_task_step(task, NULL, NULL); + worker_task_unref(task); + } +} + +void session_waitinglist_finalize(struct session *session, int status) +{ + while (!session_waitinglist_is_empty(session)) { + struct qr_task *t = session_waitinglist_pop(session, false); + worker_task_finalize(t, status); + worker_task_unref(t); + } +} + +void session_tasklist_finalize(struct session *session, int status) +{ + while (session_tasklist_get_len(session) > 0) { + struct qr_task *t = session_tasklist_del_first(session, false); + 
assert(worker_task_numrefs(t) > 0); + worker_task_finalize(t, status); + worker_task_unref(t); + } +} + +int session_tasklist_finalize_expired(struct session *session) +{ + int ret = 0; + queue_t(struct qr_task *) q; + uint64_t now = kr_now(); + trie_t *t = session->tasks; + trie_it_t *it; + queue_init(q); + for (it = trie_it_begin(t); !trie_it_finished(it); trie_it_next(it)) { + trie_val_t *v = trie_it_val(it); + struct qr_task *task = (struct qr_task *)*v; + if ((now - worker_task_creation_time(task)) >= KR_RESOLVE_TIME_LIMIT) { + queue_push(q, task); + worker_task_ref(task); + } + } + trie_it_free(it); + + struct qr_task *task = NULL; + uint16_t msg_id = 0; + char *key = (char *)&task; + int32_t keylen = sizeof(struct qr_task *); + if (session->sflags.outgoing) { + key = (char *)&msg_id; + keylen = sizeof(msg_id); + } + while (queue_len(q) > 0) { + task = queue_head(q); + if (session->sflags.outgoing) { + knot_pkt_t *pktbuf = worker_task_get_pktbuf(task); + msg_id = knot_wire_get_id(pktbuf->wire); + } + int res = trie_del(t, key, keylen, NULL); + if (!worker_task_finished(task)) { + /* task->pending_count must be zero, + * but there are can be followers, + * so run worker_task_subreq_finalize() to ensure retrying + * for all the followers. 
*/ + worker_task_subreq_finalize(task); + worker_task_finalize(task, KR_STATE_FAIL); + } + if (res == KNOT_EOK) { + worker_task_unref(task); + } + queue_pop(q); + worker_task_unref(task); + ++ret; + } + + queue_deinit(q); + return ret; +} + +int session_timer_start(struct session *session, uv_timer_cb cb, + uint64_t timeout, uint64_t repeat) +{ + uv_timer_t *timer = &session->timeout; + assert(timer->data == session); + int ret = uv_timer_start(timer, cb, timeout, repeat); + if (ret != 0) { + uv_timer_stop(timer); + return kr_error(ENOMEM); + } + return 0; +} + +int session_timer_restart(struct session *session) +{ + return uv_timer_again(&session->timeout); +} + +int session_timer_stop(struct session *session) +{ + return uv_timer_stop(&session->timeout); +} + +ssize_t session_wirebuf_consume(struct session *session, const uint8_t *data, ssize_t len) +{ + if (data != &session->wire_buf[session->wire_buf_end_idx]) { + /* shouldn't happen */ + return kr_error(EINVAL); + } + + if (session->wire_buf_end_idx + len > session->wire_buf_size) { + /* shouldn't happen */ + return kr_error(EINVAL); + } + + session->wire_buf_end_idx += len; + return len; +} + +knot_pkt_t *session_produce_packet(struct session *session, knot_mm_t *mm) +{ + session->sflags.wirebuf_error = false; + if (session->wire_buf_end_idx == 0) { + return NULL; + } + + if (session->wire_buf_start_idx == session->wire_buf_end_idx) { + session->wire_buf_start_idx = 0; + session->wire_buf_end_idx = 0; + return NULL; + } + + if (session->wire_buf_start_idx > session->wire_buf_end_idx) { + session->sflags.wirebuf_error = true; + session->wire_buf_start_idx = 0; + session->wire_buf_end_idx = 0; + return NULL; + } + + const uv_handle_t *handle = session->handle; + uint8_t *msg_start = &session->wire_buf[session->wire_buf_start_idx]; + ssize_t wirebuf_msg_data_size = session->wire_buf_end_idx - session->wire_buf_start_idx; + uint16_t msg_size = wirebuf_msg_data_size; + + if (!handle) { + 
session->sflags.wirebuf_error = true;
+		return NULL;
+	} else if (handle->type == UV_TCP) {
+		if (msg_size < 2) {
+			return NULL;
+		}
+		msg_size = knot_wire_read_u16(msg_start);
+		if (msg_size >= session->wire_buf_size) {
+			session->sflags.wirebuf_error = true;
+			return NULL;
+		}
+		if (msg_size + 2 > wirebuf_msg_data_size) {
+			return NULL;
+		}
+		msg_start += 2;
+	}
+
+	knot_pkt_t *pkt = knot_pkt_new(msg_start, msg_size, mm);
+	if (pkt) {
+		session->sflags.wirebuf_error = false;
+	}
+	return pkt;
+}
+
+int session_discard_packet(struct session *session, const knot_pkt_t *pkt)
+{
+	uv_handle_t *handle = session->handle;
+	/* Pointer to data start in wire_buf */
+	uint8_t *wirebuf_data_start = &session->wire_buf[session->wire_buf_start_idx];
+	/* Number of data bytes in wire_buf */
+	size_t wirebuf_data_size = session->wire_buf_end_idx - session->wire_buf_start_idx;
+	/* Pointer to message start in wire_buf */
+	uint8_t *wirebuf_msg_start = wirebuf_data_start;
+	/* Number of message bytes in wire_buf.
+	 * For UDP it is the same number as wirebuf_data_size. */
+	size_t wirebuf_msg_size = wirebuf_data_size;
+	/* Wire data from parsed packet. */
+	uint8_t *pkt_msg_start = pkt->wire;
+	/* Number of bytes in packet wire buffer. */
+	size_t pkt_msg_size = pkt->size;
+	if (knot_pkt_has_tsig(pkt)) {
+		pkt_msg_size += pkt->tsig_wire.len;
+	}
+
+	session->sflags.wirebuf_error = true;
+
+	if (!handle) {
+		return kr_error(EINVAL);
+	} else if (handle->type == UV_TCP) {
+		/* wire_buf contains TCP DNS message. */
+		if (wirebuf_data_size < 2) {
+			/* TCP message length field isn't in buffer, must not happen. */
+			assert(0);
+			session->wire_buf_start_idx = 0;
+			session->wire_buf_end_idx = 0;
+			return kr_error(EINVAL);
+		}
+		wirebuf_msg_size = knot_wire_read_u16(wirebuf_msg_start);
+		wirebuf_msg_start += 2;
+		if (wirebuf_msg_size + 2 > wirebuf_data_size) {
+			/* TCP message length field is greater than
+			 * number of bytes in buffer, must not happen. 
*/
+			assert(0);
+			session->wire_buf_start_idx = 0;
+			session->wire_buf_end_idx = 0;
+			return kr_error(EINVAL);
+		}
+	}
+
+	if (wirebuf_msg_start != pkt_msg_start) {
+		/* packet wirebuf must be located at the beginning
+		 * of the session wirebuf, must not happen. */
+		assert(0);
+		session->wire_buf_start_idx = 0;
+		session->wire_buf_end_idx = 0;
+		return kr_error(EINVAL);
+	}
+
+	if (wirebuf_msg_size < pkt_msg_size) {
+		/* Message length field is less than packet size,
+		 * must not happen. */
+		assert(0);
+		session->wire_buf_start_idx = 0;
+		session->wire_buf_end_idx = 0;
+		return kr_error(EINVAL);
+	}
+
+	if (handle->type == UV_TCP) {
+		session->wire_buf_start_idx += wirebuf_msg_size + 2;
+	} else {
+		session->wire_buf_start_idx += pkt_msg_size;
+	}
+	session->sflags.wirebuf_error = false;
+
+	wirebuf_data_size = session->wire_buf_end_idx - session->wire_buf_start_idx;
+	if (wirebuf_data_size == 0) {
+		session_wirebuf_discard(session);
+	} else if (wirebuf_data_size < KNOT_WIRE_HEADER_SIZE) {
+		session_wirebuf_compress(session);
+	}
+
+	return kr_ok();
+}
+
+void session_wirebuf_discard(struct session *session)
+{
+	session->wire_buf_start_idx = 0;
+	session->wire_buf_end_idx = 0;
+}
+
+void session_wirebuf_compress(struct session *session)
+{
+	if (session->wire_buf_start_idx == 0) {
+		return;
+	}
+	uint8_t *wirebuf_data_start = &session->wire_buf[session->wire_buf_start_idx];
+	size_t wirebuf_data_size = session->wire_buf_end_idx - session->wire_buf_start_idx;
+	if (session->wire_buf_start_idx < wirebuf_data_size) {
+		memmove(session->wire_buf, wirebuf_data_start, wirebuf_data_size);
+	} else {
+		memcpy(session->wire_buf, wirebuf_data_start, wirebuf_data_size);
+	}
+	session->wire_buf_start_idx = 0;
+	session->wire_buf_end_idx = wirebuf_data_size;
+}
+
+bool session_wirebuf_error(struct session *session)
+{
+	return session->sflags.wirebuf_error;
+}
+
+uint8_t *session_wirebuf_get_start(struct session *session)
+{
+	return session->wire_buf;
+}
+
+size_t 
session_wirebuf_get_len(struct session *session) +{ + return session->wire_buf_end_idx; +} + +size_t session_wirebuf_get_size(struct session *session) +{ + return sizeof(session->wire_buf); +} + +uint8_t *session_wirebuf_get_free_start(struct session *session) +{ + return &session->wire_buf[session->wire_buf_end_idx]; +} + +size_t session_wirebuf_get_free_size(struct session *session) +{ + return session->wire_buf_size - session->wire_buf_end_idx; +} + +void session_poison(struct session *session) +{ + kr_asan_poison(session, sizeof(*session)); +} + +void session_unpoison(struct session *session) +{ + kr_asan_unpoison(session, sizeof(*session)); +} + +int session_wirebuf_process(struct session *session) +{ + int ret = 0; + if (session->wire_buf_start_idx == session->wire_buf_end_idx) { + return ret; + } + struct worker_ctx *worker = session_get_handle(session)->loop->data; + size_t wirebuf_data_size = session->wire_buf_end_idx - session->wire_buf_start_idx; + uint32_t max_iterations = (wirebuf_data_size / (KNOT_WIRE_HEADER_SIZE + KNOT_WIRE_QUESTION_MIN_SIZE)) + 1; + knot_pkt_t *query = NULL; + while (((query = session_produce_packet(session, &worker->pkt_pool)) != NULL) && + (ret < max_iterations)) { + assert (!session_wirebuf_error(session)); + int res = worker_submit(session, query); + if (res != kr_error(EILSEQ)) { + /* Packet has been successfully parsed. */ + ret += 1; + } + if (session_discard_packet(session, query) < 0) { + /* Packet data isn't stored in memory as expected. + something went wrong, normally should not happen. 
*/ + break; + } + } + if (session_wirebuf_error(session)) { + ret = -1; + } + return ret; +} + +void session_kill_ioreq(struct session *s, struct qr_task *task) +{ + if (!s) { + return; + } + assert(s->sflags.outgoing && s->handle); + if (s->sflags.closing) { + return; + } + session_tasklist_del(s, task); + if (s->handle->type == UV_UDP) { + assert(session_tasklist_is_empty(s)); + session_close(s); + return; + } +} + +/** Update timestamp */ +void session_touch(struct session *s) +{ + s->last_input_activity = kr_now(); +} + +uint64_t session_last_input_activity(struct session *s) +{ + return s->last_input_activity; +} diff --git a/daemon/session.h b/daemon/session.h new file mode 100644 index 0000000000000000000000000000000000000000..5855be0a198b752e0ca6e1616005307c62eb14a8 --- /dev/null +++ b/daemon/session.h @@ -0,0 +1,146 @@ +/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#pragma once + +#include <stdbool.h> +#include <uv.h> + +struct qr_task; +struct worker_ctx; +struct session; + +struct session_flags { + bool outgoing : 1; /**< True: to upstream; false: from a client. */ + bool throttled : 1; /**< True: data reading from peer is temporarily stopped. */ + bool has_tls : 1; /**< True: given session uses TLS. */ + bool connected : 1; /**< True: TCP connection is established. 
*/ + bool closing : 1; /**< True: session close sequence is in progress. */ + bool wirebuf_error : 1; /**< True: last operation with wirebuf ended up with an error. */ +}; + +/* Allocate new session for a libuv handle. */ +struct session *session_new(uv_handle_t *handle); +/* Clear and free given session. */ +void session_free(struct session *session); +/* Clear session. */ +void session_clear(struct session *session); +/** Close session. */ +void session_close(struct session *session); +/** Start reading from underlying libuv IO handle. */ +int session_start_read(struct session *session); +/** Stop reading from underlying libuv IO handle. */ +int session_stop_read(struct session *session); + +/** List of tasks been waiting for IO. */ +/** Check if list is empty. */ +bool session_waitinglist_is_empty(const struct session *session); +/** Add task to the end of the list. */ +int session_waitinglist_push(struct session *session, struct qr_task *task); +/** Get the first element. */ +struct qr_task *session_waitinglist_get(const struct session *session); +/** Get the first element and remove it from the list. */ +struct qr_task *session_waitinglist_pop(struct session *session, bool deref); +/** Get the list length. */ +size_t session_waitinglist_get_len(const struct session *session); +/** Retry resolution for each task in the list. */ +void session_waitinglist_retry(struct session *session, bool increase_timeout_cnt); +/** Finalize all tasks in the list. */ +void session_waitinglist_finalize(struct session *session, int status); + +/** List of tasks associated with session. */ +/** Check if list is empty. */ +bool session_tasklist_is_empty(const struct session *session); +/** Get the first element. */ +struct qr_task *session_tasklist_get_first(struct session *session); +/** Get the first element and remove it from the list. */ +struct qr_task *session_tasklist_del_first(struct session *session, bool deref); +/** Get the list length. 
*/ +size_t session_tasklist_get_len(const struct session *session); +/** Add task to the list. */ +int session_tasklist_add(struct session *session, struct qr_task *task); +/** Remove task from the list. */ +int session_tasklist_del(struct session *session, struct qr_task *task); +/** Remove task with given msg_id, session_flags(session)->outgoing must be true. */ +struct qr_task* session_tasklist_del_msgid(const struct session *session, uint16_t msg_id); +/** Find task with given msg_id */ +struct qr_task* session_tasklist_find_msgid(const struct session *session, uint16_t msg_id); +/** Finalize all tasks in the list. */ +void session_tasklist_finalize(struct session *session, int status); +/** Finalize all expired tasks in the list. */ +int session_tasklist_finalize_expired(struct session *session); + +/** Both of task lists (associated & waiting). */ +/** Check if empty. */ +bool session_is_empty(const struct session *session); +/** Get pointer to session flags */ +struct session_flags *session_flags(struct session *session); +/** Get peer address. */ +struct sockaddr *session_get_peer(struct session *session); +/** Get pointer to server-side tls-related data. */ +struct tls_ctx_t *session_tls_get_server_ctx(const struct session *session); +/** Set pointer to server-side tls-related data. */ +void session_tls_set_server_ctx(struct session *session, struct tls_ctx_t *ctx); +/** Get pointer to client-side tls-related data. */ +struct tls_client_ctx_t *session_tls_get_client_ctx(const struct session *session); +/** Set pointer to client-side tls-related data. */ +void session_tls_set_client_ctx(struct session *session, struct tls_client_ctx_t *ctx); +/** Get pointer to that part of tls-related data which has common structure for + * server and client. */ +struct tls_common_ctx *session_tls_get_common_ctx(const struct session *session); + +/** Get pointer to underlying libuv handle for IO operations. 
*/
+uv_handle_t *session_get_handle(struct session *session);
+struct session *session_get(uv_handle_t *h);
+
+/** Start session timer. */
+int session_timer_start(struct session *session, uv_timer_cb cb,
+			uint64_t timeout, uint64_t repeat);
+/** Restart session timer without changing its parameters. */
+int session_timer_restart(struct session *session);
+/** Stop session timer. */
+int session_timer_stop(struct session *session);
+
+/** Get pointer to the beginning of session wirebuffer. */
+uint8_t *session_wirebuf_get_start(struct session *session);
+/** Get size of session wirebuffer. */
+size_t session_wirebuf_get_size(struct session *session);
+/** Get length of data in the session wirebuffer. */
+size_t session_wirebuf_get_len(struct session *session);
+/** Get pointer to the beginning of free space in session wirebuffer. */
+uint8_t *session_wirebuf_get_free_start(struct session *session);
+/** Get amount of free space in session wirebuffer. */
+size_t session_wirebuf_get_free_size(struct session *session);
+/** Discard all data in session wirebuffer. */
+void session_wirebuf_discard(struct session *session);
+/** Move all data to the beginning of the buffer. */
+void session_wirebuf_compress(struct session *session);
+int session_wirebuf_process(struct session *session);
+ssize_t session_wirebuf_consume(struct session *session,
+				const uint8_t *data, ssize_t len);
+
+/** poison session structure with ASAN. */
+void session_poison(struct session *session);
+/** unpoison session structure with ASAN. 
*/ +void session_unpoison(struct session *session); + +knot_pkt_t *session_produce_packet(struct session *session, knot_mm_t *mm); +int session_discard_packet(struct session *session, const knot_pkt_t *pkt); + +void session_kill_ioreq(struct session *s, struct qr_task *task); +/** Update timestamp */ +void session_touch(struct session *s); +uint64_t session_last_input_activity(struct session *s); diff --git a/daemon/tls.c b/daemon/tls.c index 0fdc6f9e164ce08c11bc3a4cb9902121b4ec91a2..3ffbb1595d954aadefc0b320d663efda52e2118c 100644 --- a/daemon/tls.c +++ b/daemon/tls.c @@ -34,6 +34,7 @@ #include "daemon/io.h" #include "daemon/tls.h" #include "daemon/worker.h" +#include "daemon/session.h" #define EPHEMERAL_CERT_EXPIRATION_SECONDS_RENEW_BEFORE 60*60*24*7 #define GNUTLS_PIN_MIN_VERSION 0x030400 @@ -45,6 +46,12 @@ #define DEBUG_MSG(fmt...) #endif +struct async_write_ctx { + uv_write_t write_req; + struct tls_common_ctx *t; + char buf[0]; +}; + static char const server_logstring[] = "tls"; static char const client_logstring[] = "tls_client"; @@ -93,18 +100,16 @@ static ssize_t kres_gnutls_pull(gnutls_transport_ptr_t h, void *buf, size_t len) static void on_write_complete(uv_write_t *req, int status) { assert(req->data != NULL); + struct async_write_ctx *async_ctx = (struct async_write_ctx *)req->data; + struct tls_common_ctx *t = async_ctx->t; + assert(t->write_queue_size); + t->write_queue_size -= 1; free(req->data); - free(req); } -static bool stream_queue_is_empty(uv_stream_t *handle) +static bool stream_queue_is_empty(struct tls_common_ctx *t) { -#if UV_VERSION_HEX >= 0x011900 - return uv_stream_get_write_queue_size(handle) == 0; -#else - /* Assume best case */ - return true; -#endif + return (t->write_queue_size == 0); } static ssize_t kres_gnutls_vec_push(gnutls_transport_ptr_t h, const giovec_t * iov, int iovcnt) @@ -120,9 +125,9 @@ static ssize_t kres_gnutls_vec_push(gnutls_transport_ptr_t h, const giovec_t * i return 0; } - assert(t->session && 
t->session->handle && - t->session->handle->type == UV_TCP); - uv_stream_t *handle = (uv_stream_t *)t->session->handle; + assert(t->session); + uv_stream_t *handle = (uv_stream_t *)session_get_handle(t->session); + assert(handle && handle->type == UV_TCP); /* * This is a little bit complicated. There are two different writes: @@ -143,7 +148,7 @@ static ssize_t kres_gnutls_vec_push(gnutls_transport_ptr_t h, const giovec_t * i /* Try to perform the immediate write first to avoid copy */ int ret = 0; - if (stream_queue_is_empty(handle)) { + if (stream_queue_is_empty(t)) { ret = uv_try_write(handle, uv_buf, iovcnt); DEBUG_MSG("[%s] push %zu <%p> = %d\n", t->client_side ? "tls_client" : "tls", total_len, h, ret); @@ -152,10 +157,17 @@ static ssize_t kres_gnutls_vec_push(gnutls_transport_ptr_t h, const giovec_t * i > 0: number of bytes written (can be less than the supplied buffer size). < 0: negative error code (UV_EAGAIN is returned if no data can be sent immediately). */ - if ((ret == total_len) || (ret < 0 && ret != UV_EAGAIN)) { - /* Either all the data were buffered by libuv or - * uv_try_write() has returned error code other then UV_EAGAIN. + if (ret == total_len) { + /* All the data were buffered by libuv. + * Return. */ + return ret; + } + + if (ret < 0 && ret != UV_EAGAIN) { + /* uv_try_write() has returned error code other then UV_EAGAIN. * Return. 
*/ + ret = -1; + errno = EIO; return ret; } /* Since we are here expression below is true @@ -172,10 +184,14 @@ static ssize_t kres_gnutls_vec_push(gnutls_transport_ptr_t h, const giovec_t * i } /* Fallback when the queue is full, and it's not possible to do an immediate write */ - char *buf = malloc(total_len - ret); - if (buf != NULL) { + char *p = malloc(sizeof(struct async_write_ctx) + total_len - ret); + if (p != NULL) { + struct async_write_ctx *async_ctx = (struct async_write_ctx *)p; + /* Save pointer to session tls context */ + async_ctx->t = t; + char *buf = async_ctx->buf; /* Skip data written in the partial write */ - int to_skip = ret; + size_t to_skip = ret; /* Copy the buffer into owned memory */ size_t off = 0; for (int i = 0; i < iovcnt; ++i) { @@ -197,21 +213,16 @@ static ssize_t kres_gnutls_vec_push(gnutls_transport_ptr_t h, const giovec_t * i uv_buf[0].len = off; /* Create an asynchronous write request */ - uv_write_t *write_req = calloc(1, sizeof(uv_write_t)); - if (write_req != NULL) { - write_req->data = buf; - } else { - free(buf); - errno = ENOMEM; - return -1; - } + uv_write_t *write_req = &async_ctx->write_req; + memset(write_req, 0, sizeof(uv_write_t)); + write_req->data = p; /* Perform an asynchronous write with a callback */ if (uv_write(write_req, handle, uv_buf, 1, on_write_complete) == 0) { ret = total_len; + t->write_queue_size += 1; } else { - free(buf); - free(write_req); + free(p); errno = EIO; ret = -1; } @@ -238,8 +249,9 @@ static int tls_handshake(struct tls_common_ctx *ctx, tls_handshake_cb handshake_ if (err == GNUTLS_E_SUCCESS) { /* Handshake finished, return success */ ctx->handshake_state = TLS_HS_DONE; + struct sockaddr *peer = session_get_peer(session); kr_log_verbose("[%s] TLS handshake with %s has completed\n", - logstring, kr_straddr(&session->peer.ip)); + logstring, kr_straddr(peer)); if (handshake_cb) { handshake_cb(session, 0); } @@ -258,8 +270,9 @@ static int tls_handshake(struct tls_common_ctx *ctx, 
tls_handshake_cb handshake_ /* Handle warning when in verbose mode */ const char *alert_name = gnutls_alert_get_name(gnutls_alert_get(ctx->tls_session)); if (alert_name != NULL) { + struct sockaddr *peer = session_get_peer(session); kr_log_verbose("[%s] TLS alert from %s received: %s\n", - logstring, kr_straddr(&session->peer.ip), alert_name); + logstring, kr_straddr(peer), alert_name); } } return kr_ok(); @@ -354,9 +367,10 @@ void tls_close(struct tls_common_ctx *ctx) assert(ctx->session); if (ctx->handshake_state == TLS_HS_DONE) { + const struct sockaddr *peer = session_get_peer(ctx->session); kr_log_verbose("[%s] closing tls connection to `%s`\n", ctx->client_side ? "tls_client" : "tls", - kr_straddr(&ctx->session->peer.ip)); + kr_straddr(peer)); ctx->handshake_state = TLS_HS_CLOSING; gnutls_bye(ctx->tls_session, GNUTLS_SHUT_RDWR); } @@ -384,12 +398,11 @@ int tls_write(uv_write_t *req, uv_handle_t *handle, knot_pkt_t *pkt, uv_write_cb return kr_error(EINVAL); } - struct session *session = handle->data; - struct tls_common_ctx *tls_ctx = session->outgoing ? &session->tls_client_ctx->c : - &session->tls_ctx->c; + struct session *s = handle->data; + struct tls_common_ctx *tls_ctx = session_tls_get_common_ctx(s); assert (tls_ctx); - assert (session->outgoing == tls_ctx->client_side); + assert (session_flags(s)->outgoing == tls_ctx->client_side); const uint16_t pkt_size = htons(pkt->size); const char *logstring = tls_ctx->client_side ? 
client_logstring : server_logstring; @@ -407,10 +420,14 @@ int tls_write(uv_write_t *req, uv_handle_t *handle, knot_pkt_t *pkt, uv_write_cb const ssize_t submitted = sizeof(pkt_size) + pkt->size; int ret = gnutls_record_uncork(tls_session, GNUTLS_RECORD_WAIT); - if (gnutls_error_is_fatal(ret)) { - kr_log_error("[%s] gnutls_record_uncork failed: %s (%d)\n", - logstring, gnutls_strerror_name(ret), ret); - return kr_error(EIO); + if (ret < 0) { + if (!gnutls_error_is_fatal(ret)) { + return kr_error(EAGAIN); + } else { + kr_log_error("[%s] gnutls_record_uncork failed: %s (%d)\n", + logstring, gnutls_strerror_name(ret), ret); + return kr_error(EIO); + } } if (ret != submitted) { @@ -426,17 +443,16 @@ int tls_write(uv_write_t *req, uv_handle_t *handle, knot_pkt_t *pkt, uv_write_cb return kr_ok(); } -int tls_process(struct worker_ctx *worker, uv_stream_t *handle, const uint8_t *buf, ssize_t nread) +ssize_t tls_process_input_data(struct session *s, const uint8_t *buf, ssize_t nread) { - struct session *session = handle->data; - struct tls_common_ctx *tls_p = session->outgoing ? &session->tls_client_ctx->c : - &session->tls_ctx->c; + struct tls_common_ctx *tls_p = session_tls_get_common_ctx(s); if (!tls_p) { return kr_error(ENOSYS); } - assert(tls_p->session == session); - + assert(tls_p->session == s); + assert(tls_p->recv_buf == buf && nread <= sizeof(tls_p->recv_buf)); + const char *logstring = tls_p->client_side ? 
client_logstring : server_logstring; tls_p->buf = buf; @@ -455,9 +471,11 @@ int tls_process(struct worker_ctx *worker, uv_stream_t *handle, const uint8_t *b } /* See https://gnutls.org/manual/html_node/Data-transfer-and-termination.html#Data-transfer-and-termination */ - int submitted = 0; + ssize_t submitted = 0; + uint8_t *wire_buf = session_wirebuf_get_free_start(s); + size_t wire_buf_size = session_wirebuf_get_free_size(s); while (true) { - ssize_t count = gnutls_record_recv(tls_p->tls_session, tls_p->recv_buf, sizeof(tls_p->recv_buf)); + ssize_t count = gnutls_record_recv(tls_p->tls_session, wire_buf, wire_buf_size); if (count == GNUTLS_E_AGAIN) { break; /* No data available */ } else if (count == GNUTLS_E_INTERRUPTED) { @@ -479,17 +497,15 @@ int tls_process(struct worker_ctx *worker, uv_stream_t *handle, const uint8_t *b kr_log_verbose("[%s] gnutls_record_recv failed: %s (%zd)\n", logstring, gnutls_strerror_name(count), count); return kr_error(EIO); - } - DEBUG_MSG("[%s] submitting %zd data to worker\n", logstring, count); - int ret = worker_process_tcp(worker, handle, tls_p->recv_buf, count); - if (ret < 0) { - return ret; - } - if (count <= 0) { + } else if (count == 0) { break; } - submitted += ret; + DEBUG_MSG("[%s] received %zd data\n", logstring, count); + wire_buf += count; + wire_buf_size -= count; + submitted += count; } + assert(tls_p->consumed == tls_p->nread); return submitted; } @@ -1127,13 +1143,13 @@ int tls_client_connect_start(struct tls_client_ctx_t *client_ctx, return kr_error(EINVAL); } - assert(session->outgoing && session->handle->type == UV_TCP); + assert(session_flags(session)->outgoing && session_get_handle(session)->type == UV_TCP); struct tls_common_ctx *ctx = &client_ctx->c; gnutls_session_set_ptr(ctx->tls_session, client_ctx); gnutls_handshake_set_timeout(ctx->tls_session, ctx->worker->engine->net.tcp.tls_handshake_timeout); - session->tls_client_ctx = client_ctx; + session_tls_set_client_ctx(session, client_ctx); 
ctx->handshake_cb = handshake_cb; ctx->handshake_state = TLS_HS_IN_PROGRESS; ctx->session = session; diff --git a/daemon/tls.h b/daemon/tls.h index d208f4cb80e0acc7a1c48e3082e46d4ef53dccbd..cb3d4a64f1e79b3f7f42254d3416a7991c54ed4f 100644 --- a/daemon/tls.h +++ b/daemon/tls.h @@ -94,9 +94,10 @@ struct tls_common_ctx { const uint8_t *buf; ssize_t nread; ssize_t consumed; - uint8_t recv_buf[4096]; + uint8_t recv_buf[8192]; tls_handshake_cb handshake_cb; struct worker_ctx *worker; + size_t write_queue_size; }; struct tls_ctx_t { @@ -134,7 +135,7 @@ int tls_write(uv_write_t *req, uv_handle_t* handle, knot_pkt_t * pkt, uv_write_c /*! Unwrap incoming data from a TLS stream and pass them to TCP session. * @return the number of newly-completed requests (>=0) or an error code */ -int tls_process(struct worker_ctx *worker, uv_stream_t *handle, const uint8_t *buf, ssize_t nread); +ssize_t tls_process_input_data(struct session *s, const uint8_t *buf, ssize_t nread); /*! Set TLS certificate and key from files. */ int tls_certificate_set(struct network *net, const char *tls_cert, const char *tls_key); diff --git a/daemon/worker.c b/daemon/worker.c index 5a6e2e58357a3018fafa68383ac75b99c053fc10..4412305e1e1a2c5240dbaa4e4927ef78a8f3b051 100644 --- a/daemon/worker.c +++ b/daemon/worker.c @@ -36,6 +36,23 @@ #include "daemon/io.h" #include "daemon/tls.h" #include "daemon/zimport.h" +#include "daemon/session.h" + + +/* Magic defaults for the worker. */ +#ifndef MP_FREELIST_SIZE +# ifdef __clang_analyzer__ +# define MP_FREELIST_SIZE 0 +# else +# define MP_FREELIST_SIZE 64 /**< Maximum length of the worker mempool freelist */ +# endif +#endif +#ifndef QUERY_RATE_THRESHOLD +#define QUERY_RATE_THRESHOLD (2 * MP_FREELIST_SIZE) /**< Nr of parallel queries considered as high rate */ +#endif +#ifndef MAX_PIPELINED +#define MAX_PIPELINED 100 +#endif #define VERBOSE_MSG(qry, fmt...) 
QRVERBOSE(qry, "wrkr", fmt) @@ -52,7 +69,7 @@ struct request_ctx struct session *session; } source; struct worker_ctx *worker; - qr_tasklist_t tasks; + struct qr_task *task; }; /** Query resolution task. */ @@ -61,17 +78,17 @@ struct qr_task struct request_ctx *ctx; knot_pkt_t *pktbuf; qr_tasklist_t waiting; - uv_handle_t *pending[MAX_PENDING]; + struct session *pending[MAX_PENDING]; uint16_t pending_count; uint16_t addrlist_count; uint16_t addrlist_turn; uint16_t timeouts; uint16_t iter_count; - uint16_t bytes_remaining; struct sockaddr *addrlist; uint32_t refs; bool finished : 1; bool leading : 1; + uint64_t creation_time; }; @@ -80,8 +97,6 @@ struct qr_task do { ++(task)->refs; } while(0) #define qr_task_unref(task) \ do { if (task && --(task)->refs == 0) { qr_task_free(task); } } while (0) -#define qr_valid_handle(task, checked) \ - (!uv_is_closing((checked)) || (task)->ctx->source.session->handle == (checked)) /** @internal get key for tcp session * @note kr_straddr() return pointer to static string @@ -93,15 +108,10 @@ static void qr_task_free(struct qr_task *task); static int qr_task_step(struct qr_task *task, const struct sockaddr *packet_source, knot_pkt_t *packet); -static int qr_task_send(struct qr_task *task, uv_handle_t *handle, +static int qr_task_send(struct qr_task *task, struct session *session, struct sockaddr *addr, knot_pkt_t *pkt); static int qr_task_finalize(struct qr_task *task, int state); static void qr_task_complete(struct qr_task *task); -static int worker_add_tcp_connected(struct worker_ctx *worker, - const struct sockaddr *addr, - struct session *session); -static int worker_del_tcp_connected(struct worker_ctx *worker, - const struct sockaddr *addr); static struct session* worker_find_tcp_connected(struct worker_ctx *worker, const struct sockaddr *addr); static int worker_add_tcp_waiting(struct worker_ctx *worker, @@ -111,16 +121,7 @@ static int worker_del_tcp_waiting(struct worker_ctx *worker, const struct sockaddr *addr); static 
struct session* worker_find_tcp_waiting(struct worker_ctx *worker, const struct sockaddr *addr); -static int session_add_waiting(struct session *session, struct qr_task *task); -static int session_del_waiting(struct session *session, struct qr_task *task); -static int session_add_tasks(struct session *session, struct qr_task *task); -static int session_del_tasks(struct session *session, struct qr_task *task); -static void session_close(struct session *session); -static void on_session_idle_timeout(uv_timer_t *timer); -static int timer_start(struct session *session, uv_timer_cb cb, - uint64_t timeout, uint64_t repeat); static void on_tcp_connect_timeout(uv_timer_t *timer); -static void on_tcp_watchdog_timeout(uv_timer_t *timer); /** @internal Get singleton worker. */ static inline struct worker_ctx *get_worker(void) @@ -128,76 +129,9 @@ static inline struct worker_ctx *get_worker(void) return uv_default_loop()->data; } -static inline void *iohandle_borrow(struct worker_ctx *worker) -{ - void *h = NULL; - - const size_t size = sizeof(uv_handles_t); - if (worker->pool_iohandles.len > 0) { - h = array_tail(worker->pool_iohandles); - array_pop(worker->pool_iohandles); - kr_asan_unpoison(h, size); - } else { - h = malloc(size); - } - - return h; -} - -static inline void iohandle_release(struct worker_ctx *worker, void *h) -{ - assert(h); - - if (worker->pool_iohandles.len < MP_FREELIST_SIZE) { - array_push(worker->pool_iohandles, h); - kr_asan_poison(h, sizeof(uv_handles_t)); - } else { - free(h); - } -} - -void *worker_iohandle_borrow(struct worker_ctx *worker) -{ - return iohandle_borrow(worker); -} - -void worker_iohandle_release(struct worker_ctx *worker, void *h) -{ - iohandle_release(worker, h); -} - -static inline void *iorequest_borrow(struct worker_ctx *worker) -{ - void *r = NULL; - - const size_t size = sizeof(uv_reqs_t); - if (worker->pool_ioreqs.len > 0) { - r = array_tail(worker->pool_ioreqs); - array_pop(worker->pool_ioreqs); - kr_asan_unpoison(r, size); - 
} else { - r = malloc(size); - } - - return r; -} - -static inline void iorequest_release(struct worker_ctx *worker, void *r) -{ - assert(r); - - if (worker->pool_ioreqs.len < MP_FREELIST_SIZE) { - array_push(worker->pool_ioreqs, r); - kr_asan_poison(r, sizeof(uv_reqs_t)); - } else { - free(r); - } -} - - /*! @internal Create a UDP/TCP handle for an outgoing AF_INET* connection. * socktype is SOCK_* */ -static uv_handle_t *ioreq_spawn(struct qr_task *task, int socktype, sa_family_t family) +static uv_handle_t *ioreq_spawn(struct worker_ctx *worker, int socktype, sa_family_t family) { bool precond = (socktype == SOCK_DGRAM || socktype == SOCK_STREAM) && (family == AF_INET || family == AF_INET6); @@ -207,13 +141,9 @@ static uv_handle_t *ioreq_spawn(struct qr_task *task, int socktype, sa_family_t return NULL; } - if (task->pending_count >= MAX_PENDING) { - return NULL; - } /* Create connection for iterative query */ - struct worker_ctx *worker = task->ctx->worker; - void *h = iohandle_borrow(worker); - uv_handle_t *handle = (uv_handle_t *)h; + uv_handle_t *handle = malloc(socktype == SOCK_DGRAM + ? sizeof(uv_udp_t) : sizeof(uv_tcp_t)); if (!handle) { return NULL; } @@ -223,7 +153,7 @@ static uv_handle_t *ioreq_spawn(struct qr_task *task, int socktype, sa_family_t worker->too_many_open = true; worker->rconcurrent_highwatermark = worker->stats.rconcurrent; } - iohandle_release(worker, h); + free(handle); return NULL; } @@ -245,203 +175,27 @@ static uv_handle_t *ioreq_spawn(struct qr_task *task, int socktype, sa_family_t } } - /* Set current handle as a subrequest type. 
*/ - struct session *session = handle->data; - if (ret == 0) { - session->outgoing = true; - ret = session_add_tasks(session, task); - } - if (ret < 0) { + if (ret != 0) { io_deinit(handle); - iohandle_release(worker, h); + free(handle); return NULL; } - /* Connect or issue query datagram */ - task->pending[task->pending_count] = handle; - task->pending_count += 1; - return handle; -} -static void on_session_close(uv_handle_t *handle) -{ - uv_loop_t *loop = handle->loop; - struct worker_ctx *worker = loop->data; + /* Set current handle as a subrequest type. */ struct session *session = handle->data; - assert(session->handle == handle); - session->handle = NULL; - io_deinit(handle); - iohandle_release(worker, handle); -} - -static void on_session_timer_close(uv_handle_t *timer) -{ - struct session *session = timer->data; - uv_handle_t *handle = session->handle; - assert(handle && handle->data == session); - assert (session->outgoing || handle->type == UV_TCP); - if (!uv_is_closing(handle)) { - uv_close(handle, on_session_close); - } -} - -static void ioreq_kill_udp(uv_handle_t *req, struct qr_task *task) -{ - assert(req); - struct session *session = req->data; - assert(session->outgoing); - if (session->closing) { - return; - } - uv_timer_stop(&session->timeout); - session_del_tasks(session, task); - assert(session->tasks.len == 0); - session_close(session); -} - -static void ioreq_kill_tcp(uv_handle_t *req, struct qr_task *task) -{ - assert(req); - struct session *session = req->data; - assert(session->outgoing); - if (session->closing) { - return; - } - - session_del_waiting(session, task); - session_del_tasks(session, task); - - int res = 0; - - if (session->outgoing && session->peer.ip.sa_family != AF_UNSPEC && - session->tasks.len == 0 && session->waiting.len == 0 && !session->closing) { - assert(session->peer.ip.sa_family == AF_INET || - session->peer.ip.sa_family == AF_INET6); - res = 1; - if (session->connected) { - /* This is outbound TCP connection which 
can be reused. - * Close it after timeout */ - uv_timer_t *timer = &session->timeout; - timer->data = session; - uv_timer_stop(timer); - res = uv_timer_start(timer, on_session_idle_timeout, - KR_CONN_RTT_MAX, 0); - } - } - - if (res != 0) { - /* if any errors, close the session immediately */ - session_close(session); - } + session_flags(session)->outgoing = true; + /* Connect or issue query datagram */ + return handle; } static void ioreq_kill_pending(struct qr_task *task) { for (uint16_t i = 0; i < task->pending_count; ++i) { - if (task->pending[i]->type == UV_UDP) { - ioreq_kill_udp(task->pending[i], task); - } else if (task->pending[i]->type == UV_TCP) { - ioreq_kill_tcp(task->pending[i], task); - } else { - assert(false); - } + session_kill_ioreq(task->pending[i], task); } task->pending_count = 0; } -static void session_close(struct session *session) -{ - assert(session->tasks.len == 0 && session->waiting.len == 0); - - if (session->closing) { - return; - } - - if (!session->outgoing && session->buffering != NULL) { - qr_task_complete(session->buffering); - } - session->buffering = NULL; - - uv_handle_t *handle = session->handle; - io_stop_read(handle); - session->closing = true; - if (session->outgoing && - session->peer.ip.sa_family != AF_UNSPEC) { - struct worker_ctx *worker = get_worker(); - struct sockaddr *peer = &session->peer.ip; - worker_del_tcp_connected(worker, peer); - session->connected = false; - } - - if (!uv_is_closing((uv_handle_t *)&session->timeout)) { - uv_timer_stop(&session->timeout); - if (session->tls_client_ctx) { - tls_close(&session->tls_client_ctx->c); - } - if (session->tls_ctx) { - tls_close(&session->tls_ctx->c); - } - - session->timeout.data = session; - uv_close((uv_handle_t *)&session->timeout, on_session_timer_close); - } -} - -static int session_add_waiting(struct session *session, struct qr_task *task) -{ - for (int i = 0; i < session->waiting.len; ++i) { - if (session->waiting.at[i] == task) { - return i; - } - } - int ret 
= array_push(session->waiting, task); - if (ret >= 0) { - qr_task_ref(task); - } - return ret; -} - -static int session_del_waiting(struct session *session, struct qr_task *task) -{ - int ret = kr_error(ENOENT); - for (int i = 0; i < session->waiting.len; ++i) { - if (session->waiting.at[i] == task) { - array_del(session->waiting, i); - qr_task_unref(task); - ret = kr_ok(); - break; - } - } - return ret; -} - -static int session_add_tasks(struct session *session, struct qr_task *task) -{ - for (int i = 0; i < session->tasks.len; ++i) { - if (session->tasks.at[i] == task) { - return i; - } - } - int ret = array_push(session->tasks, task); - if (ret >= 0) { - qr_task_ref(task); - } - return ret; -} - -static int session_del_tasks(struct session *session, struct qr_task *task) -{ - int ret = kr_error(ENOENT); - for (int i = 0; i < session->tasks.len; ++i) { - if (session->tasks.at[i] == task) { - array_del(session->tasks, i); - qr_task_unref(task); - ret = kr_ok(); - break; - } - } - return ret; -} - /** @cond This memory layout is internal to mempool.c, use only for debugging. */ #if defined(__SANITIZE_ADDRESS__) struct mempool_chunk { @@ -511,7 +265,8 @@ static int subreq_key(char *dst, knot_pkt_t *pkt) */ static struct request_ctx *request_create(struct worker_ctx *worker, uv_handle_t *handle, - const struct sockaddr *addr) + const struct sockaddr *addr, + uint32_t uid) { knot_mm_t pool = { .ctx = pool_borrow(worker), @@ -529,16 +284,16 @@ static struct request_ctx *request_create(struct worker_ctx *worker, /* TODO Relocate pool to struct request */ ctx->worker = worker; - array_init(ctx->tasks); - struct session *session = handle ? handle->data : NULL; - if (session) { - assert(session->outgoing == false); + struct session *s = handle ? 
handle->data : NULL; + if (s) { + assert(session_flags(s)->outgoing == false); } - ctx->source.session = session; + ctx->source.session = s; struct kr_request *req = &ctx->req; req->pool = pool; req->vars_ref = LUA_NOREF; + req->uid = uid; /* Remember query source addr */ if (!addr || (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)) { @@ -584,8 +339,8 @@ static int request_start(struct request_ctx *ctx, knot_pkt_t *query) struct kr_request *req = &ctx->req; /* source.session can be empty if request was generated by kresd itself */ - if (!ctx->source.session || - ctx->source.session->handle->type == UV_TCP) { + struct session *s = ctx->source.session; + if (!s || session_get_handle(s)->type == UV_TCP) { answer_max = KNOT_WIRE_MAX_PKTSIZE; } else if (knot_pkt_has_edns(query)) { /* EDNS */ answer_max = MAX(knot_edns_get_payload(query->opt_rr), @@ -651,35 +406,6 @@ static void request_free(struct request_ctx *ctx) worker->stats.rconcurrent -= 1; } -static int request_add_tasks(struct request_ctx *ctx, struct qr_task *task) -{ - for (int i = 0; i < ctx->tasks.len; ++i) { - if (ctx->tasks.at[i] == task) { - return i; - } - } - int ret = array_push(ctx->tasks, task); - if (ret >= 0) { - qr_task_ref(task); - } - return ret; -} - -static int request_del_tasks(struct request_ctx *ctx, struct qr_task *task) -{ - int ret = kr_error(ENOENT); - for (int i = 0; i < ctx->tasks.len; ++i) { - if (ctx->tasks.at[i] == task) { - array_del(ctx->tasks, i); - qr_task_unref(task); - ret = kr_ok(); - break; - } - } - return ret; -} - - static struct qr_task *qr_task_create(struct request_ctx *ctx) { /* How much can client handle? 
*/ @@ -695,7 +421,7 @@ static struct qr_task *qr_task_create(struct request_ctx *ctx) if (!task) { return NULL; } - memset(task, 0, sizeof(*task)); /* avoid accidentally unitialized fields */ + memset(task, 0, sizeof(*task)); /* avoid accidentally uninitialized fields */ /* Create packet buffers for answer and subrequests */ knot_pkt_t *pktbuf = knot_pkt_new(NULL, pktbuf_max, &ctx->req.pool); @@ -709,12 +435,11 @@ static struct qr_task *qr_task_create(struct request_ctx *ctx) task->pktbuf = pktbuf; array_init(task->waiting); task->refs = 0; - int ret = request_add_tasks(ctx, task); - if (ret < 0) { - mm_free(&ctx->req.pool, task); - mm_free(&ctx->req.pool, pktbuf); - return NULL; - } + assert(ctx->task == NULL); + ctx->task = task; + /* Make the primary reference to task. */ + qr_task_ref(task); + task->creation_time = kr_now(); ctx->worker->stats.concurrent += 1; return task; } @@ -726,25 +451,9 @@ static void qr_task_free(struct qr_task *task) assert(ctx); - /* Process outbound session. */ - struct session *source_session = ctx->source.session; struct worker_ctx *worker = ctx->worker; - /* Process source session. */ - if (source_session && - source_session->tasks.len < worker->tcp_pipeline_max/2 && - !source_session->closing && source_session->throttled) { - uv_handle_t *handle = source_session->handle; - /* Start reading again if the session is throttled and - * the number of outgoing requests is below watermark. */ - if (handle) { - io_start_read(handle); - source_session->throttled = false; - } - } - - if (ctx->tasks.len == 0) { - array_clear(ctx->tasks); + if (ctx->task == NULL) { request_free(ctx); } @@ -755,14 +464,9 @@ /*@ Register new qr_task within session. 
*/ static int qr_task_register(struct qr_task *task, struct session *session) { - assert(session->outgoing == false && session->handle->type == UV_TCP); - - int ret = array_reserve(session->tasks, session->tasks.len + 1); - if (ret != 0) { - return kr_error(ENOMEM); - } + assert(!session_flags(session)->outgoing && session_get_handle(session)->type == UV_TCP); - session_add_tasks(session, task); + session_tasklist_add(session, task); struct request_ctx *ctx = task->ctx; assert(ctx && (ctx->source.session == NULL || ctx->source.session == session)); @@ -772,12 +476,10 @@ static int qr_task_register(struct qr_task *task, struct session *session) * an in effect shrink TCP window size. To get more precise throttling, * we would need to copy remainder of the unread buffer and reassemble * when resuming reading. This is NYI. */ - if (session->tasks.len >= task->ctx->worker->tcp_pipeline_max) { - uv_handle_t *handle = session->handle; - if (handle && !session->throttled && !session->closing) { - io_stop_read(handle); - session->throttled = true; - } + if (session_tasklist_get_len(session) >= task->ctx->worker->tcp_pipeline_max && + !session_flags(session)->throttled && !session_flags(session)->closing) { + session_stop_read(session); + session_flags(session)->throttled = true; } return 0; @@ -792,136 +494,112 @@ static void qr_task_complete(struct qr_task *task) assert(task->waiting.len == 0); assert(task->leading == false); - struct session *source_session = ctx->source.session; - if (source_session) { - assert(source_session->outgoing == false && - source_session->waiting.len == 0); - session_del_tasks(source_session, task); + struct session *s = ctx->source.session; + if (s) { + assert(!session_flags(s)->outgoing && session_waitinglist_is_empty(s)); + ctx->source.session = NULL; + session_tasklist_del(s, task); } /* Release primary reference to task. 
*/ - request_del_tasks(ctx, task); + if (ctx->task == task) { + ctx->task = NULL; + qr_task_unref(task); + } } /* This is called when we send subrequest / answer */ static int qr_task_on_send(struct qr_task *task, uv_handle_t *handle, int status) { + if (task->finished) { assert(task->leading == false); qr_task_complete(task); - if (!handle || handle->type != UV_TCP) { - return status; - } - struct session* session = handle->data; - assert(session); - if (!session->outgoing || - session->waiting.len == 0) { - return status; - } } - if (handle) { - struct session* session = handle->data; - if (!session->outgoing && task->ctx->source.session) { - assert (task->ctx->source.session->handle == handle); - } - if (handle->type == UV_TCP && session->outgoing && - session->waiting.len > 0) { - session_del_waiting(session, task); - if (session->closing) { - return status; - } - /* Finalize the task, if any errors. - * We can't add it to the end of waiting list for retrying - * since it may lead endless loop in some circumstances - * (for instance: tls; send->tls_push->too many non-critical errors-> - * on_send with nonzero status->re-add to waiting->send->etc).*/ - if (status != 0) { - if (session->outgoing) { - qr_task_finalize(task, KR_STATE_FAIL); - } else { - assert(task->ctx->source.session == session); - task->ctx->source.session = NULL; - } - session_del_tasks(session, task); - } - if (session->waiting.len > 0) { - struct qr_task *t = session->waiting.at[0]; - int ret = qr_task_send(t, handle, &session->peer.ip, t->pktbuf); - if (ret != kr_ok()) { - while (session->waiting.len > 0) { - struct qr_task *t = session->waiting.at[0]; - if (session->outgoing) { - qr_task_finalize(t, KR_STATE_FAIL); - } else { - assert(t->ctx->source.session == session); - t->ctx->source.session = NULL; - } - array_del(session->waiting, 0); - session_del_tasks(session, t); - qr_task_unref(t); - } - while (session->tasks.len > 0) { - struct qr_task *t = session->tasks.at[0]; - if 
(session->outgoing) { - qr_task_finalize(t, KR_STATE_FAIL); - } else { - assert(t->ctx->source.session == session); - t->ctx->source.session = NULL; - } - session_del_tasks(session, t); - } - session_close(session); - return status; - } - } - } - if (!session->closing) { - io_start_read(handle); /* Start reading new query */ - } + if (!handle || handle->type != UV_TCP) { + return status; + } + + struct session* s = handle->data; + assert(s); + if (status != 0) { + session_tasklist_del(s, task); + } + + if (session_flags(s)->outgoing || session_flags(s)->closing) { + return status; } + + struct worker_ctx *worker = task->ctx->worker; + if (session_flags(s)->throttled && + session_tasklist_get_len(s) < worker->tcp_pipeline_max/2) { + /* Start reading again if the session is throttled and + * the number of outgoing requests is below watermark. */ + session_start_read(s); + session_flags(s)->throttled = false; + } + return status; } static void on_send(uv_udp_send_t *req, int status) { - uv_handle_t *handle = (uv_handle_t *)(req->handle); - uv_loop_t *loop = handle->loop; - struct worker_ctx *worker = loop->data; - assert(worker == get_worker()); struct qr_task *task = req->data; - qr_task_on_send(task, handle, status); + uv_handle_t *h = (uv_handle_t *)req->handle; + qr_task_on_send(task, h, status); qr_task_unref(task); - iorequest_release(worker, req); + free(req); } -static void on_task_write(uv_write_t *req, int status) +static void on_write(uv_write_t *req, int status) { - uv_handle_t *handle = (uv_handle_t *)(req->handle); - uv_loop_t *loop = handle->loop; - struct worker_ctx *worker = loop->data; - assert(worker == get_worker()); struct qr_task *task = req->data; - qr_task_on_send(task, handle, status); + uv_handle_t *h = (uv_handle_t *)req->handle; + qr_task_on_send(task, h, status); qr_task_unref(task); - iorequest_release(worker, req); + free(req); } -static int qr_task_send(struct qr_task *task, uv_handle_t *handle, +static int qr_task_send(struct qr_task 
*task, struct session *session, struct sockaddr *addr, knot_pkt_t *pkt) { - if (!handle) { - return qr_task_on_send(task, handle, kr_error(EIO)); + if (!session) { + return qr_task_on_send(task, NULL, kr_error(EIO)); } int ret = 0; struct request_ctx *ctx = task->ctx; - struct worker_ctx *worker = ctx->worker; struct kr_request *req = &ctx->req; - void *ioreq = iorequest_borrow(worker); - if (!ioreq) { - return qr_task_on_send(task, handle, kr_error(ENOMEM)); + + uv_handle_t *handle = session_get_handle(session); + assert(handle && handle->data == session); + const bool is_stream = handle->type == UV_TCP; + if (!is_stream && handle->type != UV_UDP) abort(); + + if (addr == NULL) { + addr = session_get_peer(session); + } + + if (pkt == NULL) { + pkt = worker_task_get_pktbuf(task); + } + + if (session_flags(session)->outgoing && handle->type == UV_TCP) { + size_t try_limit = session_tasklist_get_len(session) + 1; + uint16_t msg_id = knot_wire_get_id(pkt->wire); + size_t try_count = 0; + while (session_tasklist_find_msgid(session, msg_id) && + try_count <= try_limit) { + ++msg_id; + ++try_count; + } + if (try_count > try_limit) { + return kr_error(ENOENT); + } + worker_task_pkt_set_msgid(task, msg_id); } + if (knot_wire_get_qr(pkt->wire) == 0) { /* * Query must be finalised using destination address before @@ -936,24 +614,28 @@ static int qr_task_send(struct qr_task *task, uv_handle_t *handle, * trying to obtain the IP address from it. */ ret = kr_resolve_checkout(req, NULL, addr, - handle->type == UV_UDP ? SOCK_DGRAM : SOCK_STREAM, + is_stream ? SOCK_STREAM : SOCK_DGRAM, pkt); if (ret != 0) { - iorequest_release(worker, ioreq); return ret; } } + uv_handle_t *ioreq = malloc(is_stream ? 
sizeof(uv_write_t) : sizeof(uv_udp_send_t)); + if (!ioreq) { + return qr_task_on_send(task, handle, kr_error(ENOMEM)); + } + /* Pending ioreq on current task */ qr_task_ref(task); + struct worker_ctx *worker = ctx->worker; /* Send using given protocol */ - struct session *session = handle->data; - assert(session->closing == false); - if (session->has_tls) { + assert(!session_flags(session)->closing); + if (session_flags(session)->has_tls) { uv_write_t *write_req = (uv_write_t *)ioreq; write_req->data = task; - ret = tls_write(write_req, handle, pkt, &on_task_write); + ret = tls_write(write_req, handle, pkt, &on_write); } else if (handle->type == UV_UDP) { uv_udp_send_t *send_req = (uv_udp_send_t *)ioreq; uv_buf_t buf = { (char *)pkt->wire, pkt->size }; @@ -967,31 +649,33 @@ static int qr_task_send(struct qr_task *task, uv_handle_t *handle, { (char *)pkt->wire, pkt->size } }; write_req->data = task; - ret = uv_write(write_req, (uv_stream_t *)handle, buf, 2, &on_task_write); + ret = uv_write(write_req, (uv_stream_t *)handle, buf, 2, &on_write); } else { assert(false); } if (ret == 0) { + if (session_flags(session)->outgoing) { + session_tasklist_add(session, task); + } if (worker->too_many_open && worker->stats.rconcurrent < worker->rconcurrent_highwatermark - 10) { worker->too_many_open = false; } } else { - iorequest_release(worker, ioreq); + free(ioreq); qr_task_unref(task); if (ret == UV_EMFILE) { worker->too_many_open = true; worker->rconcurrent_highwatermark = worker->stats.rconcurrent; + ret = kr_error(UV_EMFILE); } } /* Update statistics */ - if (ctx->source.session && - handle != ctx->source.session->handle && - addr) { - if (session->has_tls) + if (session_flags(session)->outgoing && addr) { + if (session_flags(session)->has_tls) worker->stats.tls += 1; else if (handle->type == UV_UDP) worker->stats.udp += 1; @@ -1003,37 +687,28 @@ static int qr_task_send(struct qr_task *task, uv_handle_t *handle, else if (addr->sa_family == AF_INET) worker->stats.ipv4 += 
1; } - - return ret; -} - -static int session_next_waiting_send(struct session *session) -{ - union inaddr *peer = &session->peer; - int ret = kr_ok(); - if (session->waiting.len > 0) { - struct qr_task *task = session->waiting.at[0]; - ret = qr_task_send(task, session->handle, &peer->ip, task->pktbuf); - } return ret; } static int session_tls_hs_cb(struct session *session, int status) { - struct worker_ctx *worker = get_worker(); - union inaddr *peer = &session->peer; - int deletion_res = worker_del_tcp_waiting(worker, &peer->ip); + assert(session_flags(session)->outgoing); + uv_handle_t *handle = session_get_handle(session); + uv_loop_t *loop = handle->loop; + struct worker_ctx *worker = loop->data; + struct sockaddr *peer = session_get_peer(session); + int deletion_res = worker_del_tcp_waiting(worker, peer); int ret = kr_ok(); if (status) { - kr_nsrep_update_rtt(NULL, &peer->ip, KR_NS_DEAD, + kr_nsrep_update_rtt(NULL, peer, KR_NS_DEAD, worker->engine->resolver.cache_rtt, KR_NS_UPDATE_NORESET); return ret; } /* handshake was completed successfully */ - struct tls_client_ctx_t *tls_client_ctx = session->tls_client_ctx; + struct tls_client_ctx_t *tls_client_ctx = session_tls_get_client_ctx(session); struct tls_client_paramlist_entry *tls_params = tls_client_ctx->params; gnutls_session_t tls_session = tls_client_ctx->c.tls_session; if (gnutls_session_is_resumed(tls_session) != 0) { @@ -1054,9 +729,16 @@ static int session_tls_hs_cb(struct session *session, int status) } } - ret = worker_add_tcp_connected(worker, &peer->ip, session); + ret = worker_add_tcp_connected(worker, peer, session); if (deletion_res == kr_ok() && ret == kr_ok()) { - ret = session_next_waiting_send(session); + while (!session_waitinglist_is_empty(session)) { + struct qr_task *t = session_waitinglist_get(session); + ret = qr_task_send(t, session, NULL, NULL); + if (ret != 0) { + break; + } + session_waitinglist_pop(session, true); + } } else { ret = kr_error(EINVAL); } @@ -1066,146 +748,112 @@ 
static int session_tls_hs_cb(struct session *session, int status) * Session isn't in the list of waiting sessions, * or addition to the list of connected sessions failed, * or write to upstream failed. */ - while (session->waiting.len > 0) { - struct qr_task *task = session->waiting.at[0]; - session_del_tasks(session, task); - array_del(session->waiting, 0); - qr_task_finalize(task, KR_STATE_FAIL); - qr_task_unref(task); - } - worker_del_tcp_connected(worker, &peer->ip); - assert(session->tasks.len == 0); + worker_del_tcp_connected(worker, peer); + session_waitinglist_finalize(session, KR_STATE_FAIL); + assert(session_tasklist_is_empty(session)); session_close(session); } else { - uv_timer_stop(&session->timeout); - session->timeout.data = session; - timer_start(session, on_tcp_watchdog_timeout, MAX_TCP_INACTIVITY, 0); + session_timer_stop(session); + session_timer_start(session, tcp_timeout_trigger, + MAX_TCP_INACTIVITY, MAX_TCP_INACTIVITY); } return kr_ok(); } -static struct kr_query *session_current_query(struct session *session) -{ - if (session->waiting.len == 0) { - return NULL; - } - struct qr_task *task = session->waiting.at[0]; - if (task->ctx->req.rplan.pending.len == 0) { +static struct kr_query *task_get_last_pending_query(struct qr_task *task) +{ + if (!task || task->ctx->req.rplan.pending.len == 0) { return NULL; } return array_tail(task->ctx->req.rplan.pending); } + static void on_connect(uv_connect_t *req, int status) { struct worker_ctx *worker = get_worker(); uv_stream_t *handle = req->handle; struct session *session = handle->data; - union inaddr *peer = &session->peer; + struct sockaddr *peer = session_get_peer(session); + free(req); + + assert(session_flags(session)->outgoing); if (status == UV_ECANCELED) { - worker_del_tcp_waiting(worker, &peer->ip); - assert(session->closing && session->waiting.len == 0 && session->tasks.len == 0); - iorequest_release(worker, req); + worker_del_tcp_waiting(worker, peer); + assert(session_is_empty(session) && 
session_flags(session)->closing); return; } - if (session->closing) { - worker_del_tcp_waiting(worker, &peer->ip); - assert(session->waiting.len == 0 && session->tasks.len == 0); - iorequest_release(worker, req); + if (session_flags(session)->closing) { + worker_del_tcp_waiting(worker, peer); + assert(session_is_empty(session)); return; } - uv_timer_stop(&session->timeout); - if (status != 0) { - worker_del_tcp_waiting(worker, &peer->ip); - while (session->waiting.len > 0) { - struct qr_task *task = session->waiting.at[0]; - session_del_tasks(session, task); - array_del(session->waiting, 0); - assert(task->refs > 1); - qr_task_unref(task); - qr_task_step(task, NULL, NULL); - } - assert(session->tasks.len == 0); - iorequest_release(worker, req); + worker_del_tcp_waiting(worker, peer); + assert(session_tasklist_is_empty(session)); + session_waitinglist_retry(session, false); session_close(session); return; } - if (!session->has_tls) { + if (!session_flags(session)->has_tls) { /* if there is a TLS, session still waiting for handshake, * otherwise remove it from waiting list */ - if (worker_del_tcp_waiting(worker, &peer->ip) != 0) { + if (worker_del_tcp_waiting(worker, peer) != 0) { /* session isn't in list of waiting queries, * * something gone wrong */ - while (session->waiting.len > 0) { - struct qr_task *task = session->waiting.at[0]; - session_del_tasks(session, task); - array_del(session->waiting, 0); - ioreq_kill_pending(task); - assert(task->pending_count == 0); - qr_task_finalize(task, KR_STATE_FAIL); - qr_task_unref(task); - } - assert(session->tasks.len == 0); - iorequest_release(worker, req); + session_waitinglist_finalize(session, KR_STATE_FAIL); + assert(session_tasklist_is_empty(session)); session_close(session); return; } } - struct kr_query *qry = session_current_query(session); + struct qr_task *task = session_waitinglist_get(session); + struct kr_query *qry = task_get_last_pending_query(task); WITH_VERBOSE (qry) { - char addr_str[INET6_ADDRSTRLEN]; - 
inet_ntop(session->peer.ip.sa_family, kr_inaddr(&session->peer.ip), - addr_str, sizeof(addr_str)); - VERBOSE_MSG(qry, "=> connected to '%s'\n", addr_str); + struct sockaddr *peer = session_get_peer(session); + char peer_str[INET6_ADDRSTRLEN]; + inet_ntop(peer->sa_family, kr_inaddr(peer), peer_str, sizeof(peer_str)); + VERBOSE_MSG(qry, "=> connected to '%s'\n", peer_str); } - session->connected = true; - session->handle = (uv_handle_t *)handle; + session_flags(session)->connected = true; + session_start_read(session); int ret = kr_ok(); - if (session->has_tls) { - ret = tls_client_connect_start(session->tls_client_ctx, - session, session_tls_hs_cb); + if (session_flags(session)->has_tls) { + struct tls_client_ctx_t *tls_ctx = session_tls_get_client_ctx(session); + ret = tls_client_connect_start(tls_ctx, session, session_tls_hs_cb); if (ret == kr_error(EAGAIN)) { - iorequest_release(worker, req); - io_start_read(session->handle); - timer_start(session, on_tcp_watchdog_timeout, MAX_TCP_INACTIVITY, 0); + session_timer_stop(session); + session_timer_start(session, tcp_timeout_trigger, + MAX_TCP_INACTIVITY, MAX_TCP_INACTIVITY); return; } + } else { + worker_add_tcp_connected(worker, peer, session); } - - if (ret == kr_ok()) { - ret = session_next_waiting_send(session); - if (ret == kr_ok()) { - timer_start(session, on_tcp_watchdog_timeout, MAX_TCP_INACTIVITY, 0); - worker_add_tcp_connected(worker, &session->peer.ip, session); - iorequest_release(worker, req); + while (!session_waitinglist_is_empty(session)) { + struct qr_task *t = session_waitinglist_get(session); + ret = qr_task_send(t, session, NULL, NULL); + if (ret != 0) { + worker_del_tcp_connected(worker, peer); + session_waitinglist_finalize(session, KR_STATE_FAIL); + session_tasklist_finalize(session, KR_STATE_FAIL); + session_close(session); return; } + session_waitinglist_pop(session, true); } - - while (session->waiting.len > 0) { - struct qr_task *task = session->waiting.at[0]; - session_del_tasks(session, 
task); - array_del(session->waiting, 0); - ioreq_kill_pending(task); - assert(task->pending_count == 0); - qr_task_finalize(task, KR_STATE_FAIL); - qr_task_unref(task); - } - - assert(session->tasks.len == 0); - - iorequest_release(worker, req); - session_close(session); + session_timer_stop(session); + session_timer_start(session, tcp_timeout_trigger, + MAX_TCP_INACTIVITY, MAX_TCP_INACTIVITY); } static void on_tcp_connect_timeout(uv_timer_t *timer) @@ -1215,76 +863,26 @@ static void on_tcp_connect_timeout(uv_timer_t *timer) uv_timer_stop(timer); struct worker_ctx *worker = get_worker(); - assert (session->waiting.len == session->tasks.len); + assert (session_tasklist_is_empty(session)); - union inaddr *peer = &session->peer; - worker_del_tcp_waiting(worker, &peer->ip); + struct sockaddr *peer = session_get_peer(session); + worker_del_tcp_waiting(worker, peer); - struct kr_query *qry = session_current_query(session); + struct qr_task *task = session_waitinglist_get(session); + struct kr_query *qry = task_get_last_pending_query(task); WITH_VERBOSE (qry) { - char addr_str[INET6_ADDRSTRLEN]; - inet_ntop(peer->ip.sa_family, kr_inaddr(&peer->ip), addr_str, sizeof(addr_str)); - VERBOSE_MSG(qry, "=> connection to '%s' failed\n", addr_str); + char peer_str[INET6_ADDRSTRLEN]; + inet_ntop(peer->sa_family, kr_inaddr(peer), peer_str, sizeof(peer_str)); + VERBOSE_MSG(qry, "=> connection to '%s' failed\n", peer_str); } - kr_nsrep_update_rtt(NULL, &peer->ip, KR_NS_DEAD, + kr_nsrep_update_rtt(NULL, peer, KR_NS_DEAD, worker->engine->resolver.cache_rtt, KR_NS_UPDATE_NORESET); - while (session->waiting.len > 0) { - struct qr_task *task = session->waiting.at[0]; - assert(task->ctx); - task->timeouts += 1; - worker->stats.timeout += 1; - session_del_tasks(session, task); - array_del(session->waiting, 0); - assert(task->refs > 1); - qr_task_unref(task); - qr_task_step(task, NULL, NULL); - } - - assert (session->tasks.len == 0); - session_close(session); -} - -static void 
on_tcp_watchdog_timeout(uv_timer_t *timer) -{ - struct session *session = timer->data; - - assert(session->outgoing); - uv_timer_stop(timer); - struct worker_ctx *worker = get_worker(); - if (session->outgoing) { - if (session->has_tls) { - worker_del_tcp_waiting(worker, &session->peer.ip); - } - worker_del_tcp_connected(worker, &session->peer.ip); - - while (session->waiting.len > 0) { - struct qr_task *task = session->waiting.at[0]; - task->timeouts += 1; - worker->stats.timeout += 1; - array_del(session->waiting, 0); - session_del_tasks(session, task); - ioreq_kill_pending(task); - assert(task->pending_count == 0); - qr_task_finalize(task, KR_STATE_FAIL); - qr_task_unref(task); - } - } - - while (session->tasks.len > 0) { - struct qr_task *task = session->tasks.at[0]; - task->timeouts += 1; - worker->stats.timeout += 1; - assert(task->refs > 1); - array_del(session->tasks, 0); - ioreq_kill_pending(task); - assert(task->pending_count == 0); - qr_task_finalize(task, KR_STATE_FAIL); - qr_task_unref(task); - } - + worker->stats.timeout += session_waitinglist_get_len(session); + session_waitinglist_retry(session, true); + assert (session_tasklist_is_empty(session)); session_close(session); } @@ -1292,14 +890,14 @@ static void on_tcp_watchdog_timeout(uv_timer_t *timer) static void on_udp_timeout(uv_timer_t *timer) { struct session *session = timer->data; - assert(session->handle->data == session); + assert(session_get_handle(session)->data == session); + assert(session_tasklist_get_len(session) == 1); + assert(session_waitinglist_is_empty(session)); uv_timer_stop(timer); - assert(session->tasks.len == 1); - assert(session->waiting.len == 0); /* Penalize all tried nameservers with a timeout. 
*/ - struct qr_task *task = session->tasks.at[0]; + struct qr_task *task = session_tasklist_get_first(session); struct worker_ctx *worker = task->ctx->worker; if (task->leading && task->pending_count > 0) { struct kr_query *qry = array_tail(task->ctx->req.rplan.pending); @@ -1321,21 +919,6 @@ static void on_udp_timeout(uv_timer_t *timer) qr_task_step(task, NULL, NULL); } -static void on_session_idle_timeout(uv_timer_t *timer) -{ - struct session *s = timer->data; - assert(s); - uv_timer_stop(timer); - if (s->closing) { - return; - } - /* session was not in use during timer timeout - * remove it from connection list and close - */ - assert(s->tasks.len == 0 && s->waiting.len == 0); - session_close(s); -} - static uv_handle_t *retransmit(struct qr_task *task) { uv_handle_t *ret = NULL; @@ -1344,18 +927,28 @@ static uv_handle_t *retransmit(struct qr_task *task) if (!choice) { return ret; } - ret = ioreq_spawn(task, SOCK_DGRAM, choice->sin6_family); + if (task->pending_count >= MAX_PENDING) { + return ret; + } + ret = ioreq_spawn(task->ctx->worker, SOCK_DGRAM, choice->sin6_family); if (!ret) { return ret; } struct sockaddr *addr = (struct sockaddr *)choice; struct session *session = ret->data; - assert (session->peer.ip.sa_family == AF_UNSPEC && session->outgoing); - memcpy(&session->peer, addr, sizeof(session->peer)); - if (qr_task_send(task, ret, (struct sockaddr *)choice, - task->pktbuf) == 0) { + struct sockaddr *peer = session_get_peer(session); + assert (peer->sa_family == AF_UNSPEC && session_flags(session)->outgoing); + memcpy(peer, addr, kr_sockaddr_len(addr)); + if (qr_task_send(task, session, (struct sockaddr *)choice, + task->pktbuf) != 0) { + session_close(session); + ret = NULL; + } else { + task->pending[task->pending_count] = session; + task->pending_count += 1; task->addrlist_turn = (task->addrlist_turn + 1) % task->addrlist_count; /* Round robin */ + session_start_read(session); /* Start reading answer */ } } return ret; @@ -1364,10 +957,10 @@ static 
uv_handle_t *retransmit(struct qr_task *task) static void on_retransmit(uv_timer_t *req) { struct session *session = req->data; - assert(session->tasks.len == 1); + assert(session_tasklist_get_len(session) == 1); uv_timer_stop(req); - struct qr_task *task = session->tasks.at[0]; + struct qr_task *task = session_tasklist_get_first(session); if (retransmit(task) == NULL) { /* Not possible to spawn request, start timeout timer with remaining deadline. */ uint64_t timeout = KR_CONN_RTT_MAX - task->pending_count * KR_CONN_RETRY; @@ -1377,21 +970,11 @@ static void on_retransmit(uv_timer_t *req) } } -static int timer_start(struct session *session, uv_timer_cb cb, - uint64_t timeout, uint64_t repeat) -{ - uv_timer_t *timer = &session->timeout; - assert(timer->data == session); - int ret = uv_timer_start(timer, cb, timeout, repeat); - if (ret != 0) { - uv_timer_stop(timer); - return kr_error(ENOMEM); - } - return 0; -} - static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_source, knot_pkt_t *pkt) { + if (!task || task->finished) { + return; + } /* Close pending timer */ ioreq_kill_pending(task); /* Clear from outgoing table. 
*/ @@ -1461,7 +1044,6 @@ static bool subreq_enqueue(struct qr_task *task) return true; } - static int qr_task_finalize(struct qr_task *task, int state) { assert(task && task->leading == false); @@ -1482,31 +1064,25 @@ static int qr_task_finalize(struct qr_task *task, int state) /* Send back answer */ struct session *source_session = ctx->source.session; - uv_handle_t *handle = source_session->handle; - assert(source_session->closing == false); - assert(handle && handle->data == ctx->source.session); + assert(!session_flags(source_session)->closing); assert(ctx->source.addr.ip.sa_family != AF_UNSPEC); - int res = qr_task_send(task, handle, + int res = qr_task_send(task, source_session, (struct sockaddr *)&ctx->source.addr, ctx->req.answer); if (res != kr_ok()) { (void) qr_task_on_send(task, NULL, kr_error(EIO)); /* Since source session is erroneous detach all tasks. */ - while (source_session->tasks.len > 0) { - struct qr_task *t = source_session->tasks.at[0]; + while (!session_tasklist_is_empty(source_session)) { + struct qr_task *t = session_tasklist_del_first(source_session, false); struct request_ctx *c = t->ctx; assert(c->source.session == source_session); c->source.session = NULL; /* Don't finalize them as there can be other tasks * waiting for answer to this particular task. * (ie. 
task->leading is true) */ - session_del_tasks(source_session, t); + worker_task_unref(t); } session_close(source_session); - } else if (handle->type == UV_TCP && ctx->source.session) { - /* Don't try to close source session at least - * retry_interval_for_timeout_timer milliseconds */ - uv_timer_again(&ctx->source.session->timeout); } qr_task_unref(task); @@ -1533,16 +1109,18 @@ static int qr_task_step(struct qr_task *task, task->addrlist = NULL; task->addrlist_count = 0; task->addrlist_turn = 0; - req->has_tls = (ctx->source.session && ctx->source.session->has_tls); + req->has_tls = (ctx->source.session && session_flags(ctx->source.session)->has_tls); if (worker->too_many_open) { struct kr_rplan *rplan = &req->rplan; if (worker->stats.rconcurrent < worker->rconcurrent_highwatermark - 10) { worker->too_many_open = false; - } else if (packet && kr_rplan_empty(rplan)) { - /* new query; TODO - make this detection more obvious */ - kr_resolve_consume(req, packet_source, packet); + } else { + if (packet && kr_rplan_empty(rplan)) { + /* new query; TODO - make this detection more obvious */ + kr_resolve_consume(req, packet_source, packet); + } return qr_task_finalize(task, KR_STATE_FAIL); } } @@ -1581,7 +1159,8 @@ static int qr_task_step(struct qr_task *task, /* Start transmitting */ uv_handle_t *handle = retransmit(task); if (handle == NULL) { - return qr_task_step(task, NULL, NULL); + subreq_finalize(task, packet_source, packet); + return qr_task_finalize(task, KR_STATE_FAIL); } /* Check current query NSLIST */ struct kr_query *qry = array_tail(req->rplan.pending); @@ -1600,8 +1179,8 @@ static int qr_task_step(struct qr_task *task, */ subreq_lead(task); struct session *session = handle->data; - assert(session->handle->type == UV_UDP); - ret = timer_start(session, on_retransmit, timeout, 0); + assert(session_get_handle(session) == handle && (handle->type == UV_UDP)); + ret = session_timer_start(session, on_retransmit, timeout, 0); /* Start next step with timeout, fatal 
if can't start a timer. */ if (ret != 0) { subreq_finalize(task, packet_source, packet); @@ -1609,129 +1188,80 @@ static int qr_task_step(struct qr_task *task, } } else { assert (sock_type == SOCK_STREAM); + assert(task->pending_count == 0); const struct sockaddr *addr = packet_source ? packet_source : task->addrlist; if (addr->sa_family == AF_UNSPEC) { + /* task->pending_count is zero, but there are can be followers */ subreq_finalize(task, packet_source, packet); return qr_task_finalize(task, KR_STATE_FAIL); } struct session* session = NULL; if ((session = worker_find_tcp_waiting(ctx->worker, addr)) != NULL) { - assert(session->outgoing); - if (session->closing) { - subreq_finalize(task, packet_source, packet); - return qr_task_finalize(task, KR_STATE_FAIL); - } - /* There are waiting tasks. - * It means that connection establishing or data sending - * is coming right now. */ - /* Task will be notified in on_connect() or qr_task_on_send(). */ - ret = session_add_waiting(session, task); - if (ret < 0) { + assert(session_flags(session)->outgoing); + if (session_flags(session)->closing) { subreq_finalize(task, packet_source, packet); return qr_task_finalize(task, KR_STATE_FAIL); } - ret = session_add_tasks(session, task); + /* Connection is in the list of waiting connections. + * It means that connection establishing is coming right now. + * Add task to the end of list of waiting tasks.. + * It will be notified in on_connect() or qr_task_on_send(). 
*/ + ret = session_waitinglist_push(session, task); if (ret < 0) { - session_del_waiting(session, task); subreq_finalize(task, packet_source, packet); return qr_task_finalize(task, KR_STATE_FAIL); } - assert(task->pending_count == 0); - task->pending[task->pending_count] = session->handle; - task->pending_count += 1; } else if ((session = worker_find_tcp_connected(ctx->worker, addr)) != NULL) { /* Connection has been already established */ - assert(session->outgoing); - if (session->closing) { - session_del_tasks(session, task); + assert(session_flags(session)->outgoing); + if (session_flags(session)->closing) { subreq_finalize(task, packet_source, packet); return qr_task_finalize(task, KR_STATE_FAIL); } - if (session->tasks.len >= worker->tcp_pipeline_max) { - session_del_tasks(session, task); - subreq_finalize(task, packet_source, packet); - return qr_task_finalize(task, KR_STATE_FAIL); + while (!session_waitinglist_is_empty(session)) { + struct qr_task *t = session_waitinglist_get(session); + ret = qr_task_send(t, session, NULL, NULL); + if (ret != 0) { + session_waitinglist_finalize(session, KR_STATE_FAIL); + session_tasklist_finalize(session, KR_STATE_FAIL); + subreq_finalize(task, packet_source, packet); + session_close(session); + return qr_task_finalize(task, KR_STATE_FAIL); + } + session_waitinglist_pop(session, true); } - /* will be removed in qr_task_on_send() */ - ret = session_add_waiting(session, task); - if (ret < 0) { - session_del_tasks(session, task); + if (session_tasklist_get_len(session) >= worker->tcp_pipeline_max) { subreq_finalize(task, packet_source, packet); return qr_task_finalize(task, KR_STATE_FAIL); } - ret = session_add_tasks(session, task); - if (ret < 0) { - session_del_waiting(session, task); - session_del_tasks(session, task); + + ret = qr_task_send(task, session, NULL, NULL); + if (ret != 0 /* && ret != kr_error(EMFILE) */) { + session_tasklist_finalize(session, KR_STATE_FAIL); subreq_finalize(task, packet_source, packet); + 
session_close(session); return qr_task_finalize(task, KR_STATE_FAIL); } - if (session->waiting.len == 1) { - ret = qr_task_send(task, session->handle, - &session->peer.ip, task->pktbuf); - if (ret < 0) { - session_del_waiting(session, task); - session_del_tasks(session, task); - while (session->tasks.len != 0) { - struct qr_task *t = session->tasks.at[0]; - qr_task_finalize(t, KR_STATE_FAIL); - session_del_tasks(session, t); - } - subreq_finalize(task, packet_source, packet); - session_close(session); - return qr_task_finalize(task, KR_STATE_FAIL); - } - if (session->tasks.len == 1) { - uv_timer_stop(&session->timeout); - ret = timer_start(session, on_tcp_watchdog_timeout, - MAX_TCP_INACTIVITY, 0); - } - if (ret < 0) { - session_del_waiting(session, task); - session_del_tasks(session, task); - while (session->tasks.len != 0) { - struct qr_task *t = session->tasks.at[0]; - qr_task_finalize(t, KR_STATE_FAIL); - session_del_tasks(session, t); - } - subreq_finalize(task, packet_source, packet); - session_close(session); - return qr_task_finalize(task, KR_STATE_FAIL); - } - } - assert(task->pending_count == 0); - task->pending[task->pending_count] = session->handle; - task->pending_count += 1; } else { /* Make connection */ - uv_connect_t *conn = (uv_connect_t *)iorequest_borrow(ctx->worker); + uv_connect_t *conn = malloc(sizeof(uv_connect_t)); if (!conn) { return qr_task_step(task, NULL, NULL); } - uv_handle_t *client = ioreq_spawn(task, sock_type, + uv_handle_t *client = ioreq_spawn(worker, sock_type, addr->sa_family); if (!client) { - iorequest_release(ctx->worker, conn); + free(conn); subreq_finalize(task, packet_source, packet); return qr_task_finalize(task, KR_STATE_FAIL); } session = client->data; ret = worker_add_tcp_waiting(ctx->worker, addr, session); if (ret < 0) { - session_del_tasks(session, task); - iorequest_release(ctx->worker, conn); - subreq_finalize(task, packet_source, packet); - return qr_task_finalize(task, KR_STATE_FAIL); - } - /* will be removed 
in qr_task_on_send() */ - ret = session_add_waiting(session, task); - if (ret < 0) { - session_del_tasks(session, task); - worker_del_tcp_waiting(ctx->worker, addr); - iorequest_release(ctx->worker, conn); + free(conn); subreq_finalize(task, packet_source, packet); return qr_task_finalize(task, KR_STATE_FAIL); } @@ -1742,52 +1272,54 @@ static int qr_task_step(struct qr_task *task, const char *key = tcpsess_key(addr); struct tls_client_paramlist_entry *entry = map_get(&net->tls_client_params, key); if (entry) { - assert(session->tls_client_ctx == NULL); + assert(session_tls_get_client_ctx(session) == NULL); struct tls_client_ctx_t *tls_ctx = tls_client_ctx_new(entry, worker); if (!tls_ctx) { - session_del_tasks(session, task); - session_del_waiting(session, task); worker_del_tcp_waiting(ctx->worker, addr); - iorequest_release(ctx->worker, conn); + free(conn); subreq_finalize(task, packet_source, packet); return qr_task_step(task, NULL, NULL); } tls_client_ctx_set_session(tls_ctx, session); - session->tls_client_ctx = tls_ctx; - session->has_tls = true; + session_tls_set_client_ctx(session, tls_ctx); + session_flags(session)->has_tls = true; } conn->data = session; - memcpy(&session->peer, addr, sizeof(session->peer)); + struct sockaddr *peer = session_get_peer(session); + memcpy(peer, addr, kr_sockaddr_len(addr)); - ret = timer_start(session, on_tcp_connect_timeout, - KR_CONN_RTT_MAX, 0); + ret = session_timer_start(session, on_tcp_connect_timeout, + KR_CONN_RTT_MAX, 0); if (ret != 0) { - session_del_tasks(session, task); - session_del_waiting(session, task); worker_del_tcp_waiting(ctx->worker, addr); - iorequest_release(ctx->worker, conn); + free(conn); subreq_finalize(task, packet_source, packet); return qr_task_finalize(task, KR_STATE_FAIL); } - struct kr_query *qry = session_current_query(session); + struct kr_query *qry = task_get_last_pending_query(task); WITH_VERBOSE (qry) { - char addr_str[INET6_ADDRSTRLEN]; - inet_ntop(session->peer.ip.sa_family, 
kr_inaddr(&session->peer.ip), addr_str, sizeof(addr_str)); - VERBOSE_MSG(qry, "=> connecting to: '%s'\n", addr_str); + char peer_str[INET6_ADDRSTRLEN]; + inet_ntop(peer->sa_family, kr_inaddr(peer), peer_str, sizeof(peer_str)); + VERBOSE_MSG(qry, "=> connecting to: '%s'\n", peer_str); } if (uv_tcp_connect(conn, (uv_tcp_t *)client, addr , on_connect) != 0) { - uv_timer_stop(&session->timeout); - session_del_tasks(session, task); - session_del_waiting(session, task); + session_timer_stop(session); worker_del_tcp_waiting(ctx->worker, addr); - iorequest_release(ctx->worker, conn); + free(conn); subreq_finalize(task, packet_source, packet); return qr_task_step(task, NULL, NULL); } + + /* will be removed in on_connect() or qr_task_on_send() */ + ret = session_waitinglist_push(session, task); + if (ret < 0) { + subreq_finalize(task, packet_source, packet); + return qr_task_finalize(task, KR_STATE_FAIL); + } } } return kr_ok(); @@ -1814,46 +1346,43 @@ static int parse_packet(knot_pkt_t *query) return ret; } -static struct qr_task* find_task(const struct session *session, uint16_t msg_id) +int worker_submit(struct session *session, knot_pkt_t *query) { - struct qr_task *ret = NULL; - const qr_tasklist_t *tasklist = &session->tasks; - for (size_t i = 0; i < tasklist->len; ++i) { - struct qr_task *task = tasklist->at[i]; - uint16_t task_msg_id = knot_wire_get_id(task->pktbuf->wire); - if (task_msg_id == msg_id) { - ret = task; - break; - } + if (!session) { + assert(false); + return kr_error(EINVAL); } - return ret; -} - -int worker_submit(struct worker_ctx *worker, uv_handle_t *handle, - knot_pkt_t *query, const struct sockaddr* addr) -{ - bool OK = worker && handle && handle->data; + uv_handle_t *handle = session_get_handle(session); + bool OK = handle && handle->loop->data; if (!OK) { assert(false); return kr_error(EINVAL); } - struct session *session = handle->data; + struct worker_ctx *worker = handle->loop->data; /* Parse packet */ int ret = parse_packet(query); + const 
bool is_query = (knot_wire_get_qr(query->wire) == 0); + const bool is_outgoing = session_flags(session)->outgoing; + /* Ignore badly formed queries. */ + if (!query || + (ret != kr_ok() && ret != kr_error(EMSGSIZE)) || + (is_query == is_outgoing)) { + if (query && !is_outgoing) worker->stats.dropped += 1; + return kr_error(EILSEQ); + } + /* Start new task on listening sockets, * or resume if this is subrequest */ struct qr_task *task = NULL; - if (!session->outgoing) { /* request from a client */ - /* Ignore badly formed queries. */ - if (!query || ret != 0 || knot_wire_get_qr(query->wire)) { - if (query) worker->stats.dropped += 1; - return kr_error(EILSEQ); - } - struct request_ctx *ctx = request_create(worker, handle, addr); + struct sockaddr *addr = NULL; + if (!is_outgoing) { /* request from a client */ + struct request_ctx *ctx = request_create(worker, handle, + session_get_peer(session), + knot_wire_get_id(query->wire)); if (!ctx) { return kr_error(ENOMEM); } @@ -1869,21 +1398,23 @@ int worker_submit(struct worker_ctx *worker, uv_handle_t *handle, request_free(ctx); return kr_error(ENOMEM); } - addr = NULL; - } else if (query) { /* response from upstream */ - if ((ret != kr_ok() && ret != kr_error(EMSGSIZE)) || - !knot_wire_get_qr(query->wire)) { - /* Ignore badly formed responses. */ - return kr_error(EILSEQ); + + if (handle->type == UV_TCP && qr_task_register(task, session)) { + return kr_error(ENOMEM); } - task = find_task(session, knot_wire_get_id(query->wire)); + } else if (query) { /* response from upstream */ + task = session_tasklist_del_msgid(session, knot_wire_get_id(query->wire)); if (task == NULL) { return kr_error(ENOENT); } - assert(session->closing == false); + assert(!session_flags(session)->closing); + addr = session_get_peer(session); } - assert(uv_is_closing(session->handle) == false); + assert(uv_is_closing(session_get_handle(session)) == false); + /* Packet was successfully parsed. + * Task was created (found). 
*/ + session_touch(session); /* Consume input and produce next message */ return qr_task_step(task, addr, query); } @@ -1918,7 +1449,7 @@ static struct session* map_find_tcp_session(map_t *map, return ret; } -static int worker_add_tcp_connected(struct worker_ctx *worker, +int worker_add_tcp_connected(struct worker_ctx *worker, const struct sockaddr* addr, struct session *session) { @@ -1931,7 +1462,7 @@ static int worker_add_tcp_connected(struct worker_ctx *worker, return map_add_tcp_session(&worker->tcp_connected, addr, session); } -static int worker_del_tcp_connected(struct worker_ctx *worker, +int worker_del_tcp_connected(struct worker_ctx *worker, const struct sockaddr* addr) { assert(addr && tcpsess_key(addr)); @@ -1970,385 +1501,73 @@ static struct session* worker_find_tcp_waiting(struct worker_ctx *worker, return map_find_tcp_session(&worker->tcp_waiting, addr); } -/* Return DNS/TCP message size. */ -static int get_msg_size(const uint8_t *msg) +int worker_end_tcp(struct session *session) { - return wire_read_u16(msg); -} - -/* If buffering, close last task as it isn't live yet. */ -static void discard_buffered(struct session *session) -{ - if (session->buffering) { - qr_task_free(session->buffering); - session->buffering = NULL; - session->msg_hdr_idx = 0; - } -} - -int worker_end_tcp(struct worker_ctx *worker, uv_handle_t *handle) -{ - if (!worker || !handle) { + if (!session) { return kr_error(EINVAL); } - /* If this is subrequest, notify parent task with empty input - * because in this case session doesn't own tasks, it has just - * borrowed the task from parent session. 
*/ - struct session *session = handle->data; - if (session->outgoing) { - worker_submit(worker, handle, NULL, NULL); - } else { - discard_buffered(session); - } - return 0; -} - -int worker_process_tcp(struct worker_ctx *worker, uv_stream_t *handle, - const uint8_t *msg, ssize_t len) -{ - if (!worker || !handle) { - return kr_error(EINVAL); - } - /* Connection error or forced disconnect */ - struct session *session = handle->data; - assert(session && session->handle == (uv_handle_t *)handle && handle->type == UV_TCP); - if (session->closing) { - return kr_ok(); - } - if (len <= 0 || !msg) { - /* If we have pending tasks, we must dissociate them from the - * connection so they don't try to access closed and freed handle. - * @warning Do not modify task if this is outgoing request - * as it is shared with originator. - */ - struct kr_query *qry = session_current_query(session); - WITH_VERBOSE (qry) { - char addr_str[INET6_ADDRSTRLEN]; - inet_ntop(session->peer.ip.sa_family, kr_inaddr(&session->peer.ip), - addr_str, sizeof(addr_str)); - VERBOSE_MSG(qry, "=> connection to '%s' closed by peer\n", addr_str); - } - uv_timer_t *timer = &session->timeout; - uv_timer_stop(timer); - struct sockaddr *peer = &session->peer.ip; - worker_del_tcp_connected(worker, peer); - session->connected = false; + session_timer_stop(session); + + uv_handle_t *handle = session_get_handle(session); + struct worker_ctx *worker = handle->loop->data; + struct sockaddr *peer = session_get_peer(session); - if (session->tls_client_ctx) { - /* Avoid gnutls_bye() call */ - tls_set_hs_state(&session->tls_client_ctx->c, - TLS_HS_NOT_STARTED); - } + worker_del_tcp_connected(worker, peer); + session_flags(session)->connected = false; - if (session->tls_ctx) { - /* Avoid gnutls_bye() call */ - tls_set_hs_state(&session->tls_ctx->c, - TLS_HS_NOT_STARTED); - } + struct tls_client_ctx_t *tls_client_ctx = session_tls_get_client_ctx(session); + if (tls_client_ctx) { + /* Avoid gnutls_bye() call */ + 
tls_set_hs_state(&tls_client_ctx->c, TLS_HS_NOT_STARTED); + } - if (session->outgoing && session->buffering) { - session->buffering = NULL; - } + struct tls_ctx_t *tls_ctx = session_tls_get_server_ctx(session); + if (tls_ctx) { + /* Avoid gnutls_bye() call */ + tls_set_hs_state(&tls_ctx->c, TLS_HS_NOT_STARTED); + } - assert(session->tasks.len >= session->waiting.len); - while (session->waiting.len > 0) { - struct qr_task *task = session->waiting.at[0]; - array_del(session->waiting, 0); - assert(task->refs > 1); - session_del_tasks(session, task); - if (session->outgoing) { - if (task->ctx->req.options.FORWARD) { - /* We are in TCP_FORWARD mode. - * To prevent failing at kr_resolve_consume() - * qry.flags.TCP must be cleared. - * TODO - refactoring is needed. */ - struct kr_request *req = &task->ctx->req; - struct kr_rplan *rplan = &req->rplan; - struct kr_query *qry = array_tail(rplan->pending); - qry->flags.TCP = false; - } - qr_task_step(task, NULL, NULL); - } else { - assert(task->ctx->source.session == session); - task->ctx->source.session = NULL; - } - qr_task_unref(task); - } - while (session->tasks.len > 0) { - struct qr_task *task = session->tasks.at[0]; - if (session->outgoing) { - if (task->ctx->req.options.FORWARD) { + while (!session_waitinglist_is_empty(session)) { + struct qr_task *task = session_waitinglist_pop(session, false); + assert(task->refs > 1); + session_tasklist_del(session, task); + if (session_flags(session)->outgoing) { + if (task->ctx->req.options.FORWARD) { + /* We are in TCP_FORWARD mode. + * To prevent failing at kr_resolve_consume() + * qry.flags.TCP must be cleared. + * TODO - refactoring is needed. 
*/ struct kr_request *req = &task->ctx->req; struct kr_rplan *rplan = &req->rplan; struct kr_query *qry = array_tail(rplan->pending); qry->flags.TCP = false; - } - qr_task_step(task, NULL, NULL); - } else { - assert(task->ctx->source.session == session); - task->ctx->source.session = NULL; - } - session_del_tasks(session, task); - } - session_close(session); - return kr_ok(); - } - - if (session->bytes_to_skip) { - assert(session->buffering == NULL); - ssize_t min_len = MIN(session->bytes_to_skip, len); - len -= min_len; - msg += min_len; - session->bytes_to_skip -= min_len; - if (len < 0 || session->bytes_to_skip < 0) { - /* Something gone wrong. - * Better kill the connection */ - return kr_error(EILSEQ); - } - if (len == 0) { - return kr_ok(); - } - assert(session->bytes_to_skip == 0); - } - - int submitted = 0; - struct qr_task *task = session->buffering; - knot_pkt_t *pkt_buf = NULL; - if (task) { - pkt_buf = task->pktbuf; - } else { - /* Update DNS header in session->msg_hdr* */ - assert(session->msg_hdr_idx <= sizeof(session->msg_hdr)); - ssize_t hdr_amount = sizeof(session->msg_hdr) - - session->msg_hdr_idx; - if (hdr_amount > len) { - hdr_amount = len; - } - if (hdr_amount > 0) { - memcpy(session->msg_hdr + session->msg_hdr_idx, msg, hdr_amount); - session->msg_hdr_idx += hdr_amount; - len -= hdr_amount; - msg += hdr_amount; - } - if (len == 0) { /* no data beyond msg_hdr -> not much to do */ - return kr_ok(); - } - assert(session->msg_hdr_idx == sizeof(session->msg_hdr)); - session->msg_hdr_idx = 0; - uint16_t msg_size = get_msg_size(session->msg_hdr); - uint16_t msg_id = knot_wire_get_id(session->msg_hdr + 2); - if (msg_size < KNOT_WIRE_HEADER_SIZE) { - /* better kill the connection; we would probably get out of sync */ - uv_timer_t *timer = &session->timeout; - uv_timer_stop(timer); - while (session->waiting.len > 0) { - struct qr_task *task = session->waiting.at[0]; - if (session->outgoing) { - qr_task_finalize(task, KR_STATE_FAIL); - } else { - 
assert(task->ctx->source.session == session); - task->ctx->source.session = NULL; - } - array_del(session->waiting, 0); - session_del_tasks(session, task); - qr_task_unref(task); - } - while (session->tasks.len > 0) { - struct qr_task *task = session->tasks.at[0]; - if (session->outgoing) { - qr_task_finalize(task, KR_STATE_FAIL); - } else { - assert(task->ctx->source.session == session); - task->ctx->source.session = NULL; - } - session_del_tasks(session, task); - } - session_close(session); - - return kr_ok(); - } - - /* get task */ - if (!session->outgoing) { - /* This is a new query, create a new task that we can use - * to buffer incoming message until it's complete. */ - struct sockaddr *addr = &(session->peer.ip); - assert(addr->sa_family != AF_UNSPEC); - struct request_ctx *ctx = request_create(worker, - (uv_handle_t *)handle, - addr); - if (!ctx) { - return kr_error(ENOMEM); - } - task = qr_task_create(ctx); - if (!task) { - request_free(ctx); - return kr_error(ENOMEM); } + qr_task_step(task, NULL, NULL); } else { - /* Start of response from upstream. - * The session task list must contain a task - * with the same msg id. */ - task = find_task(session, msg_id); - /* FIXME: on high load over one connection, it's likely - * that we will get multiple matches sooner or later (!) */ - if (task) { - /* Make sure we can process maximum packet sizes over TCP for outbound queries. - * Previous packet is allocated with mempool, so there's no need to free it manually. 
*/ - if (task->pktbuf->max_size < KNOT_WIRE_MAX_PKTSIZE) { - knot_mm_t *pool = &task->pktbuf->mm; - pkt_buf = knot_pkt_new(NULL, KNOT_WIRE_MAX_PKTSIZE, pool); - if (!pkt_buf) { - return kr_error(ENOMEM); - } - task->pktbuf = pkt_buf; - } - knot_pkt_clear(task->pktbuf); - assert(task->leading == false); - } else { - session->bytes_to_skip = msg_size - 2; - ssize_t min_len = MIN(session->bytes_to_skip, len); - len -= min_len; - msg += min_len; - session->bytes_to_skip -= min_len; - if (len < 0 || session->bytes_to_skip < 0) { - /* Something gone wrong. - * Better kill the connection */ - return kr_error(EILSEQ); - } - if (len == 0) { - return submitted; - } - assert(session->bytes_to_skip == 0); - int ret = worker_process_tcp(worker, handle, msg, len); - if (ret < 0) { - submitted = ret; - } else { - submitted += ret; - } - return submitted; - } - } - - pkt_buf = task->pktbuf; - knot_wire_set_id(pkt_buf->wire, msg_id); - pkt_buf->size = 2; - task->bytes_remaining = msg_size - 2; - assert(session->buffering == NULL); - session->buffering = task; - } - /* At this point session must have either created new task - * or it's already assigned. */ - assert(task); - assert(len > 0); - - /* Message is too long, can't process it. 
*/ - ssize_t to_read = MIN(len, task->bytes_remaining); - if (pkt_buf->size + to_read > pkt_buf->max_size) { - // TODO reallocate pkt_buf - pkt_buf->size = 0; - len -= to_read; - msg += to_read; - session->bytes_to_skip = task->bytes_remaining - to_read; - task->bytes_remaining = 0; - if (session->buffering) { - if (!session->outgoing) { - qr_task_complete(session->buffering); - } - session->buffering = NULL; - } - if (len > 0) { - int ret = worker_process_tcp(worker, handle, msg, len); - if (ret < 0) { - submitted = ret; - } else { - submitted += ret; - } - } - return submitted; - } - /* Buffer message and check if it's complete */ - memcpy(pkt_buf->wire + pkt_buf->size, msg, to_read); - pkt_buf->size += to_read; - task->bytes_remaining -= to_read; - len -= to_read; - msg += to_read; - if (task->bytes_remaining == 0) { - /* Message was assembled, clear temporary. */ - session->buffering = NULL; - session->msg_hdr_idx = 0; - const struct sockaddr *addr = NULL; - knot_pkt_t *pkt = pkt_buf; - if (session->outgoing) { - addr = &session->peer.ip; - assert ((task->pending_count == 1) && (task->pending[0] == session->handle)); - task->pending_count = 0; - session_del_tasks(session, task); - } - /* Parse the packet and start resolving complete query */ - int ret = parse_packet(pkt); - if (ret == 0) { - if (session->outgoing) { - /* To prevent slow lorris attack restart watchdog only after - * the whole message was successfully assembled and parsed */ - if (session->tasks.len > 0 || session->waiting.len > 0) { - uv_timer_stop(&session->timeout); - timer_start(session, on_tcp_watchdog_timeout, MAX_TCP_INACTIVITY, 0); - } - } else { - /* Start only new queries, - * not subrequests that are already pending */ - ret = request_start(task->ctx, pkt); - if (ret != 0) { - /* Allocation of answer buffer has failed. - * We can't notify client about failure, - * so just end the task processing. 
*/ - qr_task_complete(task); - goto next_msg; - } - - ret = qr_task_register(task, session); - if (ret != 0) { - /* Answer buffer has been allocated, - * but task can't be attached to the given - * session due to memory problems. - * Finalize the task, otherwise it becomes orphaned. */ - knot_pkt_init_response(task->ctx->req.answer, pkt); - qr_task_finalize(task, KR_STATE_FAIL); - goto next_msg; - } - submitted += 1; - if (task->leading) { - assert(false); - } + assert(task->ctx->source.session == session); + task->ctx->source.session = NULL; + } + worker_task_unref(task); + } + while (!session_tasklist_is_empty(session)) { + struct qr_task *task = session_tasklist_del_first(session, false); + if (session_flags(session)->outgoing) { + if (task->ctx->req.options.FORWARD) { + struct kr_request *req = &task->ctx->req; + struct kr_rplan *rplan = &req->rplan; + struct kr_query *qry = array_tail(rplan->pending); + qry->flags.TCP = false; } - } else if (session->outgoing) { - /* Drop malformed packet and retry resolution */ - pkt = NULL; - ret = 0; + qr_task_step(task, NULL, NULL); } else { - qr_task_complete(task); - } - /* Only proceed if the message is valid, or it's an invalid response to - * an outbound query which needs to be treated as a timeout. 
*/ - if (ret == 0) { - /* since there can be next dns message, we must to proceed - * even if qr_task_step() returns error */ - qr_task_step(task, addr, pkt); - } -next_msg: - if (len > 0) { - /* TODO: this is simple via iteration; recursion doesn't really help */ - ret = worker_process_tcp(worker, handle, msg, len); - if (ret < 0) { - return ret; - } - submitted += ret; + assert(task->ctx->source.session == session); + task->ctx->source.session = NULL; } + worker_task_unref(task); } - assert(submitted >= 0); - return submitted; + session_close(session); + return kr_ok(); } struct qr_task *worker_resolve_start(struct worker_ctx *worker, knot_pkt_t *query, struct kr_qflags options) @@ -2358,7 +1577,8 @@ struct qr_task *worker_resolve_start(struct worker_ctx *worker, knot_pkt_t *quer return NULL; } - struct request_ctx *ctx = request_create(worker, NULL, NULL); + + struct request_ctx *ctx = request_create(worker, NULL, NULL, worker->next_request_uid); if (!ctx) { return NULL; } @@ -2375,12 +1595,17 @@ struct qr_task *worker_resolve_start(struct worker_ctx *worker, knot_pkt_t *quer if (ret != 0) { /* task is attached to request context, * so dereference (and deallocate) it first */ - request_del_tasks(ctx, task); - array_clear(ctx->tasks); + ctx->task = NULL; + qr_task_unref(task); request_free(ctx); return NULL; } + worker->next_request_uid += 1; + if (worker->next_request_uid == 0) { + worker->next_request_uid = UINT16_MAX + 1; + } + /* Set options late, as qr_task_start() -> kr_resolve_begin() rewrite it. 
*/ kr_qflags_set(&task->ctx->req.options, options); return task; @@ -2394,6 +1619,11 @@ int worker_resolve_exec(struct qr_task *task, knot_pkt_t *query) return qr_task_step(task, NULL, query); } +int worker_task_numrefs(const struct qr_task *task) +{ + return task->refs; +} + struct kr_request *worker_task_request(struct qr_task *task) { if (!task || !task->ctx) { @@ -2408,22 +1638,86 @@ int worker_task_finalize(struct qr_task *task, int state) return qr_task_finalize(task, state); } -void worker_session_close(struct session *session) + int worker_task_step(struct qr_task *task, const struct sockaddr *packet_source, + knot_pkt_t *packet) + { + return qr_task_step(task, packet_source, packet); + } + +void worker_task_complete(struct qr_task *task) { - session_close(session); + return qr_task_complete(task); +} + +void worker_task_ref(struct qr_task *task) +{ + qr_task_ref(task); } +void worker_task_unref(struct qr_task *task) +{ + qr_task_unref(task); +} + +void worker_task_timeout_inc(struct qr_task *task) +{ + task->timeouts += 1; +} + +knot_pkt_t *worker_task_get_pktbuf(const struct qr_task *task) +{ + return task->pktbuf; +} + +struct request_ctx *worker_task_get_request(struct qr_task *task) +{ + return task->ctx; +} + +struct session *worker_request_get_source_session(struct request_ctx *ctx) +{ + return ctx->source.session; +} + +void worker_request_set_source_session(struct request_ctx *ctx, struct session *session) +{ + ctx->source.session = session; +} + +uint16_t worker_task_pkt_get_msgid(struct qr_task *task) +{ + knot_pkt_t *pktbuf = worker_task_get_pktbuf(task); + uint16_t msg_id = knot_wire_get_id(pktbuf->wire); + return msg_id; +} + +void worker_task_pkt_set_msgid(struct qr_task *task, uint16_t msgid) +{ + knot_pkt_t *pktbuf = worker_task_get_pktbuf(task); + knot_wire_set_id(pktbuf->wire, msgid); + struct kr_query *q = task_get_last_pending_query(task); + q->id = msgid; +} + +uint64_t worker_task_creation_time(struct qr_task *task) +{ + return 
task->creation_time; +} + +void worker_task_subreq_finalize(struct qr_task *task) +{ + subreq_finalize(task, NULL, NULL); +} + +bool worker_task_finished(struct qr_task *task) +{ + return task->finished; +} /** Reserve worker buffers */ static int worker_reserve(struct worker_ctx *worker, size_t ring_maxlen) { array_init(worker->pool_mp); - array_init(worker->pool_ioreqs); - array_init(worker->pool_iohandles); - array_init(worker->pool_sessions); - if (array_reserve(worker->pool_mp, ring_maxlen) || - array_reserve(worker->pool_ioreqs, ring_maxlen) || - array_reserve(worker->pool_iohandles, ring_maxlen) || - array_reserve(worker->pool_sessions, ring_maxlen)) { + if (array_reserve(worker->pool_mp, ring_maxlen)) { return kr_error(ENOMEM); } memset(&worker->pkt_pool, 0, sizeof(worker->pkt_pool)); @@ -2437,20 +1731,19 @@ static int worker_reserve(struct worker_ctx *worker, size_t ring_maxlen) return kr_ok(); } -#define reclaim_freelist(list, type, cb) \ - for (unsigned i = 0; i < list.len; ++i) { \ - void *elm = list.at[i]; \ - kr_asan_unpoison(elm, sizeof(type)); \ - cb(elm); \ - } \ - array_clear(list) +static inline void reclaim_mp_freelist(mp_freelist_t *list) +{ + for (unsigned i = 0; i < list->len; ++i) { + struct mempool *e = list->at[i]; + kr_asan_unpoison(e, sizeof(*e)); + mp_delete(e); + } + array_clear(*list); +} void worker_reclaim(struct worker_ctx *worker) { - reclaim_freelist(worker->pool_mp, struct mempool, mp_delete); - reclaim_freelist(worker->pool_ioreqs, uv_reqs_t, free); - reclaim_freelist(worker->pool_iohandles, uv_handles_t, free); - reclaim_freelist(worker->pool_sessions, struct session, session_free); + reclaim_mp_freelist(&worker->pool_mp); mp_delete(worker->pkt_pool.ctx); worker->pkt_pool.ctx = NULL; trie_free(worker->subreq_out); @@ -2482,6 +1775,7 @@ struct worker_ctx *worker_create(struct engine *engine, knot_mm_t *pool, worker->id = worker_id; worker->count = worker_count; worker->engine = engine; + worker->next_request_uid = UINT16_MAX + 
1; worker_reserve(worker, MP_FREELIST_SIZE); worker->out_addr4.sin_family = AF_UNSPEC; worker->out_addr6.sin6_family = AF_UNSPEC; diff --git a/daemon/worker.h b/daemon/worker.h index 3acecfd0eab6721f1f5fcc5a8b1cfcda01f1c00f..3d9ade8bc48d40af8413f24a940aefb8c3d4ba86 100644 --- a/daemon/worker.h +++ b/daemon/worker.h @@ -37,30 +37,17 @@ struct worker_ctx *worker_create(struct engine *engine, knot_mm_t *pool, /** * Process an incoming packet (query from a client or answer from upstream). * - * @param worker the singleton worker - * @param handle socket through which the request came - * @param query the packet, or NULL on an error from the transport layer - * @param addr the address from which the packet came (or NULL, possibly, on error) + * @param session session the where packet came from + * @param query the packet, or NULL on an error from the transport layer * @return 0 or an error code */ -int worker_submit(struct worker_ctx *worker, uv_handle_t *handle, knot_pkt_t *query, - const struct sockaddr* addr); - -/** - * Process incoming DNS message fragment(s) that arrived over a stream (TCP, TLS). - * - * If the fragment contains only a partial message, it is buffered. - * If the fragment contains a complete query or completes current fragment, execute it. - * @return the number of newly-completed requests (>=0) or an error code - */ -int worker_process_tcp(struct worker_ctx *worker, uv_stream_t *handle, - const uint8_t *msg, ssize_t len); +int worker_submit(struct session *session, knot_pkt_t *query); /** * End current DNS/TCP session, this disassociates pending tasks from this session * which may be freely closed afterwards. */ -int worker_end_tcp(struct worker_ctx *worker, uv_handle_t *handle); +int worker_end_tcp(struct session *session); /** * Start query resolution with given query. 
@@ -83,16 +70,42 @@ struct kr_request *worker_task_request(struct qr_task *task); /** Collect worker mempools */ void worker_reclaim(struct worker_ctx *worker); -/** Closes given session */ -void worker_session_close(struct session *session); - -void *worker_iohandle_borrow(struct worker_ctx *worker); +int worker_task_step(struct qr_task *task, const struct sockaddr *packet_source, + knot_pkt_t *packet); -void worker_iohandle_release(struct worker_ctx *worker, void *h); +int worker_task_numrefs(const struct qr_task *task); /** Finalize given task */ int worker_task_finalize(struct qr_task *task, int state); +void worker_task_complete(struct qr_task *task); + +void worker_task_ref(struct qr_task *task); + +void worker_task_unref(struct qr_task *task); + +void worker_task_timeout_inc(struct qr_task *task); + +int worker_add_tcp_connected(struct worker_ctx *worker, + const struct sockaddr *addr, + struct session *session); +int worker_del_tcp_connected(struct worker_ctx *worker, + const struct sockaddr *addr); + +knot_pkt_t *worker_task_get_pktbuf(const struct qr_task *task); + +struct request_ctx *worker_task_get_request(struct qr_task *task); + +struct session *worker_request_get_source_session(struct request_ctx *); + +void worker_request_set_source_session(struct request_ctx *, struct session *session); + +uint16_t worker_task_pkt_get_msgid(struct qr_task *task); +void worker_task_pkt_set_msgid(struct qr_task *task, uint16_t msgid); +uint64_t worker_task_creation_time(struct qr_task *task); +void worker_task_subreq_finalize(struct qr_task *task); +bool worker_task_finished(struct qr_task *task); + /** @cond internal */ /** Number of request within timeout window. 
*/ @@ -101,15 +114,16 @@ int worker_task_finalize(struct qr_task *task, int state); /** Maximum response time from TCP upstream, milliseconds */ #define MAX_TCP_INACTIVITY (KR_RESOLVE_TIME_LIMIT + KR_CONN_RTT_MAX) +#ifndef RECVMMSG_BATCH /* see check_bufsize() */ +#define RECVMMSG_BATCH 1 +#endif + /** Freelist of available mempools. */ -typedef array_t(void *) mp_freelist_t; +typedef array_t(struct mempool *) mp_freelist_t; /** List of query resolution tasks. */ typedef array_t(struct qr_task *) qr_tasklist_t; -/** Session list. */ -typedef array_t(struct session *) qr_sessionlist_t; - /** \details Worker state is meant to persist during the whole life of daemon. */ struct worker_ctx { struct engine *engine; @@ -123,11 +137,8 @@ struct worker_ctx { struct sockaddr_in out_addr4; struct sockaddr_in6 out_addr6; -#if __linux__ uint8_t wire_buf[RECVMMSG_BATCH * KNOT_WIRE_MAX_PKTSIZE]; -#else - uint8_t wire_buf[KNOT_WIRE_MAX_PKTSIZE]; -#endif + struct { size_t concurrent; size_t rconcurrent; @@ -151,35 +162,9 @@ struct worker_ctx { /** Subrequest leaders (struct qr_task*), indexed by qname+qtype+qclass. */ trie_t *subreq_out; mp_freelist_t pool_mp; - mp_freelist_t pool_ioreqs; - mp_freelist_t pool_sessions; - mp_freelist_t pool_iohandles; knot_mm_t pkt_pool; + unsigned int next_request_uid; }; -/* @internal Union of some libuv handles for freelist. - * These have session as their `handle->data` and own it. - * Subset of uv_any_handle. */ -union uv_handles { - uv_handle_t handle; - uv_stream_t stream; - uv_udp_t udp; - uv_tcp_t tcp; - uv_timer_t timer; -}; -typedef union uv_any_handle uv_handles_t; - -/* @internal Union of derivatives from uv_req_t libuv request handles for freelist. - * These have only a reference to the task they're operating on. - * Subset of uv_any_req. 
*/ -union uv_reqs { - uv_req_t req; - uv_shutdown_t sdown; - uv_write_t write; - uv_connect_t connect; - uv_udp_send_t send; -}; -typedef union uv_reqs uv_reqs_t; - /** @endcond */ diff --git a/lib/cache/api.c b/lib/cache/api.c index df9b439152fdcb8f7085aada6258a98eef4112b1..d6414ebd2b118e980fc98e7c164c5b50ee6f6446 100644 --- a/lib/cache/api.c +++ b/lib/cache/api.c @@ -614,7 +614,8 @@ static int stash_rrarray_entry(ranked_rr_array_t *arr, int arr_i, ssize_t written = stash_rrset(cache, qry, rr, rr_sigs, qry->timestamp.tv_sec, entry->rank, nsec_pmap, has_optout); if (written < 0) { - kr_log_error("[%5hu][cach] stash failed, ret = %d\n", qry->id, ret); + kr_log_error("[%05u.%02u][cach] stash failed, ret = %d\n", qry->request->uid, + qry->uid, ret); return (int) written; } diff --git a/lib/defines.h b/lib/defines.h index 6595588376a651e93b40b80ce4f4bacf05040cec..84da059e3cbcdca87ab141f092265e53c62d1e6f 100644 --- a/lib/defines.h +++ b/lib/defines.h @@ -91,8 +91,12 @@ void __asan_poison_memory_region(void const volatile *addr, size_t size); void __asan_unpoison_memory_region(void const volatile *addr, size_t size); #define kr_asan_poison(addr, size) __asan_poison_memory_region((addr), (size)) #define kr_asan_unpoison(addr, size) __asan_unpoison_memory_region((addr), (size)) +#define kr_asan_custom_poison(fn, addr) fn ##_poison((addr)) +#define kr_asan_custom_unpoison(fn, addr) fn ##_unpoison((addr)) #else #define kr_asan_poison(addr, size) #define kr_asan_unpoison(addr, size) +#define kr_asan_custom_poison(fn, addr) +#define kr_asan_custom_unpoison(fn, addr) #endif /* @endcond */ diff --git a/lib/generic/README.rst b/lib/generic/README.rst index bd63e274f137360ca6fcf0c653436350e7fe9693..7adff863a423194d5de0a5e46a2c23bb3a9cb1cd 100644 --- a/lib/generic/README.rst +++ b/lib/generic/README.rst @@ -7,6 +7,7 @@ doesn't allow custom allocation scheme. BSD-licensed (or compatible) code is all as long as it comes with a test case in `tests/test_generics.c`. 
* array_ - a set of simple macros to make working with dynamic arrays easier. +* queue_ - a FIFO + LIFO queue. * map_ - a `Crit-bit tree`_ key-value map implementation (public domain) that comes with tests. * set_ - set abstraction implemented on top of ``map`` (unused now). * pack_ - length-prefixed list of objects (i.e. array-list). @@ -19,6 +20,12 @@ array .. doxygenfile:: array.h :project: libkres +queue +~~~~~ + +.. doxygenfile:: queue.h + :project: libkres + map ~~~ diff --git a/lib/generic/lru.c b/lib/generic/lru.c index c04f6f09fc49715fdbbe3fb9a2feb5407f191e1e..7ab31ae9b8e082b550d00ff32e260f0430b7a38f 100644 --- a/lib/generic/lru.c +++ b/lib/generic/lru.c @@ -21,7 +21,15 @@ typedef struct lru_group lru_group_t; struct lru_item { uint16_t key_len, val_len; /**< Two bytes should be enough for our purposes. */ - char data[]; /**< Place for both key and value. */ + char data[]; + /**< Place for both key and value. + * + * We use "char" to satisfy the C99+ aliasing rules. + * See C99 section 6.5 Expressions, paragraph 7. + * Any type can be accessed through char-pointer, + * so we can use a common struct definition + * for all types being held. + */ }; /** @internal Compute offset of value in struct lru_item. */ diff --git a/lib/generic/lru.h b/lib/generic/lru.h index 397e9bb4159ee83d6f7b5e74684fe0818373b8a4..61f3ce085866dada7dfcdc9aebc0bfaf160202a8 100644 --- a/lib/generic/lru.h +++ b/lib/generic/lru.h @@ -24,32 +24,31 @@ * most frequent keys/hashes. This tracking is done for *more* keys than * those that are actually stored. 
* - * # Example usage: - * + * Example usage: * @code{.c} * // Define new LRU type * typedef lru_t(int) lru_int_t; * * // Create LRU * lru_int_t *lru; - * lru_create(&lru, 5, NULL); + * lru_create(&lru, 5, NULL, NULL); * * // Insert some values - * int *pi = lru_get_new(lru, "luke", strlen("luke")); + * int *pi = lru_get_new(lru, "luke", strlen("luke"), NULL); * if (pi) * *pi = 42; - * pi = lru_get_new(lru, "leia", strlen("leia")); + * pi = lru_get_new(lru, "leia", strlen("leia"), NULL); * if (pi) * *pi = 24; * * // Retrieve values - * int *ret = lru_get_try(lru, "luke", strlen("luke")); + * int *ret = lru_get_try(lru, "luke", strlen("luke"), NULL); * if (!ret) printf("luke dropped out!\n"); * else printf("luke's number is %d\n", *ret); * * char *enemies[] = {"goro", "raiden", "subzero", "scorpion"}; * for (int i = 0; i < 4; ++i) { - * int *val = lru_get_new(lru, enemies[i], strlen(enemies[i])); + * int *val = lru_get_new(lru, enemies[i], strlen(enemies[i]), NULL); * if (val) * *val = i; * } diff --git a/lib/generic/queue.c b/lib/generic/queue.c new file mode 100644 index 0000000000000000000000000000000000000000..7bda790158ce8af86b479d3e380d025e8509ed62 --- /dev/null +++ b/lib/generic/queue.c @@ -0,0 +1,124 @@ +/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "lib/generic/queue.h" +#include <string.h> + +KR_EXPORT void queue_init_impl(struct queue *q, size_t item_size) +{ + q->len = 0; + q->item_size = item_size; + q->head = q->tail = NULL; + /* Take 128 B (two x86 cache lines), except a small margin + * that the allocator can use for its overhead. + * Normally (64-bit pointers) this means 16 B header + 13*8 B data. */ + q->chunk_cap = ( ((ssize_t)128) - offsetof(struct queue_chunk, data) + - sizeof(size_t) + ) / item_size; + if (!q->chunk_cap) q->chunk_cap = 1; /* item_size big enough by itself */ +} + +KR_EXPORT void queue_deinit_impl(struct queue *q) +{ + assert(q); + struct queue_chunk *p = q->head; + while (p != NULL) { + struct queue_chunk *pf = p; + p = p->next; + free(pf); + } +#ifndef NDEBUG + memset(q, 0, sizeof(*q)); +#endif +} + +static struct queue_chunk * queue_chunk_new(const struct queue *q) +{ + struct queue_chunk *c = malloc(offsetof(struct queue_chunk, data) + + q->chunk_cap * q->item_size); + if (unlikely(!c)) abort(); // simplify stuff + memset(c, 0, offsetof(struct queue_chunk, data)); + c->cap = q->chunk_cap; + /* ->begin and ->end are zero, i.e. we optimize for _push + * and not _push_head, by default. */ + return c; +} + +/* Return pointer to the space for the new element. */ +KR_EXPORT void * queue_push_impl(struct queue *q) +{ + assert(q); + struct queue_chunk *t = q->tail; // shorthand + if (unlikely(!t)) { + assert(!q->head && !q->len); + q->head = q->tail = t = queue_chunk_new(q); + } else + if (t->end == t->cap) { + if (t->begin * 2 >= t->cap) { + /* Utilization is below 50%, so let's shift (no overlap). */ + memcpy(t->data, t->data + t->begin * q->item_size, + (t->end - t->begin) * q->item_size); + t->end -= t->begin; + t->begin = 0; + } else { + /* Let's grow the tail by another chunk. 
*/ + assert(!t->next); + t->next = queue_chunk_new(q); + t = q->tail = t->next; + } + } + assert(t->end < t->cap); + ++(q->len); + ++(t->end); + return t->data + q->item_size * (t->end - 1); +} + +/* Return pointer to the space for the new element. */ +KR_EXPORT void * queue_push_head_impl(struct queue *q) +{ + /* When we have choice, we optimize for further _push_head, + * i.e. when shifting or allocating a chunk, + * we store items on the tail-end of the chunk. */ + assert(q); + struct queue_chunk *h = q->head; // shorthand + if (unlikely(!h)) { + assert(!q->tail && !q->len); + h = q->head = q->tail = queue_chunk_new(q); + h->begin = h->end = h->cap; + } else + if (h->begin == 0) { + if (h->end * 2 <= h->cap) { + /* Utilization is below 50%, so let's shift (no overlap). + * Computations here are simplified due to h->begin == 0. */ + const int cnt = h->end; + memcpy(h->data + (h->cap - cnt) * q->item_size, h->data, + cnt * q->item_size); + h->begin = h->cap - cnt; + h->end = h->cap; + } else { + /* Let's grow the head by another chunk. */ + h = queue_chunk_new(q); + h->next = q->head; + q->head = h; + h->begin = h->end = h->cap; + } + } + assert(h->begin > 0); + --(h->begin); + ++(q->len); + return h->data + q->item_size * h->begin; +} + diff --git a/lib/generic/queue.h b/lib/generic/queue.h new file mode 100644 index 0000000000000000000000000000000000000000..1f09cc0742cde49f79210876a10e6a6eeaa99873 --- /dev/null +++ b/lib/generic/queue.h @@ -0,0 +1,257 @@ +/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +/** + * @file queue.h + * @brief A queue, usable for FIFO and LIFO simultaneously. + * + * Both the head and tail of the queue can be accessed and pushed to, + * but only the head can be popped from. + * + * @note The implementation uses a singly linked list of blocks + * where each block stores an array of values (for better efficiency). + * + * Example usage: + * @code{.c} + // define new queue type, and init a new queue instance + typedef queue_t(int) queue_int_t; + queue_int_t q; + queue_init(q); + // do some operations + queue_push(q, 1); + queue_push(q, 2); + queue_push(q, 3); + queue_push(q, 4); + queue_pop(q); + assert(queue_head(q) == 2); + assert(queue_tail(q) == 4); + + // you may iterate + typedef queue_it_t(int) queue_it_int_t; + for (queue_it_int_t it = queue_it_begin(q); !queue_it_finished(it); + queue_it_next(it)) { + ++queue_it_val(it); + } + assert(queue_tail(q) == 5); + + queue_push_head(q, 0); + ++queue_tail(q); + assert(queue_tail(q) == 6); + // free it up + queue_deinit(q); + + // you may use dynamic allocation for the type itself + queue_int_t *qm = malloc(sizeof(queue_int_t)); + queue_init(*qm); + queue_deinit(*qm); + free(qm); + * @endcode + * + * \addtogroup generics + * @{ + */ + +#pragma once + +#include "lib/defines.h" +#include "contrib/ucw/lib.h" +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> + +/** @brief The type for queue, parametrized by value type. */ +#define queue_t(type) \ + union { \ + type *pdata_t; /* only the *type* information is used */ \ + struct queue queue; \ + } + +/** @brief Initialize a queue. You can malloc() it the usual way. 
*/ +#define queue_init(q) do { \ + (void)(((__typeof__(((q).pdata_t)))0) == (void *)0); /* typecheck queue_t */ \ + queue_init_impl(&(q).queue, sizeof(*(q).pdata_t)); \ + } while (false) + +/** @brief De-initialize a queue: make it invalid and free any inner allocations. */ +#define queue_deinit(q) \ + queue_deinit_impl(&(q).queue) + +/** @brief Push data to queue's tail. (Type-safe version; use _impl() otherwise.) */ +#define queue_push(q, data) \ + *((__typeof__((q).pdata_t)) queue_push_impl(&(q).queue)) = data + +/** @brief Push data to queue's head. (Type-safe version; use _impl() otherwise.) */ +#define queue_push_head(q, data) \ + *((__typeof__((q).pdata_t)) queue_push_head_impl(&(q).queue)) = data + +/** @brief Remove the element at the head. */ +#define queue_pop(q) \ + queue_pop_impl(&(q).queue) + +/** @brief Return a "reference" to the element at the head (it's an L-value) . */ +#define queue_head(q) \ + ( *(__typeof__((q).pdata_t)) queue_head_impl(&(q).queue) ) + +/** @brief Return a "reference" to the element at the tail (it's an L-value) . */ +#define queue_tail(q) \ + ( *(__typeof__((q).pdata_t)) queue_tail_impl(&(q).queue) ) + +/** @brief Return the number of elements in the queue. */ +#define queue_len(q) \ + ((const size_t)(q).queue.len) + + +/** @brief Type for queue iterator, parametrized by value type. + * It's a simple structure that owns no other resources. + * You may NOT use it after doing any push or pop (without _begin again). */ +#define queue_it_t(type) \ + union { \ + type *pdata_t; /* only the *type* information is used */ \ + struct queue_it iter; \ + } + +/** @brief Initialize a queue iterator at the head of the queue. + * If you use this in assignment (instead of initialization), + * you will unfortunately need to add corresponding type-cast in front. + * Beware: there's no type-check between queue and iterator! 
*/ +#define queue_it_begin(q) \ + { .iter = queue_it_begin_impl(&(q).queue) } + +/** @brief Return a "reference" to the current element (it's an L-value) . */ +#define queue_it_val(it) \ + ( *(__typeof__((it).pdata_t)) queue_it_val_impl(&(it).iter) ) + +/** @brief Test if the iterator has gone past the last element. + * If it has, you may not use _val or _next. */ +#define queue_it_finished(it) \ + queue_it_finished_impl(&(it).iter) + +/** @brief Advance the iterator to the next element. */ +#define queue_it_next(it) \ + queue_it_next_impl(&(it).iter) + + + +/* ====================== Internal for the implementation ================== */ +/** @cond internal */ + +struct queue; +/* Non-inline functions are exported to be usable from daemon. */ +void queue_init_impl(struct queue *q, size_t item_size); +void queue_deinit_impl(struct queue *q); +void * queue_push_impl(struct queue *q); +void * queue_push_head_impl(struct queue *q); + +struct queue_chunk; +struct queue { + size_t len; + uint16_t chunk_cap, item_size; + struct queue_chunk *head, *tail; +}; + +struct queue_chunk { + struct queue_chunk *next; /*< head -> ... -> tail */ + int16_t begin, end, cap, pad_; /*< indices: zero is closest to head */ + /*< We could fit into uint8_t for example, but the choice of (3+1)*2 bytes + * is a compromise between wasting space and getting a good alignment. + * In particular, queue_t(type*) will store the pointers on addresses + * aligned to the pointer size, in both 64-bit and 32-bit platforms. + */ + char data[]; + /**< The item data. We use "char" to satisfy the C99+ aliasing rules. + * See C99 section 6.5 Expressions, paragraph 7. + * Any type can be accessed through char-pointer, + * so we can use a common struct definition + * for all types being held. 
+ */ +}; + +static inline void * queue_head_impl(const struct queue *q) +{ + assert(q); + struct queue_chunk *h = q->head; + if (unlikely(!h)) + return NULL; + assert(h->end > h->begin); + return h->data + h->begin * q->item_size; +} + +static inline void * queue_tail_impl(const struct queue *q) +{ + assert(q); + struct queue_chunk *t = q->tail; + if (unlikely(!t)) + return NULL; + assert(t->end > t->begin); + return t->data + (t->end - 1) * q->item_size; +} + +static inline void queue_pop_impl(struct queue *q) +{ + assert(q); + struct queue_chunk *h = q->head; + assert(h && h->end > h->begin); + if (h->end - h->begin == 1) { + /* removing the last element in the chunk */ + q->head = h->next; + free(h); + } else { + ++(h->begin); + } + --(q->len); +} + + +struct queue_it { + struct queue_chunk *chunk; + int16_t pos, item_size; +}; + +static inline struct queue_it queue_it_begin_impl(struct queue *q) +{ + assert(q); + return (struct queue_it){ + .chunk = q->head, + .pos = q->head ? q->head->begin : -1, + .item_size = q->item_size, + }; +} + +static inline bool queue_it_finished_impl(struct queue_it *it) +{ + return it->chunk == NULL || it->pos >= it->chunk->end; +} + +static inline void * queue_it_val_impl(struct queue_it *it) +{ + assert(!queue_it_finished_impl(it)); + return it->chunk->data + it->pos * it->item_size; +} + +static inline void queue_it_next_impl(struct queue_it *it) +{ + assert(!queue_it_finished_impl(it)); + ++(it->pos); + if (it->pos < it->chunk->end) + return; + it->chunk = it->chunk->next; + it->pos = it->chunk ? it->chunk->begin : -1; +} + +/** @endcond (internal) */ +/** @} (addtogroup generics) */ + diff --git a/lib/generic/trie.c b/lib/generic/trie.c index b14b13c6e73b9c60e7d6567a23e6f0e326a454f7..9cf5033747c84c1b96603b99fc4c00f612190161 100644 --- a/lib/generic/trie.c +++ b/lib/generic/trie.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2016 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> +/* Copyright (C) 2016-2018 CZ.NIC, z.s.p.o. 
<knot-dns@labs.nic.cz> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -154,6 +154,13 @@ static uint bitmap_weight(bitmap_t w) return __builtin_popcount(w); } +/*! \brief Only keep the lowest bit in the bitmap (least significant -> twigs[0]). */ +static bitmap_t bitmap_lowest_bit(bitmap_t w) +{ + assert((w & ~((1 << 17) - 1)) == 0); // using the least-important 17 bits + return 1 << __builtin_ctz(w); +} + /*! \brief Test flags to determine type of this node. */ static bool isbranch(const node_t *t) { @@ -171,7 +178,7 @@ static bitmap_t nibbit(byte k, uint flags) } /*! \brief Extract a nibble from a key and turn it into a bitmask. */ -static bitmap_t twigbit(node_t *t, const char *key, uint32_t len) +static bitmap_t twigbit(const node_t *t, const char *key, uint32_t len) { assert(isbranch(t)); uint i = t->branch.index; @@ -183,14 +190,14 @@ static bitmap_t twigbit(node_t *t, const char *key, uint32_t len) } /*! \brief Test if a branch node has a child indicated by a bitmask. */ -static bool hastwig(node_t *t, bitmap_t bit) +static bool hastwig(const node_t *t, bitmap_t bit) { assert(isbranch(t)); return t->branch.bitmap & bit; } /*! \brief Compute offset of an existing child in a branch node. */ -static uint twigoff(node_t *t, bitmap_t b) +static uint twigoff(const node_t *t, bitmap_t b) { assert(isbranch(t)); return bitmap_weight(t->branch.bitmap & (b - 1)); @@ -285,64 +292,108 @@ size_t trie_weight(const trie_t *tbl) return tbl->weight; } -trie_val_t* trie_get_try(trie_t *tbl, const char *key, uint32_t len) +struct found { + leaf_t *l; /**< the found leaf (NULL if not found) */ + branch_t *p; /**< the leaf's parent (if exists) */ + bitmap_t b; /**< bit-mask with a single bit marking l under p */ +}; +/** Search trie for an item with the given key (equality only). 
*/ +static struct found find_equal(trie_t *tbl, const char *key, uint32_t len) { assert(tbl); + struct found ret0; + memset(&ret0, 0, sizeof(ret0)); if (!tbl->weight) - return NULL; + return ret0; + /* Current node and parent while descending (returned values basically). */ node_t *t = &tbl->root; + branch_t *p = NULL; + bitmap_t b = 0; while (isbranch(t)) { __builtin_prefetch(t->branch.twigs); - bitmap_t b = twigbit(t, key, len); + b = twigbit(t, key, len); if (!hastwig(t, b)) - return NULL; + return ret0; + p = &t->branch; t = twig(t, twigoff(t, b)); } if (key_cmp(key, len, t->leaf.key->chars, t->leaf.key->len) != 0) - return NULL; - return &t->leaf.val; + return ret0; + return (struct found) { + .l = &t->leaf, + .p = p, + .b = b, + }; } - -int trie_del(trie_t *tbl, const char *key, uint32_t len, trie_val_t *val) +/** Find item with the first key (lexicographical order). */ +static struct found find_first(trie_t *tbl) { assert(tbl); - if (!tbl->weight) - return KNOT_ENOENT; - node_t *t = &tbl->root; // current and parent node + if (!tbl->weight) { + struct found ret0; + memset(&ret0, 0, sizeof(ret0)); + return ret0; + } + /* Current node and parent while descending (returned values basically). */ + node_t *t = &tbl->root; branch_t *p = NULL; - bitmap_t b = 0; while (isbranch(t)) { - __builtin_prefetch(t->branch.twigs); - b = twigbit(t, key, len); - if (!hastwig(t, b)) - return KNOT_ENOENT; p = &t->branch; - t = twig(t, twigoff(t, b)); + t = &p->twigs[0]; } - if (key_cmp(key, len, t->leaf.key->chars, t->leaf.key->len) != 0) + return (struct found) { + .l = &t->leaf, + .p = p, + .b = p ? bitmap_lowest_bit(p->bitmap) : 0, + }; +} + +trie_val_t* trie_get_try(trie_t *tbl, const char *key, uint32_t len) +{ + struct found found = find_equal(tbl, key, len); + return found.l ? 
&found.l->val : NULL; +} + +trie_val_t* trie_get_first(trie_t *tbl, char **key, uint32_t *len) +{ + struct found found = find_first(tbl); + if (!found.l) + return NULL; + if (key) + *key = found.l->key->chars; + if (len) + *len = found.l->key->len; + return &found.l->val; +} + +/** Delete the found element (if any) and return value (unless NULL is passed) */ +static int del_found(trie_t *tbl, struct found found, trie_val_t *val) +{ + if (!found.l) return KNOT_ENOENT; - mm_free(&tbl->mm, t->leaf.key); + mm_free(&tbl->mm, found.l->key); if (val != NULL) - *val = t->leaf.val; // we return trie_val_t directly when deleting + *val = found.l->val; // we return trie_val_t directly when deleting --tbl->weight; + branch_t * const p = found.p; // short-hand if (unlikely(!p)) { // whole trie was a single leaf assert(tbl->weight == 0); empty_root(&tbl->root); return KNOT_EOK; } - // remove leaf t as child of p - int ci = t - p->twigs, // child index via pointer arithmetic + // remove leaf t as child of p; get child index via pointer arithmetic + int ci = ((union node *)found.l) - p->twigs, cc = bitmap_weight(p->bitmap); // child count assert(ci >= 0 && ci < cc); if (cc == 2) { // collapse binary node p: move the other child to this node node_t *twigs = p->twigs; - (*(node_t *)p) = twigs[1 - ci]; // it might be a leaf or branch + (*(union node *)p) = twigs[1 - ci]; // it might be a leaf or branch mm_free(&tbl->mm, twigs); return KNOT_EOK; } memmove(p->twigs + ci, p->twigs + ci + 1, sizeof(node_t) * (cc - ci - 1)); - p->bitmap &= ~b; + p->bitmap &= ~found.b; node_t *twigs = mm_realloc(&tbl->mm, p->twigs, sizeof(node_t) * (cc - 1), sizeof(node_t) * cc); if (likely(twigs != NULL)) @@ -352,6 +403,30 @@ int trie_del(trie_t *tbl, const char *key, uint32_t len, trie_val_t *val) return KNOT_EOK; } +int trie_del(trie_t *tbl, const char *key, uint32_t len, trie_val_t *val) +{ + struct found found = find_equal(tbl, key, len); + return del_found(tbl, found, val); +} + +int 
trie_del_first(trie_t *tbl, char *key, uint32_t *len, trie_val_t *val) +{ + struct found found = find_first(tbl); + if (!found.l) + return KNOT_ENOENT; + if (key) { + if (!len) + return KNOT_EINVAL; + if (*len < found.l->key->len) + return kr_error(ENOSPC); + memcpy(key, found.l->key->chars, found.l->key->len); + } + if (len) { // makes sense even with key == NULL + *len = found.l->key->len; + } + return del_found(tbl, found, val); +} + /*! * \brief Stack of nodes, storing a path down a trie. * @@ -434,6 +509,8 @@ static inline int ns_longer(nstack_t *ns) * \param info Set position of the point of first mismatch (in index and flags). * \param first Set the value of the first non-matching character (from trie), * optionally; end-of-string character has value -256 (that's why it's int). + * Note: the character is converted to *unsigned* char (i.e. 0..255), + * as that's the ordering used in the trie. * * \return KNOT_EOK or KNOT_ENOMEM. */ @@ -466,7 +543,7 @@ static int ns_find_branch(nstack_t *ns, const char *key, uint32_t len, } info->index = index; if (first) - *first = lkey->len > index ? lkey->chars[index] : -256; + *first = lkey->len > index ? (unsigned char)lkey->chars[index] : -256; // Find flags: which half-byte has matched. uint flags; if (index == len && len == lkey->len) { // found equivalent key @@ -622,7 +699,7 @@ int trie_get_leq(trie_t *tbl, const char *key, uint32_t len, trie_val_t **val) branch_t bp; int un_leaf; // first unmatched character in the leaf ERR_RETURN(ns_find_branch(ns, key, len, &bp, &un_leaf)); - int un_key = bp.index < len ? key[bp.index] : -256; + int un_key = bp.index < len ? 
(unsigned char)key[bp.index] : -256; node_t *t = ns->stack[ns->len - 1]; if (bp.flags == 0) { // found exact match *val = &t->leaf.val; diff --git a/lib/generic/trie.h b/lib/generic/trie.h index 0d3a76e990d741152e6ed17442163aaff7f85203..1591c1b4a10141d596dcde251f3ba2ff73d414f9 100644 --- a/lib/generic/trie.h +++ b/lib/generic/trie.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> +/* Copyright (C) 2017-2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -44,7 +44,7 @@ typedef struct trie trie_t; /*! \brief Opaque type for holding a QP-trie iterator. */ typedef struct trie_it trie_it_t; -/*! \brief Create a trie instance. */ +/*! \brief Create a trie instance. Pass NULL to use malloc+free. */ KR_EXPORT trie_t* trie_create(knot_mm_t *mm); @@ -64,6 +64,11 @@ size_t trie_weight(const trie_t *tbl); KR_EXPORT trie_val_t* trie_get_try(trie_t *tbl, const char *key, uint32_t len); +/*! + * \brief Return pointer to the minimum. Optionally with key and its length. */ +KR_EXPORT +trie_val_t* trie_get_first(trie_t *tbl, char **key, uint32_t *len); + /*! \brief Search the trie, inserting NULL trie_val_t on failure. */ KR_EXPORT trie_val_t* trie_get_ins(trie_t *tbl, const char *key, uint32_t len); @@ -78,6 +83,7 @@ trie_val_t* trie_get_ins(trie_t *tbl, const char *key, uint32_t len); * \return KNOT_EOK for exact match, 1 for previous, KNOT_ENOENT for not-found, * or KNOT_E*. */ +KR_EXPORT int trie_get_leq(trie_t *tbl, const char *key, uint32_t len, trie_val_t **val); /*! @@ -96,6 +102,16 @@ int trie_apply(trie_t *tbl, int (*f)(trie_val_t *, void *), void *d); KR_EXPORT int trie_del(trie_t *tbl, const char *key, uint32_t len, trie_val_t *val); +/*! + * \brief Remove the first item, returning KNOT_EOK on success. + * + * You may optionally get the key and/or value. 
+ * The key is copied, so you need to pass sufficient len, + * otherwise kr_error(ENOSPC) is returned. + */ +KR_EXPORT +int trie_del_first(trie_t *tbl, char *key, uint32_t *len, trie_val_t *val); + /*! \brief Create a new iterator pointing to the first element (if any). */ KR_EXPORT trie_it_t* trie_it_begin(trie_t *tbl); diff --git a/lib/layer.h b/lib/layer.h index 227ff99aabd7a56feeff34448fbf1d6c25278260..d7a7de2aa06d1b2158cb05c512cfa978aa6606d5 100644 --- a/lib/layer.h +++ b/lib/layer.h @@ -27,9 +27,10 @@ kr_log_trace(q, cls, fmt, ## __VA_ARGS__); \ } else WITH_VERBOSE(q) { \ unsigned _ind = 0; \ - uint16_t _id = q ? q->id : 0; \ + uint32_t _q_uid = q ? q->uid : 0; \ + uint32_t _req_uid = q && q->request ? q->request->uid : 0; \ for (; q; q = q->parent, _ind += 2); \ - kr_log_verbose("[%5hu][%s] %*s" fmt, _id, cls, _ind, "", ## __VA_ARGS__); \ + kr_log_verbose("[%05u.%02u][%s] %*s" fmt, _req_uid, _q_uid, cls, _ind, "", ## __VA_ARGS__); \ } \ } #else diff --git a/lib/layer/iterate.c b/lib/layer/iterate.c index e094dbb6181241b9c7dbebb3988fd499426dccd0..9d06969e27c47f44032587096f6b2823075e308d 100644 --- a/lib/layer/iterate.c +++ b/lib/layer/iterate.c @@ -910,15 +910,18 @@ int kr_make_query(struct kr_query *query, knot_pkt_t *pkt) /* Query built, expect answer. */ uint32_t rnd = kr_rand_uint(0); + /* We must respect https://tools.ietf.org/html/rfc7766#section-6.2.1 + * - When sending multiple queries over a TCP connection, clients MUST NOT + * reuse the DNS Message ID of an in-flight query on that connection. + * + * So, if query is going to be sent over TCP connection + * this id can be changed to avoid duplication with query that already was sent + * but didn't receive answer yet. 
+ */ query->id = rnd ^ (rnd >> 16); /* cheap way to strengthen unpredictability */ knot_wire_set_id(pkt->wire, query->id); pkt->parsed = pkt->size; - WITH_VERBOSE(query) { - KR_DNAME_GET_STR(name_str, query->sname); - KR_RRTYPE_GET_STR(type_str, query->stype); - QVERBOSE_MSG(query, "'%s' type '%s' id was assigned, parent id %u\n", - name_str, type_str, query->parent ? query->parent->id : 0); - } + return kr_ok(); } @@ -937,6 +940,14 @@ static int prepare_query(kr_layer_t *ctx, knot_pkt_t *pkt) return KR_STATE_FAIL; } + WITH_VERBOSE(query) { + KR_DNAME_GET_STR(name_str, query->sname); + KR_RRTYPE_GET_STR(type_str, query->stype); + QVERBOSE_MSG(query, "'%s' type '%s' new uid was assigned .%02u, parent uid .%02u\n", + name_str, type_str, req->rplan.next_uid, + query->parent ? query->parent->uid : 0); + } + query->uid = req->rplan.next_uid; req->rplan.next_uid += 1; diff --git a/lib/lib.mk b/lib/lib.mk index d476779cd34879038d2f5118abd7f917e0944521..8ac4e91b95dfb0d7b8c6dacf8480b5ab2cfe333f 100644 --- a/lib/lib.mk +++ b/lib/lib.mk @@ -15,6 +15,7 @@ libkres_SOURCES := \ lib/dnssec/ta.c \ lib/generic/lru.c \ lib/generic/map.c \ + lib/generic/queue.c \ lib/generic/trie.c \ lib/layer/cache.c \ lib/layer/iterate.c \ @@ -41,6 +42,7 @@ libkres_HEADERS := \ lib/generic/lru.h \ lib/generic/map.h \ lib/generic/pack.h \ + lib/generic/queue.h \ lib/generic/trie.h \ lib/layer.h \ lib/layer/iterate.h \ diff --git a/lib/resolve.c b/lib/resolve.c index 7e027d3a6f9123adcfbe329ae8ff2d6d7b252aab..fd067520e91399a003f70555a33da1187205a282 100644 --- a/lib/resolve.c +++ b/lib/resolve.c @@ -668,7 +668,7 @@ static int answer_finalize(struct kr_request *request, int state) /* No detailed analysis ATM, just _SECURE or not. * LATER: request->rank might better be computed in validator's finish phase. */ - VERBOSE_MSG(NULL, " AD: request%s classified as SECURE\n", secure ? "" : " NOT"); + VERBOSE_MSG(last, "AD: request%s classified as SECURE\n", secure ? "" : " NOT"); request->rank = secure ? 
KR_RANK_SECURE : KR_RANK_INITIAL; /* Clear AD if not secure. ATM answer has AD=1 if requested secured answer. */ @@ -1581,8 +1581,10 @@ int kr_resolve_checkout(struct kr_request *request, struct sockaddr *src, } inet_ntop(addr->sa_family, kr_inaddr(&qry->ns.addr[i].ip), ns_str, sizeof(ns_str)); VERBOSE_MSG(qry, - "=> querying: '%s' score: %u zone cut: '%s' qname: '%s' qtype: '%s' proto: '%s'\n", - ns_str, qry->ns.score, zonecut_str, qname_str, type_str, (qry->flags.TCP) ? "tcp" : "udp"); + "=> id: '%05u' querying: '%s' score: %u zone cut: '%s' " + "qname: '%s' qtype: '%s' proto: '%s'\n", + qry->id, ns_str, qry->ns.score, zonecut_str, + qname_str, type_str, (qry->flags.TCP) ? "tcp" : "udp"); break; }} diff --git a/lib/resolve.h b/lib/resolve.h index 1b8647ef5c913429cf014fb2edcaf304af5265a5..d1b6da04279596aa777202f999f374313b0ecbfd 100644 --- a/lib/resolve.h +++ b/lib/resolve.h @@ -227,6 +227,7 @@ struct kr_request { trace_callback_f trace_finish; /**< Request finish tracepoint */ int vars_ref; /**< Reference to per-request variable table. LUA_NOREF if not set. */ knot_mm_t pool; + unsigned int uid; /** for logging purposes only */ }; /** Initializer for an array of *_selected. */ diff --git a/lib/rplan.c b/lib/rplan.c index 51973d1f53ddd1db41c76283e9e87b6211308484..8c07a0d01a13cd98596c8cd3359b6b5ea2ebca4e 100644 --- a/lib/rplan.c +++ b/lib/rplan.c @@ -207,7 +207,8 @@ struct kr_query *kr_rplan_push_empty(struct kr_rplan *rplan, struct kr_query *pa } WITH_VERBOSE(qry) { - VERBOSE_MSG(qry, "plan '%s' type '%s'\n", "", ""); + VERBOSE_MSG(qry, "plan '%s' type '%s' uid [%05u.%02u]\n", "", "", + qry->request ? 
qry->request->uid : 0, qry->uid); } return qry; } @@ -230,7 +231,9 @@ struct kr_query *kr_rplan_push(struct kr_rplan *rplan, struct kr_query *parent, WITH_VERBOSE(qry) { KR_DNAME_GET_STR(name_str, name); KR_RRTYPE_GET_STR(type_str, type); - VERBOSE_MSG(parent, "plan '%s' type '%s'\n", name_str, type_str); + VERBOSE_MSG(parent, "plan '%s' type '%s' uid [%05u.%02u]\n", + name_str, type_str, + qry->request ? qry->request->uid : 0, qry->uid); } return qry; } diff --git a/tests/deckard b/tests/deckard index 5436af53cb393ce38519c87868e303d9993fd5dd..4e6e22ccef2e5c7688c38b447fed40c9f9b21359 160000 --- a/tests/deckard +++ b/tests/deckard @@ -1 +1 @@ -Subproject commit 5436af53cb393ce38519c87868e303d9993fd5dd +Subproject commit 4e6e22ccef2e5c7688c38b447fed40c9f9b21359 diff --git a/tests/test_queue.c b/tests/test_queue.c new file mode 100644 index 0000000000000000000000000000000000000000..9ed6fc98e80e70f1ca19d1b48240d47c4f40f6b5 --- /dev/null +++ b/tests/test_queue.c @@ -0,0 +1,75 @@ +/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#include "tests/test.h" +#include "lib/generic/queue.h" + +/* The main intention is to use queues with pointers, so we test the same-sized int. 
*/ +typedef queue_t(ptrdiff_t) queue_int_t; +typedef queue_it_t(int) queue_int_it_t; + +static void test_int(void **state_) +{ + queue_int_t q; + queue_init(q); + + queue_push_head(q, 2); + queue_push_head(q, 1); + queue_push_head(q, 0); + for (int i = 0; i < 100; ++i) { + assert_int_equal(queue_head(q), i); + queue_push(q, i + 3); + queue_pop(q); + } + assert_int_equal(queue_len(q), 3); + for (int i = 99; i > 0; --i) { + assert_int_equal(queue_head(q), i + 1); + queue_push_head(q, i); + } + assert_int_equal(queue_len(q), 3 + 99); + + /* Basic iterator test. */ + int i = 0; + for (queue_int_it_t it = queue_it_begin(q); !queue_it_finished(it); + queue_it_next(it)) { + ++queue_it_val(it); + ++i; + } + assert_int_equal(queue_len(q), i); + + queue_deinit(q); + queue_init(q); + + for (int i = 0; i < 100; ++i) { + queue_push(q, 2*i); + queue_push(q, 2*i + 1); + assert_int_equal(queue_head(q), i); + queue_pop(q); + } + + queue_deinit(q); +} + + +int main(void) +{ + const UnitTest tests[] = { + unit_test(test_int), + }; + + return run_tests(tests); +} + diff --git a/tests/test_trie.c b/tests/test_trie.c new file mode 100644 index 0000000000000000000000000000000000000000..a029153240cea0d22810a88c113fa077b3f827b7 --- /dev/null +++ b/tests/test_trie.c @@ -0,0 +1,165 @@ +/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */ + +#include "lib/generic/trie.h" +#include "tests/test.h" + +static const char *dict[] = { + "catagmatic", "prevaricator", "statoscope", "workhand", "benzamide", + "work", "workhands", // have some keys that are prefixes of each other + "alluvia", "fanciful", "bladish", "Tarsius", "unfast", "appropriative", + "seraphically", "monkeypod", "deflectometer", "tanglesome", "zodiacal", + "physiologically", "economizer", "forcepslike", "betrumpet", + "Danization", "broadthroat", "randir", "usherette", "nephropyosis", + "hematocyanin", "chrysohermidin", "uncave", "mirksome", "podophyllum", + "siphonognathous", "indoor", "featheriness", "forwardation", + "archruler", "soricoid", "Dailamite", "carmoisin", "controllability", + "unpragmatical", "childless", "transumpt", "productive", + "thyreotoxicosis", "oversorrow", "disshadow", "osse", "roar", + "pantomnesia", "talcer", "hydrorrhoea", "Satyridae", "undetesting", + "smoothbored", "widower", "sivathere", "pendle", "saltation", + "autopelagic", "campfight", "unexplained", "Macrorhamphosus", + "absconsa", "counterflory", "interdependent", "triact", "reconcentration", + "oversharpness", "sarcoenchondroma", "superstimulate", "assessory", + "pseudepiscopacy", "telescopically", "ventriloque", "politicaster", + "Caesalpiniaceae", "inopportunity", "Helion", "uncompatible", + "cephaloclasia", "oversearch", "Mahayanistic", "quarterspace", + "bacillogenic", "hamartite", "polytheistical", "unescapableness", + "Pterophorus", "cradlemaking", "Hippoboscidae", "overindustrialize", + "perishless", "cupidity", "semilichen", "gadge", "detrimental", + "misencourage", "toparchia", "lurchingly", "apocatastasis" +}; +#define KEY_LEN(x) (strlen(x) + 1) +static const int dict_size = sizeof(dict) / sizeof(const char *); + +static void test_init(void **state) +{ + trie_t *t = trie_create(NULL); + assert_non_null(t); + *state = t; +} + +static void test_insert(void **state) +{ + trie_t *t = *state; + + for (int i = 0; i < dict_size; ++i) { + 
trie_val_t *data = trie_get_ins(t, dict[i], KEY_LEN(dict[i])); + assert_non_null(data); + assert_null(*data); + *data = NULL + (ptrdiff_t)i; // yes, ugly + assert_ptr_equal(trie_get_try(t, dict[i], KEY_LEN(dict[i])), data); + } + assert_int_equal(trie_weight(t), dict_size); +} + +static void test_missing(void **state) +{ + trie_t *t = *state; + const char *notin = "p"; + assert_null(trie_get_try(t, notin, KEY_LEN(notin))); +} + +static int cmpstringp(const void *p1, const void *p2) +{ + return strcmp(* (char * const *) p1, * (char * const *) p2); +} + +static void test_iter(void **state) +{ + // prepare sorted dictionary + char *dict_sorted[dict_size]; + memcpy(dict_sorted, dict, sizeof(dict)); + qsort(dict_sorted, dict_size, sizeof(dict[0]), cmpstringp); + + // iterate and check the order is consistent + trie_t *t = *state; + trie_it_t *it = trie_it_begin(t); + for (int i = 0; i < dict_size; ++i, trie_it_next(it)) { + assert_false(trie_it_finished(it)); + size_t len; + const char *key = trie_it_key(it, &len); + assert_int_equal(KEY_LEN(key), len); + assert_string_equal(key, dict_sorted[i]); + assert_ptr_equal(dict[*trie_it_val(it) - NULL], dict_sorted[i]); + } + assert_true(trie_it_finished(it)); + trie_it_free(it); +} + +static void test_queue(void **state) +{ + trie_t *t = *state; + // remove all the elements in ascending order + for (int i = 0; i < dict_size; ++i) { + char *key; + uint32_t len; + trie_val_t *data = trie_get_first(t, &key, &len); + assert_non_null(key); + assert_int_equal(len, KEY_LEN(key)); + assert_non_null(data); + ptrdiff_t key_i = *data - NULL; + assert_string_equal(key, dict[key_i]); + + len = 30; + char key_buf[len]; + ptrdiff_t key_i_new; + int ret = trie_del_first(t, key_buf, &len, (trie_val_t *)&key_i_new); + assert_int_equal(ret, kr_ok()); + assert_int_equal(KEY_LEN(key_buf), len); + assert_int_equal(key_i, key_i_new); + assert_string_equal(dict[key_i], key_buf); + } +} + +static void test_leq_bug(void **state) +{ + /* We use 
different contents of the trie, + * so that the particular bug would've been triggered. */ + trie_t *t = trie_create(NULL); + char key = 'a'; + trie_get_ins(t, &key, sizeof(key)); + + key = 0xff; + trie_val_t *val; + int ret = trie_get_leq(t, &key, sizeof(key), &val); + assert_int_equal(ret, 1); + trie_free(t); +} + +static void test_deinit(void **state) +{ + trie_t *t = *state; + trie_free(t); + *state = NULL; +} + +/* Program entry point */ +int main(int argc, char **argv) +{ + const UnitTest tests[] = { + group_test_setup(test_init), + unit_test(test_insert), + unit_test(test_leq_bug), + unit_test(test_missing), + unit_test(test_iter), + unit_test(test_queue), + group_test_teardown(test_deinit) + }; + + return run_group_tests(tests); +} + diff --git a/tests/unit.mk b/tests/unit.mk index df3b878fa155157c96a1fbb1ba12dbef9655c478..6819fb0f78186aad1b8ca605b83df5bf6c8a7ed2 100644 --- a/tests/unit.mk +++ b/tests/unit.mk @@ -3,15 +3,17 @@ # tests_BIN := \ - test_set \ - test_map \ test_array \ - test_pack \ test_lru \ - test_utils \ + test_map \ test_module \ + test_pack \ + test_queue \ + test_rplan \ + test_set \ + test_trie \ + test_utils \ test_zonecut \ - test_rplan #test_cache TODO: re-consider how best to test cache mock_cmodule_CFLAGS := -fPIC