diff --git a/src/contrib/bpf/bpf_helpers.h b/src/contrib/bpf/bpf_helpers.h index b34ba5695e1ca2cafcd5f2d0247dbceb42bfade5..4f0976f6abddddb96fd8b0f3234e0d73ade6c7af 100644 --- a/src/contrib/bpf/bpf_helpers.h +++ b/src/contrib/bpf/bpf_helpers.h @@ -231,6 +231,7 @@ struct bpf_map_def { __attribute__ ((section(".maps." #name), used)) \ ____btf_map_##name = { } +/* static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = (void *) BPF_FUNC_skb_load_bytes; static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int len, __u32 start_header) = @@ -279,6 +280,7 @@ static unsigned int (*bpf_set_hash)(void *ctx, __u32 hash) = static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, unsigned long long flags) = (void *) BPF_FUNC_skb_adjust_room; +*/ /* Scan the ARCH passed in from ARCH env variable (see Makefile) */ #if defined(__TARGET_ARCH_x86) diff --git a/src/contrib/bpf/parsing_helpers.h b/src/contrib/bpf/parsing_helpers.h index 7bd2764f585db1455dfa03445621333b279ef7a2..e282d1dbe4005e5f07c3f0f5f6263424a90c7083 100644 --- a/src/contrib/bpf/parsing_helpers.h +++ b/src/contrib/bpf/parsing_helpers.h @@ -93,17 +93,20 @@ static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, /* Use loop unrolling to avoid the verifier restriction on loops; * support up to VLAN_MAX_DEPTH layers of VLAN encapsulation. */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunknown-pragmas" #pragma unroll for (i = 0; i < VLAN_MAX_DEPTH; i++) { if (!proto_is_vlan(h_proto)) break; - if (vlh + 1 > data_end) + if ((void *)(vlh + 1) > data_end) break; h_proto = vlh->h_vlan_encapsulated_proto; vlh++; } +#pragma GCC diagnostic pop nh->pos = vlh; return bpf_ntohs(h_proto); @@ -119,7 +122,7 @@ static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, * thing being pointed to. We will be using this style in the remainder * of the tutorial. */ - if (ip6h + 1 > data_end) + if ((void *)(ip6h + 1) > data_end) return -1; nh->pos = ip6h + 1; @@ -135,7 +138,7 @@ static __always_inline int parse_iphdr(struct hdr_cursor *nh, struct iphdr *iph = nh->pos; int hdrsize; - if (iph + 1 > data_end) + if ((void *)(iph + 1) > data_end) return -1; hdrsize = iph->ihl * 4; @@ -156,7 +159,7 @@ static __always_inline int parse_icmp6hdr(struct hdr_cursor *nh, { struct icmp6hdr *icmp6h = nh->pos; - if (icmp6h + 1 > data_end) + if ((void *)(icmp6h + 1) > data_end) return -1; nh->pos = icmp6h + 1; @@ -171,7 +174,7 @@ static __always_inline int parse_icmphdr(struct hdr_cursor *nh, { struct icmphdr *icmph = nh->pos; - if (icmph + 1 > data_end) + if ((void *)(icmph + 1) > data_end) return -1; nh->pos = icmph + 1; @@ -186,7 +189,7 @@ static __always_inline int parse_icmphdr_common(struct hdr_cursor *nh, { struct icmphdr_common *h = nh->pos; - if (h + 1 > data_end) + if ((void *)(h + 1) > data_end) return -1; nh->pos = h + 1; @@ -205,7 +208,7 @@ static __always_inline int parse_udphdr(struct hdr_cursor *nh, int len; struct udphdr *h = nh->pos; - if (h + 1 > data_end) + if ((void *)(h + 1) > data_end) return -1; nh->pos = h + 1; @@ -228,7 +231,7 @@ static __always_inline int parse_tcphdr(struct hdr_cursor *nh, int len; struct tcphdr *h = nh->pos; - if (h + 1 > data_end) + if ((void *)(h + 1) > data_end) return -1; len = h->doff * 4; diff --git a/src/knot/conf/base.c b/src/knot/conf/base.c index d724e68d21e4a3192bbf5c59719a78649f1e0c35..c016f992baef9be286c666d52c263779eab97870 100644 --- a/src/knot/conf/base.c +++ b/src/knot/conf/base.c @@ -182,6 +182,8 @@ static void init_cache( conf->cache.srv_udp_threads = running_udp_threads; + conf->cache.srv_xdp_threads = 1; // FIXME + conf->cache.srv_tcp_threads = running_tcp_threads; conf->cache.srv_bg_threads = running_bg_threads; diff --git a/src/knot/conf/base.h b/src/knot/conf/base.h index 0b1909981cf1dfa0b81e334549eeb0114dae8581..e33ba15d0f4513396d4a2c51ab45b967357fa8c4 100644 --- a/src/knot/conf/base.h +++ b/src/knot/conf/base.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2019 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> +/* Copyright (C) 2020 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -114,6 +114,7 @@ typedef struct { bool srv_tcp_reuseport; size_t srv_udp_threads; size_t srv_tcp_threads; + size_t srv_xdp_threads; size_t srv_bg_threads; size_t srv_tcp_max_clients; int ctl_timeout; diff --git a/src/knot/ctl/commands.c b/src/knot/ctl/commands.c index 0ffab8c4c76f38b625f01c3d1b803a4a30a96181..a20b4b2f473772aab39a1358f22a686e2c2adf63 100644 --- a/src/knot/ctl/commands.c +++ b/src/knot/ctl/commands.c @@ -1385,8 +1385,10 @@ static int server_status(ctl_args_t *args) int running_bkg_wrk, wrk_queue; worker_pool_status(args->server->workers, &running_bkg_wrk, &wrk_queue); ret = snprintf(buff, sizeof(buff), "UDP workers: %zu, TCP workers %zu, " + "XDP workers: %zu, " "background workers: %zu (running: %d, pending: %d)", conf()->cache.srv_udp_threads, conf()->cache.srv_tcp_threads, + conf()->cache.srv_xdp_threads, conf()->cache.srv_bg_threads, running_bkg_wrk, wrk_queue); } else if (strcasecmp(type, "configure") == 0) { ret = snprintf(buff, sizeof(buff), "%s", CONFIGURE_SUMMARY); diff --git a/src/knot/include/module.h b/src/knot/include/module.h index 14ef4248682cd4dd49527165b8ef4e7e3903390d..4d02727d97b5c90b201056bf6afd1c27e131ae8b 100644 --- a/src/knot/include/module.h +++ b/src/knot/include/module.h @@ -277,6 +277,7 @@ typedef enum { KNOTD_CONF_ENV_HOSTNAME = 1, /*!< Current hostname. */ KNOTD_CONF_ENV_WORKERS_UDP = 2, /*!< Current number of UDP workers. */ KNOTD_CONF_ENV_WORKERS_TCP = 3, /*!< Current number of TCP workers. */ + KNOTD_CONF_ENV_WORKERS_XDP = 4, /*!< Current number of UDP-over-XDP workers. */ } knotd_conf_env_t; /*! diff --git a/src/knot/modules/dnstap/dnstap.c b/src/knot/modules/dnstap/dnstap.c index 8c9120234befa45a3fa6598321842db17bdfb003..637e6cd5a39403a995b0513c4d03c0645f852146 100644 --- a/src/knot/modules/dnstap/dnstap.c +++ b/src/knot/modules/dnstap/dnstap.c @@ -273,8 +273,9 @@ int dnstap_load(knotd_mod_t *mod) /* Initialize queues. */ knotd_conf_t udp = knotd_conf_env(mod, KNOTD_CONF_ENV_WORKERS_UDP); + knotd_conf_t xdp = knotd_conf_env(mod, KNOTD_CONF_ENV_WORKERS_XDP); knotd_conf_t tcp = knotd_conf_env(mod, KNOTD_CONF_ENV_WORKERS_TCP); - size_t qcount = udp.single.integer + tcp.single.integer; + size_t qcount = udp.single.integer + tcp.single.integer + xdp.single.integer; fstrm_iothr_options_set_num_input_queues(opt, qcount); /* Create the I/O thread. */ diff --git a/src/knot/modules/noudp/noudp.c b/src/knot/modules/noudp/noudp.c index 4dc789cb5ce0655929a1e642c5f595cb393dd9c4..91a4924cd93de87041ad2aac4033b6b5b2ad7dab 100644 --- a/src/knot/modules/noudp/noudp.c +++ b/src/knot/modules/noudp/noudp.c @@ -60,6 +60,7 @@ int noudp_load(knotd_mod_t *mod) ctx->rate = conf.single.integer; if (ctx->rate > 0) { knotd_conf_t udp = knotd_conf_env(mod, KNOTD_CONF_ENV_WORKERS_UDP); + // TODO xdp ? size_t udp_workers = udp.single.integer; ctx->counters = calloc(udp_workers, sizeof(uint32_t)); if (ctx->counters == NULL) { diff --git a/src/knot/nameserver/query_module.c b/src/knot/nameserver/query_module.c index 6768e498a072bec9d753c5d165446ffc1a0c080f..b6c3c369ffa03b851088e98e480ff80b1667f959 100644 --- a/src/knot/nameserver/query_module.c +++ b/src/knot/nameserver/query_module.c @@ -345,6 +345,9 @@ knotd_conf_t knotd_conf_env(knotd_mod_t *mod, knotd_conf_env_t env) case KNOTD_CONF_ENV_WORKERS_UDP: out.single.integer = config->cache.srv_udp_threads; break; + case KNOTD_CONF_ENV_WORKERS_XDP: + out.single.integer = config->cache.srv_xdp_threads; + break; case KNOTD_CONF_ENV_WORKERS_TCP: out.single.integer = config->cache.srv_tcp_threads; break; diff --git a/src/knot/server/server.c b/src/knot/server/server.c index 81cc4f4512de9619ad3d2ef6ea2e3f74038c7fd8..340264faaea392667480efd2f8d3db09bc64ad07 100644 --- a/src/knot/server/server.c +++ b/src/knot/server/server.c @@ -21,6 +21,7 @@ #include <netinet/tcp.h> #include "libknot/errcode.h" +#include "libknot/xdp/af_xdp.h" #include "libknot/yparser/ypschema.h" #include "knot/common/log.h" #include "knot/common/stats.h" @@ -62,6 +63,10 @@ static void server_deinit_iface(iface_t *iface) free(iface->fd_udp); } + if (iface->fd_xdp > -1) { + knot_xsk_deinit(); + } + /* Free TCP handler. */ if (iface->fd_tcp != NULL) { for (int i = 0; i < iface->fd_tcp_count; i++) { @@ -203,7 +208,7 @@ static int enable_fastopen(int sock, int backlog) */ static iface_t *server_init_iface(struct sockaddr_storage *addr, int udp_thread_count, int tcp_thread_count, - bool tcp_reuseport) + bool tcp_reuseport, bool use_xdp) { iface_t *new_if = calloc(1, sizeof(*new_if)); if (new_if == NULL) { @@ -286,6 +291,17 @@ static iface_t *server_init_iface(struct sockaddr_storage *addr, new_if->fd_udp_count += 1; } + new_if->fd_xdp = -1; + if (use_xdp) { + int ret = knot_xsk_init("enp0s8", "/bpf-kernel.o", NULL); // FIXME + if (ret != KNOT_EOK) { + log_warning("failed to init XDP (%s)", knot_strerror(ret)); + } else { + new_if->fd_xdp = knot_xsk_get_poll_fd(); + } + } + + warn_bind = true; warn_bufsize = true; warn_flag_misc = true; @@ -372,7 +388,8 @@ static int configure_sockets(conf_t *conf, server_t *s) unsigned size_udp = s->handlers[IO_UDP].handler.unit->size; unsigned size_tcp = s->handlers[IO_TCP].handler.unit->size; bool tcp_reuseport = conf->cache.srv_tcp_reuseport; - iface_t *new_if = server_init_iface(&addr, size_udp, size_tcp, tcp_reuseport); + iface_t *new_if = server_init_iface(&addr, size_udp, size_tcp, tcp_reuseport, + conf->cache.srv_xdp_threads > 0); if (new_if != NULL) { add_tail(newlist, &new_if->n); } @@ -386,9 +403,9 @@ static int configure_sockets(conf_t *conf, server_t *s) /* Set the ID's (thread_id) of both the TCP and UDP threads. */ unsigned thread_count = 0; - for (unsigned proto = IO_UDP; proto <= IO_TCP; ++proto) { + for (unsigned proto = IO_UDP; proto <= IO_XDP; ++proto) { dt_unit_t *tu = s->handlers[proto].handler.unit; - for (unsigned i = 0; i < tu->size; ++i) { + for (unsigned i = 0; tu != NULL && i < tu->size; ++i) { s->handlers[proto].handler.thread_id[i] = thread_count++; } } @@ -535,7 +552,7 @@ int server_start(server_t *server, bool async) /* Start I/O handlers. */ server->state |= ServerRunning; - for (int proto = IO_UDP; proto <= IO_TCP; ++proto) { + for (int proto = IO_UDP; proto <= IO_XDP; ++proto) { if (server->handlers[proto].size > 0) { int ret = dt_start(server->handlers[proto].handler.unit); if (ret != KNOT_EOK) { @@ -556,7 +573,7 @@ void server_wait(server_t *server) evsched_join(&server->sched); worker_pool_join(server->workers); - for (int proto = IO_UDP; proto <= IO_TCP; ++proto) { + for (int proto = IO_UDP; proto <= IO_XDP; ++proto) { if (server->handlers[proto].size > 0) { server_free_handler(&server->handlers[proto].handler); } @@ -782,7 +799,7 @@ void server_stop(server_t *server) server->state &= ~ServerRunning; } -static int set_handler(server_t *server, int index, unsigned size, runnable_t run) +static int set_handler(server_t *server, int index, unsigned size, bool use_xdp, runnable_t run) { /* Initialize I/O handlers. */ int ret = server_init_handler(server, index, size, run, NULL); @@ -791,6 +808,7 @@ static int set_handler(server_t *server, int index, unsigned size, runnable_t ru } server->handlers[index].size = size; + server->handlers[index].handler.use_xdp = use_xdp; return KNOT_EOK; } @@ -798,12 +816,19 @@ static int set_handler(server_t *server, int index, unsigned size, runnable_t ru /*! \brief Reconfigure UDP and TCP query processing threads. */ static int configure_threads(conf_t *conf, server_t *server) { - int ret = set_handler(server, IO_UDP, conf->cache.srv_udp_threads, udp_master); + int ret = set_handler(server, IO_UDP, conf->cache.srv_udp_threads, false, udp_master); if (ret != KNOT_EOK) { return ret; } - return set_handler(server, IO_TCP, conf->cache.srv_tcp_threads, tcp_master); + if (conf->cache.srv_xdp_threads > 0) { + ret = set_handler(server, IO_XDP, conf->cache.srv_xdp_threads, true, udp_master); + if (ret != KNOT_EOK) { + return ret; + } + } + + return set_handler(server, IO_TCP, conf->cache.srv_tcp_threads, false, tcp_master); } static int reconfigure_journal_db(conf_t *conf, server_t *server) diff --git a/src/knot/server/server.h b/src/knot/server/server.h index 429105e3a7ac2e2b084650b00fd0d4995564001c..06872b87d1b0f1394c38fd0d4b29edd170251b5c 100644 --- a/src/knot/server/server.h +++ b/src/knot/server/server.h @@ -38,6 +38,7 @@ typedef struct iohandler { dt_unit_t *unit; /*!< Threading unit */ unsigned *thread_state; /*!< Thread state */ unsigned *thread_id; /*!< Thread identifier. */ + bool use_xdp; /*!< Using XDP. */ } iohandler_t; /*! \brief Server state flags. @@ -56,13 +57,15 @@ typedef struct iface { int fd_udp_count; int *fd_tcp; int fd_tcp_count; + int fd_xdp; struct sockaddr_storage addr; } iface_t; /* Handler indexes. */ enum { IO_UDP = 0, - IO_TCP = 1 + IO_TCP = 1, + IO_XDP = 2, }; /*! @@ -85,7 +88,7 @@ typedef struct server { struct { unsigned size; iohandler_t handler; - } handlers[2]; + } handlers[3]; /*! \brief Background jobs. */ worker_pool_t *workers; diff --git a/src/knot/server/udp-handler.c b/src/knot/server/udp-handler.c index 8abab411c968ce69e0f01e4829c44d6b0d60073f..a0a205ce2f239477e43df98746d94e68e1450734 100644 --- a/src/knot/server/udp-handler.c +++ b/src/knot/server/udp-handler.c @@ -39,6 +39,8 @@ #include "knot/server/server.h" #include "knot/server/udp-handler.h" +#include "libknot/xdp/af_xdp.h" + /* Buffer identifiers. */ enum { RX = 0, @@ -102,12 +104,13 @@ static void udp_handle(udp_context_t *udp, int fd, struct sockaddr_storage *ss, mp_flush(udp->layer.mm->ctx); } -/*! \brief Pointer to selected UDP master implementation. */ -static void* (*_udp_init)(void) = 0; -static void (*_udp_deinit)(void *) = 0; -static int (*_udp_recv)(int, void *) = 0; -static int (*_udp_handle)(udp_context_t *, void *) = 0; -static int (*_udp_send)(void *) = 0; +typedef struct { + void* (*udp_init)(void); + void (*udp_deinit)(void *); + int (*udp_recv)(int, void *); + int (*udp_handle)(udp_context_t *, void *); + int (*udp_send)(void *); +} udp_api_t; /*! \brief Control message to fit IP_PKTINFO or IPv6_RECVPKTINFO. */ typedef union { @@ -230,6 +233,15 @@ static int udp_recvfrom_send(void *d) return 0; } +__attribute__ ((unused)) +static udp_api_t udp_recvfrom_api = { + udp_recvfrom_init, + udp_recvfrom_deinit, + udp_recvfrom_recv, + udp_recvfrom_handle, + udp_recvfrom_send +}; + #ifdef ENABLE_RECVMMSG /* UDP recvmmsg() request struct. */ @@ -336,30 +348,111 @@ static int udp_recvmmsg_send(void *d) } return rc; } + +static udp_api_t udp_recvmmsg_api = { + udp_recvmmsg_init, + udp_recvmmsg_deinit, + udp_recvmmsg_recv, + udp_recvmmsg_handle, + udp_recvmmsg_send +}; #endif /* ENABLE_RECVMMSG */ -/*! \brief Initialize UDP master routine on run-time. */ -void __attribute__ ((constructor)) udp_master_init(void) +struct xdp_recvmmsg { + knot_xsk_msg_t msgs_rx[XDP_BATCHLEN]; + knot_xsk_msg_t msgs_tx[XDP_BATCHLEN]; + uint32_t rcvd; +}; + +static void *xdp_recvmmsg_init(void) { - /* Initialize defaults. */ - _udp_init = udp_recvfrom_init; - _udp_deinit = udp_recvfrom_deinit; - _udp_recv = udp_recvfrom_recv; - _udp_handle = udp_recvfrom_handle; - _udp_send = udp_recvfrom_send; + struct xdp_recvmmsg *rq = malloc(sizeof(*rq)); + if (rq != NULL) { + memset(rq, 0, sizeof(*rq)); + } + return rq; +} -#ifdef ENABLE_RECVMMSG - _udp_init = udp_recvmmsg_init; - _udp_deinit = udp_recvmmsg_deinit; - _udp_recv = udp_recvmmsg_recv; - _udp_handle = udp_recvmmsg_handle; - _udp_send = udp_recvmmsg_send; -#endif /* ENABLE_RECVMMSG */ +static void xdp_recvmmsg_deinit(void *d) +{ + free(d); +} + +static int xdp_recvmmsg_recv(int fd, void *d) +{ + UNUSED(fd); + + struct xdp_recvmmsg *rq = (struct xdp_recvmmsg *)d; + + int ret = knot_xsk_recvmmsg(rq->msgs_rx, XDP_BATCHLEN, &rq->rcvd); + + return ret == KNOT_EOK ? rq->rcvd : ret; +} + +static int xdp_recvmmsg_handle(udp_context_t *ctx, void *d) +{ + struct xdp_recvmmsg *rq = (struct xdp_recvmmsg *)d; + + for (unsigned i = 0; i < rq->rcvd; ++i) { + struct iovec *rx = &rq->msgs_rx[i].payload; + struct iovec *tx = &rq->msgs_tx[i].payload; + + *tx = knot_xsk_alloc_frame(); + if (tx->iov_base == NULL) { + return KNOT_ERROR; + } + + udp_handle(ctx, 0 /* TODO ? */, &rq->msgs_rx[i].ip_from, rx, tx); + + memcpy(rq->msgs_tx[i].eth_from, rq->msgs_rx[i].eth_to, sizeof(rq->msgs_tx[i].eth_from)); + memcpy(rq->msgs_tx[i].eth_to, rq->msgs_rx[i].eth_from, sizeof(rq->msgs_tx[i].eth_to)); + memcpy(&rq->msgs_tx[i].ip_from, &rq->msgs_rx[i].ip_to, sizeof(rq->msgs_tx[i].ip_from)); + memcpy(&rq->msgs_tx[i].ip_to, &rq->msgs_rx[i].ip_from, sizeof(rq->msgs_tx[i].ip_to)); + + knot_xsk_free_recvd(&rq->msgs_rx[i]); + + // FIXME!! : + /* + rq->msgs[TX][i].msg_len = tx->iov_len; + rq->msgs[TX][i].msg_hdr.msg_namelen = 0; + if (tx->iov_len > 0) { + rq->msgs[TX][i].msg_hdr.msg_namelen = rq->msgs[RX][i].msg_hdr.msg_namelen; + } */ + } + + return KNOT_EOK; +} + +static int xdp_recvmmsg_send(void *d) +{ + struct xdp_recvmmsg *rq = (struct xdp_recvmmsg *)d; + uint32_t sent = rq->rcvd; + + int ret = knot_xsk_sendmmsg(rq->msgs_tx, sent); + + memset(rq, 0, sizeof(*rq)); + + knot_xsk_check(); + + return ret == KNOT_EOK ? sent : ret; } +static udp_api_t xdp_recvmmsg_api = { + xdp_recvmmsg_init, + xdp_recvmmsg_deinit, + xdp_recvmmsg_recv, + xdp_recvmmsg_handle, + xdp_recvmmsg_send +}; + /*! \brief Get interface UDP descriptor for a given thread. */ -static int iface_udp_fd(const iface_t *iface, int thread_id) +static int iface_udp_fd(const iface_t *iface, int thread_id, bool use_xdp) { + if (use_xdp) { + assert(iface->fd_xdp > -1); + return iface->fd_xdp; + } + #ifdef ENABLE_REUSEPORT assert(thread_id < iface->fd_udp_count); @@ -379,7 +472,7 @@ static int iface_udp_fd(const iface_t *iface, int thread_id) * \return Number of watched descriptors, zero on error. */ static unsigned udp_set_ifaces(const list_t *ifaces, struct pollfd **fds_ptr, - int thread_id) + int thread_id, bool use_xdp) { if (ifaces == NULL) { return 0; @@ -394,7 +487,7 @@ static unsigned udp_set_ifaces(const list_t *ifaces, struct pollfd **fds_ptr, iface_t *iface; int i = 0; WALK_LIST(iface, *ifaces) { - fds[i].fd = iface_udp_fd(iface, thread_id); + fds[i].fd = iface_udp_fd(iface, thread_id, use_xdp); fds[i].events = POLLIN; fds[i].revents = 0; i += 1; @@ -420,7 +513,17 @@ int udp_master(dthread_t *thread) /* Prepare structures for bound sockets. */ unsigned thr_id = dt_get_id(thread); iohandler_t *handler = (iohandler_t *)thread->data; - void *rq = _udp_init(); + udp_api_t *api = NULL; + if (handler->use_xdp) { + api = &xdp_recvmmsg_api; + } else { +#ifdef ENABLE_RECVMMSG + api = &udp_recvmmsg_api; +#else + api = &udp_recvfrom_api; +#endif + } + void *rq = api->udp_init(); /* Create big enough memory cushion. */ knot_mm_t mm; @@ -437,7 +540,8 @@ int udp_master(dthread_t *thread) struct pollfd *fds = NULL; /* Allocate descriptors for the configured interfaces. */ - unsigned nfds = udp_set_ifaces(handler->server->ifaces, &fds, udp.thread_id); + unsigned nfds = udp_set_ifaces(handler->server->ifaces, &fds, + udp.thread_id, handler->use_xdp); if (nfds == 0) { goto finish; } @@ -464,15 +568,15 @@ int udp_master(dthread_t *thread) continue; } events -= 1; - if (_udp_recv(fds[i].fd, rq) > 0) { - _udp_handle(&udp, rq); - _udp_send(rq); + if (api->udp_recv(fds[i].fd, rq) > 0) { + api->udp_handle(&udp, rq); + api->udp_send(rq); } } } finish: - _udp_deinit(rq); + api->udp_deinit(rq); free(fds); mp_delete(mm.ctx); diff --git a/src/knot/server/udp-handler.h b/src/knot/server/udp-handler.h index 8067144c13b9ecb88d4b7aa2fe8da8b6f0897d67..18c55642023c2ebc310c1c4c096f9f76552d29e9 100644 --- a/src/knot/server/udp-handler.h +++ b/src/knot/server/udp-handler.h @@ -27,6 +27,7 @@ #include "knot/server/dthreads.h" #define RECVMMSG_BATCHLEN 10 /*!< Default recvmmsg() batch size. */ +#define XDP_BATCHLEN RECVMMSG_BATCHLEN /*! * \brief UDP handler thread runnable. diff --git a/src/libknot/xdp/af_xdp.c b/src/libknot/xdp/af_xdp.c index 2a14943586cbf594277b5c4aea7ef93355a34ff9..9bb5fc3a29a689f0988eee82bdfdf4ca6eb89658 100644 --- a/src/libknot/xdp/af_xdp.c +++ b/src/libknot/xdp/af_xdp.c @@ -1,11 +1,29 @@ -/* LATER: - * - XDP_USE_NEED_WAKEUP (optimization discussed in summer 2019) +/* Copyright (C) 2019 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ -#include "daemon/af_xdp.h" +/* LATER: + * - XDP_USE_NEED_WAKEUP (optimization discussed in summer 2019) + */ + +#include "libknot/xdp/af_xdp.h" +#include "libknot/attribute.h" +#include "libknot/error.h" #include <assert.h> #include <errno.h> @@ -29,18 +47,19 @@ #include <linux/filter.h> //#include <linux/icmpv6.h> -#include "contrib/ucw/lib.h" +//#include "contrib/ucw/lib.h" #include "contrib/ucw/mempool.h" +#include "contrib/macros.h" -#include "lib/resolve.h" -#include "daemon/session.h" -#include "daemon/worker.h" +//#include "lib/resolve.h" +//#include "daemon/session.h" +//#include "daemon/worker.h" -#include "daemon/kxsk/impl.h" +#include "libknot/xdp/bpf-user.h" // placate libclang :-/ -typedef uint64_t size_t; +//typedef uint64_t size_t; #define FRAME_SIZE 4096 #define RX_BATCH_SIZE 64 @@ -49,15 +68,17 @@ typedef uint64_t size_t; struct umem_frame { union { uint8_t bytes[FRAME_SIZE]; struct { - struct qr_task *task; struct udpv4 udpv4; }; }; }; +static const size_t UDPV4_PAYLOAD_OFFSET = offsetof(struct udpv4, data); +static const size_t FRAME_PAYLOAD_OFFSET = UDPV4_PAYLOAD_OFFSET + offsetof(struct umem_frame, udpv4); +// FIXME later: get rid of those singletons! struct xsk_socket_info *the_socket = NULL; -struct config *the_config = NULL; +struct kxsk_config *the_config = NULL; /** Swap two bytes as a *constant* expression. ATM we assume we're LE, i.e. we do need to swap. */ #define BS16(n) (((n) >> 8) + (((n) & 0xff) << 8)) @@ -94,26 +115,27 @@ failed: return NULL; } -static struct umem_frame *xsk_alloc_umem_frame(struct xsk_umem_info *umem) // TODO: confusing to use xsk_ +static struct umem_frame *xsk_alloc_umem_frame(struct xsk_umem_info *umem) { if (unlikely(umem->free_count == 0)) { - fprintf(stderr, "[uxsk] no free frame!\n"); return NULL; } + uint32_t index = umem->free_indices[--umem->free_count]; - //kr_log_verbose("[uxsk] allocating frame %d\n", (int)index); - #ifndef NDEBUG - umem->free_indices[umem->free_count] = -1; - #endif return umem->frames + index; } -void *kr_xsk_alloc_wire(uint16_t *maxlen) + +_public_ +struct iovec knot_xsk_alloc_frame() { + struct iovec res = { 0 }; + struct umem_frame *uframe = xsk_alloc_umem_frame(the_socket->umem); - if (!uframe) return NULL; - *maxlen = MIN(UINT16_MAX, FRAME_SIZE - offsetof(struct umem_frame, udpv4.data) - - 4/*eth CRC*/); - return uframe->udpv4.data; + if (uframe != NULL) { + res.iov_len = MIN(UINT16_MAX, FRAME_SIZE - FRAME_PAYLOAD_OFFSET - 4/*eth CRC*/); + res.iov_base = uframe->udpv4.data; + } + return res; } static void xsk_dealloc_umem_frame(struct xsk_umem_info *umem, uint8_t *uframe_p) @@ -126,7 +148,8 @@ static void xsk_dealloc_umem_frame(struct xsk_umem_info *umem, uint8_t *uframe_p umem->free_indices[umem->free_count++] = index; } -void kr_xsk_deinit_global(void) +_public_ +void knot_xsk_deinit() { if (!the_socket) return; @@ -139,7 +162,7 @@ void kr_xsk_deinit_global(void) } /** Add some free frames into the RX fill queue (possibly zero, etc.) */ -int kxsk_umem_refill(const struct config *cfg, struct xsk_umem_info *umem) +static int kxsk_umem_refill(const struct kxsk_config *cfg, struct xsk_umem_info *umem) { /* First find to_reserve: how many frames to move to the RX fill queue. * Let's keep about as many frames ready for TX (free_count) as for RX (fq_ready), @@ -152,8 +175,8 @@ int kxsk_umem_refill(const struct config *cfg, struct xsk_umem_info *umem) const int balance = (fq_ready + umem->free_count) / 2; const int fq_want = MIN(balance, fq_target); // don't overshoot the target const int to_reserve = fq_want - fq_ready; - kr_log_verbose("[uxsk] refilling %d frames TX->RX; TX = %d, RX = %d\n", - to_reserve, (int)umem->free_count, (int)fq_ready); + //kr_log_verbose("[uxsk] refilling %d frames TX->RX; TX = %d, RX = %d\n", + // to_reserve, (int)umem->free_count, (int)fq_ready); if (to_reserve <= 0) return 0; @@ -177,8 +200,9 @@ int kxsk_umem_refill(const struct config *cfg, struct xsk_umem_info *umem) return 0; } -static struct xsk_socket_info * xsk_configure_socket(struct config *cfg, - struct xsk_umem_info *umem, const struct kxsk_iface *iface) +static struct xsk_socket_info *xsk_configure_socket(struct kxsk_config *cfg, + struct xsk_umem_info *umem, + const struct kxsk_iface *iface) { /* Put a couple RX buffers into the fill queue. * Even if we don't need them, it silences a dmesg line, @@ -196,15 +220,14 @@ static struct xsk_socket_info * xsk_configure_socket(struct config *cfg, assert(cfg->xsk.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD); errno = xsk_socket__create(&xsk_info->xsk, iface->ifname, - cfg->xsk_if_queue, umem->umem, &xsk_info->rx, - &xsk_info->tx, &cfg->xsk); + cfg->xsk_if_queue, umem->umem, + &xsk_info->rx, &xsk_info->tx, &cfg->xsk); return xsk_info; } - - /* Two helper functions taken from Linux kernel 5.2, slightly modified. */ +__attribute__ ((unused)) static inline uint32_t from64to32(uint64_t x) { /* add up 32-bit and 32-bit for 32+c bit */ @@ -219,49 +242,8 @@ static inline uint16_t from32to16(uint32_t sum) sum = (sum & 0xffff) + (sum >> 16); return sum; } -/** Compute the checksum of the IPv4 header. - * - * Slightly inspired by Linux 5.2 csum_tcpudp_* and friends. - * This version only works on little endian; the result is in BE/network order. - * - * FIXME: this is wrong, apparently; use *_2() at least for now. - */ -static __be16 pkt_ipv4_checksum(const struct iphdr *h) -{ - int64_t s = 0; - s += (h->ihl << 8) + (h->version << 12) + h->tos; - s += (h->tot_len + h->id + h->frag_off) << 8; - s += (h->ttl << 8) + h->protocol; - s += h->saddr; - s += h->daddr; - uint16_t res_le = ~from32to16(from64to32(s)); - return BS16(res_le); -} -static void test_pkt_ipv4_checksum() -{ - // https://en.wikipedia.org/wiki/IPv4_header_checksum#Calculating_the_IPv4_header_checksum - const struct iphdr h1 = { - .version = 4, - .ihl = 5, - .tos = 0, - .tot_len = BS16(0x73), - .id = BS16(0), - .frag_off = BS16(0x4000), - .ttl = 0x40, - .protocol = 0x11, // UDP - .check = 0, // unused - .saddr = 0xc0a80001, - .daddr = 0xc0a800c7, - }; - const uint16_t c1 = 0xb861; - - uint16_t cc1 = BS16(pkt_ipv4_checksum(&h1)); // we work in native order here - if (cc1 == c1) - fprintf(stderr, "OK\n"); - else - fprintf(stderr, "0x%x != 0x%x\n", cc1, c1); -} +// TODO: slow? static __be16 pkt_ipv4_checksum_2(const struct iphdr *h) { const uint16_t *ha = (const uint16_t *)h; @@ -272,45 +254,12 @@ static __be16 pkt_ipv4_checksum_2(const struct iphdr *h) return ~BS16(from32to16(sum32)); } -static void pkt_fill_headers(struct udpv4 *dst, struct udpv4 *template, int data_len) -{ - memcpy(dst, template, sizeof(*template)); - - const uint16_t udp_len = sizeof(dst->udp) + data_len; - dst->udp.len = BS16(udp_len); - - assert(dst->ipv4.ihl == 5); // header length 20 - dst->ipv4.tot_len = BS16(20 + udp_len); - dst->ipv4.check = pkt_ipv4_checksum_2(&dst->ipv4); - - // Ethernet checksum not needed, apparently. -#ifdef KR_XDP_ETH_CRC - /* Finally CRC32 over the whole ethernet frame; we use zlib here. */ - uLong eth_crc = crc32(0L, Z_NULL, 0); - eth_crc = crc32(eth_crc, (const void *)dst, offsetof(struct udpv4, data) + data_len); - uint32_t eth_crc_be = BS32(eth_crc); - memcpy(dst->data + data_len, ð_crc_be, sizeof(eth_crc_be)); - - return; // code below is broken/wrong, probably -#ifndef NDEBUG - fprintf(stderr, "%x\n", (uint32_t)eth_crc); - eth_crc = crc32(eth_crc, (const void *)&dst->data[data_len], 4); - fprintf(stderr, "%x\n", (uint32_t)eth_crc); - eth_crc = crc32(0L, Z_NULL, 0); - eth_crc = crc32(eth_crc, (const void *)dst, offsetof(struct udpv4, data) + data_len + 4); - fprintf(stderr, "%x\n", (uint32_t)eth_crc); - assert(eth_crc == 0xC704DD7B); -#endif -#endif -} - -static void pkt_send(struct xsk_socket_info *xsk, uint64_t addr, uint32_t len) +static int pkt_send(struct xsk_socket_info *xsk, uint64_t addr, uint32_t len) { uint32_t tx_idx; int ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); if (unlikely(ret != 1)) { - fprintf(stderr, "No more transmit slots, dropping the packet\n"); - return; + return KNOT_NET_ESEND; } *xsk_ring_prod__tx_desc(&xsk->tx, tx_idx) = (struct xdp_desc){ @@ -319,85 +268,99 @@ static void pkt_send(struct xsk_socket_info *xsk, uint64_t addr, uint32_t len) }; xsk_ring_prod__submit(&xsk->tx, 1); xsk->kernel_needs_wakeup = true; + return KNOT_EOK; } -void kr_xsk_push(const struct sockaddr *src, const struct sockaddr *dst, - struct kr_request *req, struct qr_task *task, uint8_t eth_addrs[2][6]) + +static uint8_t *msg_uframe_p(const knot_xsk_msg_t *msg) { - kr_log_verbose("[uxsk] pushing a packet\n"); - assert(src->sa_family == AF_INET && dst->sa_family == AF_INET); - uint8_t *uframe_p = req->answer->wire - offsetof(struct umem_frame, udpv4.data); + uint8_t *uframe_p = msg->payload.iov_base - FRAME_PAYLOAD_OFFSET; const uint8_t *umem_mem_start = the_socket->umem->frames->bytes; - #ifndef NDEBUG - assert((uframe_p - (uint8_t *)NULL) % FRAME_SIZE == 0); - size_t offset = uframe_p - umem_mem_start; - assert(offset / FRAME_SIZE < the_socket->umem->frame_count); - #endif - struct umem_frame *uframe = (struct umem_frame *)uframe_p; - uframe->task = task; + if (((uframe_p - (uint8_t *)NULL) % FRAME_SIZE != 0) || + ((uframe_p - umem_mem_start) / FRAME_SIZE >= the_socket->umem->frame_count)) { + // not allocated msg->payload correctly + return NULL; + } + return uframe_p; +} +_public_ +int knot_xsk_sendmsg(const knot_xsk_msg_t *msg) +{ + uint8_t *uframe_p = msg_uframe_p(msg); + if (uframe_p == NULL) { + return KNOT_EINVAL; + } - // Filling headers; testing version in pkt_fill_headers() + struct umem_frame *uframe = (struct umem_frame *)uframe_p; + struct udpv4 *h = &uframe->udpv4; // sockaddr* contents is already in network byte order - const struct sockaddr_in *src_v4 = (const struct sockaddr_in *)src; - const struct sockaddr_in *dst_v4 = (const struct sockaddr_in *)dst; - - const struct udpv4 *t = &the_config->pkt_template; - struct udpv4 *h = &uframe->udpv4; + const struct sockaddr_in *src_v4 = (const struct sockaddr_in *)&msg->ip_from; + const struct sockaddr_in *dst_v4 = (const struct sockaddr_in *)&msg->ip_to; - // UDP: struct udphdr - const uint16_t udp_len = sizeof(h->udp) + req->answer->size; + const uint16_t udp_len = sizeof(h->udp) + msg->payload.iov_len; h->udp.len = BS16(udp_len); h->udp.source = src_v4->sin_port; h->udp.dest = dst_v4->sin_port; h->udp.check = 0; - // IPv4: struct iphdr - h->ipv4.ihl = t->ipv4.ihl; - h->ipv4.version = t->ipv4.version; - h->ipv4.tos = t->ipv4.tos; - assert(h->ipv4.ihl == 5); // header length 20 + h->ipv4.ihl = 5; // required <= hdr len 20 + h->ipv4.version = 4; + h->ipv4.tos = 0; // default: best-effort DSCP + no ECN support h->ipv4.tot_len = BS16(20 + udp_len); - h->ipv4.id = t->ipv4.id; - h->ipv4.frag_off = t->ipv4.frag_off; - h->ipv4.ttl = t->ipv4.ttl; - h->ipv4.protocol = t->ipv4.protocol; + h->ipv4.id = BS16(0); // probably anything; details: RFC 6864 + h->ipv4.frag_off = 0; // TODO ? + h->ipv4.ttl = IPDEFTTL; + h->ipv4.protocol = 0x11; // UDP + memcpy(&h->ipv4.saddr, &src_v4->sin_addr, sizeof(src_v4->sin_addr)); memcpy(&h->ipv4.daddr, &dst_v4->sin_addr, sizeof(dst_v4->sin_addr)); + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Waddress-of-packed-member" h->ipv4.check = pkt_ipv4_checksum_2(&h->ipv4); +#pragma GCC diagnostic pop - // Ethernet: struct ethhdr - memcpy(h->eth.h_dest, eth_addrs[1], sizeof(eth_addrs[1])); - memcpy(h->eth.h_source, eth_addrs[0], sizeof(eth_addrs[0])); - h->eth.h_proto = t->eth.h_proto; - uint32_t eth_len = offsetof(struct udpv4, data) + req->answer->size + 4/*CRC*/; - pkt_send(the_socket, h->bytes - umem_mem_start, eth_len); + memcpy(h->eth.h_dest, msg->eth_to, sizeof(msg->eth_to)); + memcpy(h->eth.h_source, msg->eth_from, sizeof(msg->eth_from)); + h->eth.h_proto = BS16(ETH_P_IP); + + uint32_t eth_len = FRAME_PAYLOAD_OFFSET + msg->payload.iov_len + 4/*CRC*/; + + return pkt_send(the_socket, h->bytes - the_socket->umem->frames->bytes, eth_len); } -/** Periodical callback . */ -static void xsk_check(uv_check_t *handle) +_public_ +int knot_xsk_sendmmsg(const knot_xsk_msg_t msgs[], uint32_t count) +{ + int ret = KNOT_EOK; + for (int i = 0; i < count && ret == KNOT_EOK; i++) { + if (msgs[i].payload.iov_len > 0) { + ret = knot_xsk_sendmsg(&msgs[i]); + } + } + return ret; +} + +/** Periodical callback. Just using 'the_socket' global. */ +_public_ +int knot_xsk_check() { /* Trigger sending queued packets. * LATER(opt.): the periodical epoll due to the uv_poll* stuff * is probably enough to wake the kernel even for sending * (though AFAIK it might be specific to driver and/or kernel version). */ if (the_socket->kernel_needs_wakeup) { - bool is_ok = sendto(xsk_socket__fd(the_socket->xsk), NULL, 0, - MSG_DONTWAIT, NULL, 0) != -1; + int sendret = sendto(xsk_socket__fd(the_socket->xsk), NULL, 0, MSG_DONTWAIT, NULL, 0); + bool is_ok = (sendret != -1); const bool is_again = !is_ok && (errno == EWOULDBLOCK || errno == EAGAIN); if (is_ok || is_again) { the_socket->kernel_needs_wakeup = false; // EAGAIN is unclear; we'll retry the syscall later, to be sure } if (!is_ok && !is_again) { - const uint64_t stamp_now = kr_now(); - static uint64_t stamp_last = 0; - if (stamp_now > stamp_last + 10*1000) { - kr_log_info("WARNING: sendto error (reported at most once per 10s)\n\t%s\n", - strerror(errno)); - stamp_last = stamp_now; - } + return KNOT_EAGAIN; } } @@ -405,26 +368,22 @@ static void xsk_check(uv_check_t *handle) struct xsk_ring_cons *cq = &the_socket->umem->cq; uint32_t idx_cq; const uint32_t completed = xsk_ring_cons__peek(cq, UINT32_MAX, &idx_cq); - kr_log_verbose("."); - if (!completed) return; + if (!completed) return KNOT_EOK; // ? + + /* Free shared memory. */ for (int i = 0; i < completed; ++i, ++idx_cq) { - uint8_t *uframe_p = (uint8_t *)the_socket->umem->frames - + *xsk_ring_cons__comp_addr(cq, idx_cq) - - offsetof(struct umem_frame, udpv4); - const struct umem_frame *uframe = (struct umem_frame *)uframe_p; - qr_task_on_send(uframe->task, NULL, 0/*no error feedback*/); + uint8_t *uframe_p = (uint8_t *)the_socket->umem->frames + *xsk_ring_cons__comp_addr(cq, idx_cq) - offsetof(struct umem_frame, udpv4); xsk_dealloc_umem_frame(the_socket->umem, uframe_p); } + xsk_ring_cons__release(cq, completed); - kr_log_verbose("[uxsk] completed %d frames; busy frames: %d\n", (int)completed, - the_socket->umem->frame_count - the_socket->umem->free_count); //TODO: one uncompleted packet/batch is left until the next I/O :-/ /* And feed frames into RX fill queue. */ - kxsk_umem_refill(the_config, the_socket->umem); + return kxsk_umem_refill(the_config, the_socket->umem); } - -static void rx_desc(struct xsk_socket_info *xsi, const struct xdp_desc *desc) +static int rx_desc(struct xsk_socket_info *xsi, const struct xdp_desc *desc, + knot_xsk_msg_t *msg) { uint8_t *uframe_p = xsi->umem->frames->bytes + desc->addr; const struct ethhdr *eth = (struct ethhdr *)uframe_p; @@ -432,95 +391,87 @@ static void rx_desc(struct xsk_socket_info *xsi, const struct xdp_desc *desc) const struct ipv6hdr *ipv6 = NULL; const struct udphdr *udp; + int ret = KNOT_EOK; // FIXME: length checks on multiple places if (eth->h_proto == BS16(ETH_P_IP)) { ipv4 = (struct iphdr *)(uframe_p + sizeof(struct ethhdr)); - kr_log_verbose("[kxsk] frame len %d, ipv4 len %d\n", - (int)desc->len, (int)BS16(ipv4->tot_len)); // Any fragmentation stuff is bad for use, except for the DF flag - if (ipv4->version != 4 || (ipv4->frag_off & ~(1 << 14))) { - kr_log_info("[kxsk] weird IPv4 received: " - "version %d, frag_off %d\n", - (int)ipv4->version, (int)ipv4->frag_off); + uint16_t frag_off = BS16(ipv4->frag_off); + if (ipv4->version != 4 || (frag_off & ~(1 << 14))) { + ret = KNOT_EFEWDATA; goto free_frame; } - if (ipv4->protocol != 0x11) // UDP + if (ipv4->protocol != 0x11) { // UDP + ret = KNOT_ESEMCHECK; goto free_frame; + } // FIXME ipv4->check (sensitive to ipv4->ihl), ipv4->tot_len, udp->len udp = (struct udphdr *)(uframe_p + sizeof(struct ethhdr) + ipv4->ihl * 4); } else if (eth->h_proto == BS16(ETH_P_IPV6)) { (void)ipv6; - goto free_frame; // TODO - + ret = KNOT_ENOTSUP; // FIXME later + goto free_frame; } else { - kr_log_verbose("[kxsk] frame with unknown h_proto %d (ignored)\n", - (int)BS16(eth->h_proto)); + ret = KNOT_ENOTSUP; goto free_frame; } assert(eth && (!!ipv4 != !!ipv6) && udp); - uint8_t *udp_data = (uint8_t *)udp + sizeof(struct udphdr); - const uint16_t udp_data_len = BS16(udp->len) - sizeof(struct udphdr); + + msg->payload.iov_base = (uint8_t *)udp + sizeof(struct udphdr); + msg->payload.iov_len = BS16(udp->len) - sizeof(struct udphdr); + + memcpy(msg->eth_from, eth->h_source, sizeof(msg->eth_from)); + memcpy(msg->eth_to, eth->h_dest, sizeof(msg->eth_to)); // process the packet; ownership is passed on, but beware of holding frames // LATER: filter the address-port combinations that we listen on? - union inaddr sa_peer; - if (ipv4) { - sa_peer.ip4.sin_family = AF_INET; - sa_peer.ip4.sin_port = udp->source; - memcpy(&sa_peer.ip4.sin_addr, &ipv4->saddr, sizeof(ipv4->saddr)); - } else { - sa_peer.ip6.sin6_family = AF_INET6; - sa_peer.ip6.sin6_port = udp->source; - memcpy(&sa_peer.ip6.sin6_addr, &ipv6->saddr, sizeof(ipv6->saddr)); - //sa_peer.ip6.sin6_scope_id = the_config->xsk_if_queue; - //sin6_flowinfo: probably completely useless here - } - - knot_pkt_t *kpkt = knot_pkt_new(udp_data, udp_data_len, &the_worker->pkt_pool); - int ret = kpkt == NULL ? kr_error(ENOMEM) : - worker_submit(xsi->session, &sa_peer.ip, (const uint8_t (*)[6])eth, kpkt); - if (ret) - kr_log_verbose("[kxsk] worker_submit() == %d: %s\n", ret, kr_strerror(ret)); - mp_flush(the_worker->pkt_pool.ctx); + assert(ipv4); + struct sockaddr_in *src_v4 = (struct sockaddr_in *)&msg->ip_from; + struct sockaddr_in *dst_v4 = (struct sockaddr_in *)&msg->ip_to; + memcpy(&src_v4->sin_addr, &ipv4->saddr, sizeof(src_v4->sin_addr)); + memcpy(&dst_v4->sin_addr, &ipv4->daddr, sizeof(dst_v4->sin_addr)); + src_v4->sin_port = udp->source; + dst_v4->sin_port = udp->dest; - return; + return KNOT_EOK; free_frame: xsk_dealloc_umem_frame(xsi->umem, uframe_p); + return ret; } -// TODO: probably split up into generic part and kresd+UV part. -void kxsk_rx(uv_poll_t* handle, int status, int events) + +_public_ +int knot_xsk_recvmmsg(knot_xsk_msg_t msgs[], uint32_t max_count, uint32_t *count) { - if (status < 0) { - kr_log_error("[kxsk] poll status %d: %s\n", status, uv_strerror(status)); - return; - } - if (events != UV_READABLE) { - kr_log_error("[kxsk] poll unexpected events: %d\n", events); - return; - } + uint32_t idx_rx = 0; + int ret = KNOT_EOK; + *count = xsk_ring_cons__peek(&the_socket->rx, max_count, &idx_rx); + assert(*count <= max_count); - struct xsk_socket_info *xsi = handle->data; - assert(xsi == the_socket); // for now + for (size_t i = 0; i < *count && ret == KNOT_EOK; ++i, ++idx_rx) { + ret = rx_desc(the_socket, xsk_ring_cons__rx_desc(&the_socket->rx, idx_rx), &msgs[i]); + } - uint32_t idx_rx; - const size_t rcvd = xsk_ring_cons__peek(&xsi->rx, RX_BATCH_SIZE, &idx_rx); - kr_log_verbose("[kxsk] poll triggered, processing a batch of %d packets\n", - (int)rcvd); - if (!rcvd) - return; - for (int i = 0; i < rcvd; ++i, ++idx_rx) { - rx_desc(xsi, xsk_ring_cons__rx_desc(&xsi->rx, idx_rx)); + if (ret == KNOT_EOK && *count > 0) { + xsk_ring_cons__release(&the_socket->rx, *count); } - xsk_ring_cons__release(&xsi->rx, rcvd); + return ret; } +_public_ +void knot_xsk_free_recvd(const knot_xsk_msg_t *msg) +{ + uint8_t *uframe_p = msg_uframe_p(msg); + if (uframe_p != NULL) { + xsk_dealloc_umem_frame(the_socket->umem, uframe_p); + } +} -static struct config the_config_storage = { // static to get zeroed by default +static struct kxsk_config the_config_storage = { // static to get zeroed by default .xsk_if_queue = 0, // defaults overridable by command-line -x eth3:0 .umem_frame_count = 8192, .umem = { @@ -535,92 +486,25 @@ static struct config the_config_storage = { // static to get zeroed by default .libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD, .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST, }, - .pkt_template = { - .eth = { - //.h_dest = "\xd8\x58\xd7\x00\x74\x34", - //.h_source = "\x70\x85\xc2\x3a\xc7\x84", - // mirkwood -> knot-bench-player: - .h_dest = "\xa0\x36\x9f\x50\x2a\x9c", - .h_source = "\x3c\xfd\xfe\x2b\xcf\x02", - // doriath -> eriador - //.h_dest = "\x00\x15\x17\xf8\xd0\x4a", - //.h_source = "\xf0\x1f\xaf\xe2\x80\x0d", - //.h_source = "\x00\x1e\x67\xe3\xb1\x24", // rohan - .h_proto = BS16(ETH_P_IP), - }, - .ipv4 = { - .version = 4, - .ihl = 5, - .tos = 0, // default: best-effort DSCP + no ECN support - .tot_len = BS16(0), // to be overwritten - .id = BS16(0), // probably anything; details: RFC 6864 - .frag_off = BS16(0), // TODO: add the DF flag, probably (1 << 14) - .ttl = IPDEFTTL, - .protocol = 0x11, // UDP - .check = 0, // to be overwritten - }, - .udp = { - .source = BS16(5353), - .dest = BS16(5353), - .len = BS16(0), // to be overwritten - .check = BS16(0), // checksum is optional - }, - }, }; -int kr_xsk_init_global(uv_loop_t *loop, char *cmdarg) +_public_ +int knot_xsk_init(const char *ifname, const char *prog_fname, + ssize_t *out_busy_frames) { - kxsk_alloc_hack = kr_xsk_alloc_wire; - if (!cmdarg) - return 0; - - /* Hard-coded configuration */ - const char - //sip_str[] = "192.168.8.71", - //dip_str[] = "192.168.8.1"; - sip_str[] = "192.168.100.8", - dip_str[] = "192.168.100.3"; - //sip_str[] = "217.31.193.167", - //dip_str[] = "217.31.193.166"; the_config = &the_config_storage; - if (inet_pton(AF_INET, sip_str, &the_config->pkt_template.ipv4.saddr) != 1 - || inet_pton(AF_INET, dip_str, &the_config->pkt_template.ipv4.daddr) != 1) { - fprintf(stderr, "ERROR: failed to convert IPv4 address\n"); - exit(EXIT_FAILURE); - } - char *colon = strchr(cmdarg, ':'); - if (colon) { - *colon = '\0'; // yes, modifying argv[i][j] isn't very nice - the_config->xsk_if_queue = atoi(colon + 1); - } - struct kxsk_iface *iface = kxsk_iface_new(cmdarg, - "./bpf-kernel.o" // FIXME: proper installation, etc. - ); + struct kxsk_iface *iface = kxsk_iface_new(ifname, prog_fname); if (!iface) { - fprintf(stderr, "ERROR: Can't set up network interface %s: %s\n", - cmdarg, strerror(errno)); - exit(EXIT_FAILURE); + return KNOT_EINVAL; } - /* Some failed test - void *data = malloc(2048); - struct udpv4 *pkt = data; - pkt_fill_headers(pkt, &the_config->pkt_template, 0); - // */ - - /* This one is OK! - test_pkt_ipv4_checksum(); - return 0; - // */ - /* Initialize shared packet_buffer for umem usage */ struct xsk_umem_info *umem = configure_xsk_umem(&the_config->umem, the_config->umem_frame_count); if (umem == NULL) { - fprintf(stderr, "ERROR: Can't create umem \"%s\"\n", - strerror(errno)); - exit(EXIT_FAILURE); + kxsk_iface_free(iface, false); + return KNOT_ENOMEM; } /* Open and configure the AF_XDP (xsk) socket */ @@ -628,60 +512,28 @@ int kr_xsk_init_global(uv_loop_t *loop, char *cmdarg) the_socket = xsk_configure_socket(the_config, umem, iface); if (!the_socket) { - fprintf(stderr, "ERROR, can't setup AF_XDP socket on %s:%d: %s\n", - iface->ifname, the_config->xsk_if_queue, strerror(errno)); - exit(EXIT_FAILURE); + xsk_umem__delete(umem->umem); + kxsk_iface_free(iface, false); + return KNOT_NET_ESOCKET; } int ret = kxsk_socket_start(iface, the_config->xsk_if_queue, the_socket->xsk); - if (ret) { - fprintf(stderr, "ERROR, can't start listening on AF_XDP socket on %s:%d: %s\n", - iface->ifname, the_config->xsk_if_queue, strerror(ret)); - exit(EXIT_FAILURE); + if (ret != KNOT_EOK) { + xsk_socket__delete(the_socket->xsk); + xsk_umem__delete(the_socket->umem->umem); + kxsk_iface_free(iface, false); + return ret; } - kr_log_verbose("[uxsk] busy frames: %d\n", - the_socket->umem->frame_count - the_socket->umem->free_count); - - - ret = uv_check_init(loop, &the_socket->check_handle); - if (!ret) ret = uv_check_start(&the_socket->check_handle, xsk_check); - - if (!ret) ret = uv_poll_init(loop, &the_socket->poll_handle, - xsk_socket__fd(the_socket->xsk)); - if (!ret) { - // beware: this sets poll_handle->data - struct session *s = the_socket->session = - session_new((uv_handle_t *)&the_socket->poll_handle, false); - assert(!session_flags(s)->outgoing); - - // TMP: because worker will pass this back as source address to us - struct sockaddr_in *ssa = (struct sockaddr_in *)session_get_sockname(s); - ssa->sin_family = AF_INET; - memcpy(&ssa->sin_addr, &the_config->pkt_template.ipv4.saddr, - sizeof(ssa->sin_addr)); - ssa->sin_port = the_config->pkt_template.udp.source; - - ret = s ? 0 : kr_error(ENOMEM); - } - if (!ret) { - the_socket->poll_handle.data = the_socket; - ret = uv_poll_start(&the_socket->poll_handle, UV_READABLE, kxsk_rx); + if (out_busy_frames != NULL) { + *out_busy_frames = the_socket->umem->frame_count - the_socket->umem->free_count; } + return ret; } -#define SOL_XDP 283 -static void print_stats(struct xsk_socket *xsk) +_public_ +int knot_xsk_get_poll_fd() { - struct xdp_statistics stats; - socklen_t optlen = sizeof(stats); - if (getsockopt(xsk_socket__fd(xsk), SOL_XDP, XDP_STATISTICS, &stats, &optlen)) { - fprintf(stderr, "getsockopt: %s\n", strerror(errno)); - } else { - fprintf(stderr, "stats: RX drop %d, RX ID %d, TX ID %d\n", - (int)stats.rx_dropped, (int)stats.rx_invalid_descs, - (int)stats.tx_invalid_descs); - } + return xsk_socket__fd(the_socket->xsk); } - diff --git a/src/libknot/xdp/af_xdp.h b/src/libknot/xdp/af_xdp.h index 94c9e60f6d18cff0c1623683551f7ec8de63cdd5..5bed4bf4a89b9a4dcf78c12e1e0e64fc13656ccc 100644 --- a/src/libknot/xdp/af_xdp.h +++ b/src/libknot/xdp/af_xdp.h @@ -1,18 +1,48 @@ +/* Copyright (C) 2019 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ #pragma once #include <stdint.h> -#include <uv.h> -int kr_xsk_init_global(uv_loop_t *loop, char *cmdarg); -void kr_xsk_deinit_global(void); +#include <sys/socket.h> + +typedef struct { + struct sockaddr_storage ip_from; + struct sockaddr_storage ip_to; + uint8_t eth_from[6]; + uint8_t eth_to[6]; + struct iovec payload; +} knot_xsk_msg_t; + +int knot_xsk_init(const char *ifname, const char *prog_fname, + ssize_t *out_busy_frames); + +void knot_xsk_deinit(void); + +struct iovec knot_xsk_alloc_frame(void); + +int knot_xsk_sendmsg(const knot_xsk_msg_t *msg); // msg->payload MUST have been allocated by knot_xsk_alloc_frame() + +int knot_xsk_sendmmsg(const knot_xsk_msg_t msgs[], uint32_t count); // skip messages with payload length == 0 + +int knot_xsk_recvmmsg(knot_xsk_msg_t msgs[], uint32_t max_count, uint32_t *count); -//void *kr_xsk_alloc_wire(uint16_t *maxlen); +void knot_xsk_free_recvd(const knot_xsk_msg_t *msg); -struct sockaddr; -struct kr_request; -struct qr_task; -/** Send req->answer via UDP, possibly not immediately. */ -void kr_xsk_push(const struct sockaddr *src, const struct sockaddr *dest, - struct kr_request *req, struct qr_task *task, uint8_t eth_addrs[2][6]); +int knot_xsk_check(void); +int knot_xsk_get_poll_fd(void); diff --git a/src/libknot/xdp/bpf-kernel.c b/src/libknot/xdp/bpf-kernel.c index 546a15f8d98d4a04e8a005597a3907dbc1aa9b53..e7958237e9d4986e36046b652bc73fe2686c3110 100644 --- a/src/libknot/xdp/bpf-kernel.c +++ b/src/libknot/xdp/bpf-kernel.c @@ -3,9 +3,9 @@ #include <linux/if_ether.h> #include <linux/ipv6.h> #include <linux/udp.h> -#include "bpf_helpers.h" -#include "bpf_endian.h" -#include "parsing_helpers.h" +#include "contrib/bpf/bpf_helpers.h" +#include "contrib/bpf/bpf_endian.h" +#include "contrib/bpf/parsing_helpers.h" /** Assume netdev has no more than 64 queues * LATER: it might be better to detect this on startup time (per-device). */ diff --git a/src/libknot/xdp/bpf-user.c b/src/libknot/xdp/bpf-user.c index ce92dbca220b898304d98c8e18192637bbf49af9..71d2f42e00ac6e60ca0249a8eaf0ac0dff3749b2 100644 --- a/src/libknot/xdp/bpf-user.c +++ b/src/libknot/xdp/bpf-user.c @@ -1,6 +1,7 @@ -#include "daemon/kxsk/impl.h" +#include "libknot/xdp/bpf-user.h" +#include <errno.h> #include <stdlib.h> #include <string.h> #include <unistd.h> diff --git a/src/libknot/xdp/bpf-user.h b/src/libknot/xdp/bpf-user.h index 90a8c4ec30d3340df664370206a8aff24397e791..8d1556c94f6e492508f9d0c81c2b7aac3ebe5320 100644 --- a/src/libknot/xdp/bpf-user.h +++ b/src/libknot/xdp/bpf-user.h @@ -1,6 +1,7 @@ - #pragma once +#include <stdint.h> + #include <bpf/xsk.h> #include <linux/if_ether.h> @@ -8,7 +9,7 @@ #include <linux/ipv6.h> #include <linux/udp.h> -#include <uv.h> // LATER: split kresd-specific stuff +//#include <uv.h> // LATER: split kresd-specific stuff struct udpv4 { union { uint8_t bytes[1]; struct { @@ -33,7 +34,7 @@ struct kxsk_iface { }; -struct config { +struct kxsk_config { int xsk_if_queue; struct xsk_umem_config umem; /**< For xsk_umem__create() from libbpf. */ @@ -70,11 +71,6 @@ struct xsk_socket_info { bool kernel_needs_wakeup; const struct kxsk_iface *iface; - - /* kresd-specific stuff */ - uv_check_t check_handle; - uv_poll_t poll_handle; - struct session *session; /**< mock session, to minimize kresd changes for now */ }; diff --git a/tests-fuzz/knotd_wrap/udp-handler.c b/tests-fuzz/knotd_wrap/udp-handler.c index ffba8480c11fe023e7b7b732fe38314a03469b1a..a2bdf2941d9814496c4ad99e7556f5ff93e74325 100644 --- a/tests-fuzz/knotd_wrap/udp-handler.c +++ b/tests-fuzz/knotd_wrap/udp-handler.c @@ -86,6 +86,14 @@ static int udp_stdin_send(void *d) return 0; } +static udp_api_t stdin_api = { + udp_stdin_init, + udp_stdin_deinit, + udp_stdin_recv, + udp_stdin_handle, + udp_stdin_send +}; + void udp_master_init_stdio(server_t *server) { log_info("AFL, UDP handler listening on stdin"); @@ -100,9 +108,8 @@ void udp_master_init_stdio(server_t *server) { add_tail(server->ifaces, (node_t *)ifc); - _udp_init = udp_stdin_init; - _udp_recv = udp_stdin_recv; - _udp_handle = udp_stdin_handle; - _udp_send = udp_stdin_send; - _udp_deinit = udp_stdin_deinit; + udp_recvfrom_api = stdin_api; +#ifdef ENABLE_RECVMMSG + udp_recvmmsg_api = stdin_api; +#endif }