diff --git a/Knot.files b/Knot.files index 62df9bdae4b94d340f9a352cbbd08e9ad8393ea8..2925b3b2e36bd45501c6c8af5dbe25dc72bffcf9 100644 --- a/Knot.files +++ b/Knot.files @@ -3,6 +3,9 @@ src/contrib/base32hex.c src/contrib/base32hex.h src/contrib/base64.c src/contrib/base64.h +src/contrib/bpf/bpf_endian.h +src/contrib/bpf/bpf_helpers.h +src/contrib/bpf/parsing_helpers.h src/contrib/ctype.h src/contrib/dnstap/convert.c src/contrib/dnstap/convert.h @@ -362,6 +365,11 @@ src/libknot/tsig-op.h src/libknot/tsig.c src/libknot/tsig.h src/libknot/wire.h +src/libknot/xdp/af_xdp.c +src/libknot/xdp/af_xdp.h +src/libknot/xdp/bpf-kernel.c +src/libknot/xdp/bpf-user.c +src/libknot/xdp/bpf-user.h src/libknot/yparser/yparser.c src/libknot/yparser/yparser.h src/libknot/yparser/ypbody.c diff --git a/src/contrib/Makefile.inc b/src/contrib/Makefile.inc index 0c13ba12e66468c655a970eb399402132ea3a6d8..409cacceb4e97e024c783ff1f99c5223c0540ffb 100644 --- a/src/contrib/Makefile.inc +++ b/src/contrib/Makefile.inc @@ -27,6 +27,9 @@ libcontrib_la_SOURCES = \ contrib/base32hex.h \ contrib/base64.c \ contrib/base64.h \ + contrib/bpf/bpf_endian.h \ + contrib/bpf/bpf_helpers.h \ + contrib/bpf/parsing_helpers.h \ contrib/ctype.h \ contrib/dynarray.h \ contrib/files.c \ diff --git a/src/contrib/bpf/bpf_endian.h b/src/contrib/bpf/bpf_endian.h new file mode 100644 index 0000000000000000000000000000000000000000..2b0ede3d556133801cbcd364b691109329b66228 --- /dev/null +++ b/src/contrib/bpf/bpf_endian.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copied from $(LINUX)/tools/testing/selftests/bpf/bpf_endian.h */ +#ifndef __BPF_ENDIAN__ +#define __BPF_ENDIAN__ + +#include <linux/swab.h> + +/* LLVM's BPF target selects the endianness of the CPU + * it compiles on, or the user specifies (bpfel/bpfeb), + * respectively. The used __BYTE_ORDER__ is defined by + * the compiler, we cannot rely on __BYTE_ORDER from + * libc headers, since it doesn't reflect the actual + * requested byte order. + * + * Note, LLVM's BPF target has different __builtin_bswapX() + * semantics. It does map to BPF_ALU | BPF_END | BPF_TO_BE + * in bpfel and bpfeb case, which means below, that we map + * to cpu_to_be16(). We could use it unconditionally in BPF + * case, but better not rely on it, so that this header here + * can be used from application and BPF program side, which + * use different targets. + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define __bpf_ntohs(x)__builtin_bswap16(x) +# define __bpf_htons(x)__builtin_bswap16(x) +# define __bpf_constant_ntohs(x)___constant_swab16(x) +# define __bpf_constant_htons(x)___constant_swab16(x) +# define __bpf_ntohl(x)__builtin_bswap32(x) +# define __bpf_htonl(x)__builtin_bswap32(x) +# define __bpf_constant_ntohl(x)___constant_swab32(x) +# define __bpf_constant_htonl(x)___constant_swab32(x) +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +# define __bpf_ntohs(x)(x) +# define __bpf_htons(x)(x) +# define __bpf_constant_ntohs(x)(x) +# define __bpf_constant_htons(x)(x) +# define __bpf_ntohl(x)(x) +# define __bpf_htonl(x)(x) +# define __bpf_constant_ntohl(x)(x) +# define __bpf_constant_htonl(x)(x) +#else +# error "Fix your compiler's __BYTE_ORDER__?!" 
+#endif + +#define bpf_htons(x)\ + (__builtin_constant_p(x) ?\ + __bpf_constant_htons(x) : __bpf_htons(x)) +#define bpf_ntohs(x)\ + (__builtin_constant_p(x) ?\ + __bpf_constant_ntohs(x) : __bpf_ntohs(x)) +#define bpf_htonl(x)\ + (__builtin_constant_p(x) ?\ + __bpf_constant_htonl(x) : __bpf_htonl(x)) +#define bpf_ntohl(x)\ + (__builtin_constant_p(x) ?\ + __bpf_constant_ntohl(x) : __bpf_ntohl(x)) + +#endif /* __BPF_ENDIAN__ */ diff --git a/src/contrib/bpf/bpf_helpers.h b/src/contrib/bpf/bpf_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..b34ba5695e1ca2cafcd5f2d0247dbceb42bfade5 --- /dev/null +++ b/src/contrib/bpf/bpf_helpers.h @@ -0,0 +1,420 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copied from $(LINUX)/tools/testing/selftests/bpf/bpf_helpers.h */ + +/* Added to fix compilation on old Ubuntu systems - please preserve when + updating file! */ +#ifndef __always_inline +# define __always_inline inline __attribute__((always_inline)) +#endif + +#ifndef __BPF_HELPERS_H +#define __BPF_HELPERS_H + +/* helper macro to place programs, maps, license in + * different sections in elf_bpf file. Section names + * are interpreted by elf_bpf loader + */ +#define SEC(NAME) __attribute__((section(NAME), used)) + +/* helper functions called from eBPF programs written in C */ +/* Some are only added in later kernel headers, so let's enable them on-demand. */ +static void *(*bpf_map_lookup_elem)(void *map, void *key) = + (void *) BPF_FUNC_map_lookup_elem; +/* +static int (*bpf_map_update_elem)(void *map, void *key, void *value, + unsigned long long flags) = + (void *) BPF_FUNC_map_update_elem; +static int (*bpf_map_delete_elem)(void *map, void *key) = + (void *) BPF_FUNC_map_delete_elem; +static int (*bpf_map_push_elem)(void *map, void *value, + unsigned long long flags) = + (void *) BPF_FUNC_map_push_elem; +static int (*bpf_map_pop_elem)(void *map, void *value) = + (void *) BPF_FUNC_map_pop_elem; +static int (*bpf_map_peek_elem)(void *map, void *value) = + (void *) BPF_FUNC_map_peek_elem; +static int (*bpf_probe_read)(void *dst, int size, void *unsafe_ptr) = + (void *) BPF_FUNC_probe_read; +static unsigned long long (*bpf_ktime_get_ns)(void) = + (void *) BPF_FUNC_ktime_get_ns; +static int (*bpf_trace_printk)(const char *fmt, int fmt_size, ...) 
= + (void *) BPF_FUNC_trace_printk; +static void (*bpf_tail_call)(void *ctx, void *map, int index) = + (void *) BPF_FUNC_tail_call; +static unsigned long long (*bpf_get_smp_processor_id)(void) = + (void *) BPF_FUNC_get_smp_processor_id; +static unsigned long long (*bpf_get_current_pid_tgid)(void) = + (void *) BPF_FUNC_get_current_pid_tgid; +static unsigned long long (*bpf_get_current_uid_gid)(void) = + (void *) BPF_FUNC_get_current_uid_gid; +static int (*bpf_get_current_comm)(void *buf, int buf_size) = + (void *) BPF_FUNC_get_current_comm; +static unsigned long long (*bpf_perf_event_read)(void *map, + unsigned long long flags) = + (void *) BPF_FUNC_perf_event_read; +static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) = + (void *) BPF_FUNC_clone_redirect; +static int (*bpf_redirect)(int ifindex, int flags) = + (void *) BPF_FUNC_redirect; +*/ +static int (*bpf_redirect_map)(void *map, int key, int flags) = + (void *) BPF_FUNC_redirect_map; +/* +static int (*bpf_perf_event_output)(void *ctx, void *map, + unsigned long long flags, void *data, + int size) = + (void *) BPF_FUNC_perf_event_output; +static int (*bpf_get_stackid)(void *ctx, void *map, int flags) = + (void *) BPF_FUNC_get_stackid; +static int (*bpf_probe_write_user)(void *dst, void *src, int size) = + (void *) BPF_FUNC_probe_write_user; +static int (*bpf_current_task_under_cgroup)(void *map, int index) = + (void *) BPF_FUNC_current_task_under_cgroup; +static int (*bpf_skb_get_tunnel_key)(void *ctx, void *key, int size, int flags) = + (void *) BPF_FUNC_skb_get_tunnel_key; +static int (*bpf_skb_set_tunnel_key)(void *ctx, void *key, int size, int flags) = + (void *) BPF_FUNC_skb_set_tunnel_key; +static int (*bpf_skb_get_tunnel_opt)(void *ctx, void *md, int size) = + (void *) BPF_FUNC_skb_get_tunnel_opt; +static int (*bpf_skb_set_tunnel_opt)(void *ctx, void *md, int size) = + (void *) BPF_FUNC_skb_set_tunnel_opt; +static unsigned long long (*bpf_get_prandom_u32)(void) = + (void *) BPF_FUNC_get_prandom_u32; +static int (*bpf_xdp_adjust_head)(void *ctx, int offset) = + (void *) BPF_FUNC_xdp_adjust_head; +static int (*bpf_xdp_adjust_meta)(void *ctx, int offset) = + (void *) BPF_FUNC_xdp_adjust_meta; +static int (*bpf_get_socket_cookie)(void *ctx) = + (void *) BPF_FUNC_get_socket_cookie; +static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, + int optlen) = + (void *) BPF_FUNC_setsockopt; +static int (*bpf_getsockopt)(void *ctx, int level, int optname, void *optval, + int optlen) = + (void *) BPF_FUNC_getsockopt; +static int (*bpf_sock_ops_cb_flags_set)(void *ctx, int flags) = + (void *) BPF_FUNC_sock_ops_cb_flags_set; +static int (*bpf_sk_redirect_map)(void *ctx, void *map, int key, int flags) = + (void *) BPF_FUNC_sk_redirect_map; +static int (*bpf_sk_redirect_hash)(void *ctx, void *map, void *key, int flags) = + (void *) BPF_FUNC_sk_redirect_hash; +static int (*bpf_sock_map_update)(void *map, void *key, void *value, + unsigned long long flags) = + (void *) BPF_FUNC_sock_map_update; +static int (*bpf_sock_hash_update)(void *map, void *key, void *value, + unsigned long long flags) = + (void *) BPF_FUNC_sock_hash_update; +static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags, + void *buf, unsigned int buf_size) = + (void *) BPF_FUNC_perf_event_read_value; +static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, + unsigned int buf_size) = + (void *) BPF_FUNC_perf_prog_read_value; +static int (*bpf_override_return)(void *ctx, unsigned long rc) = + (void *) 
BPF_FUNC_override_return; +static int (*bpf_msg_redirect_map)(void *ctx, void *map, int key, int flags) = + (void *) BPF_FUNC_msg_redirect_map; +static int (*bpf_msg_redirect_hash)(void *ctx, + void *map, void *key, int flags) = + (void *) BPF_FUNC_msg_redirect_hash; +static int (*bpf_msg_apply_bytes)(void *ctx, int len) = + (void *) BPF_FUNC_msg_apply_bytes; +static int (*bpf_msg_cork_bytes)(void *ctx, int len) = + (void *) BPF_FUNC_msg_cork_bytes; +static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) = + (void *) BPF_FUNC_msg_pull_data; +static int (*bpf_msg_push_data)(void *ctx, int start, int end, int flags) = + (void *) BPF_FUNC_msg_push_data; +static int (*bpf_msg_pop_data)(void *ctx, int start, int cut, int flags) = + (void *) BPF_FUNC_msg_pop_data; +static int (*bpf_bind)(void *ctx, void *addr, int addr_len) = + (void *) BPF_FUNC_bind; +static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) = + (void *) BPF_FUNC_xdp_adjust_tail; +static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state, + int size, int flags) = + (void *) BPF_FUNC_skb_get_xfrm_state; +static int (*bpf_sk_select_reuseport)(void *ctx, void *map, void *key, __u32 flags) = + (void *) BPF_FUNC_sk_select_reuseport; +static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) = + (void *) BPF_FUNC_get_stack; +static int (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, + int plen, __u32 flags) = + (void *) BPF_FUNC_fib_lookup; +static int (*bpf_lwt_push_encap)(void *ctx, unsigned int type, void *hdr, + unsigned int len) = + (void *) BPF_FUNC_lwt_push_encap; +static int (*bpf_lwt_seg6_store_bytes)(void *ctx, unsigned int offset, + void *from, unsigned int len) = + (void *) BPF_FUNC_lwt_seg6_store_bytes; +static int (*bpf_lwt_seg6_action)(void *ctx, unsigned int action, void *param, + unsigned int param_len) = + (void *) BPF_FUNC_lwt_seg6_action; +static int (*bpf_lwt_seg6_adjust_srh)(void *ctx, unsigned int offset, + unsigned int len) = + (void *) BPF_FUNC_lwt_seg6_adjust_srh; +static int (*bpf_rc_repeat)(void *ctx) = + (void *) BPF_FUNC_rc_repeat; +static int (*bpf_rc_keydown)(void *ctx, unsigned int protocol, + unsigned long long scancode, unsigned int toggle) = + (void *) BPF_FUNC_rc_keydown; +static unsigned long long (*bpf_get_current_cgroup_id)(void) = + (void *) BPF_FUNC_get_current_cgroup_id; +static void *(*bpf_get_local_storage)(void *map, unsigned long long flags) = + (void *) BPF_FUNC_get_local_storage; +static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) = + (void *) BPF_FUNC_skb_cgroup_id; +static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = + (void *) BPF_FUNC_skb_ancestor_cgroup_id; +static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned long long netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_sk_lookup_tcp; +static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned long long netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_sk_lookup_udp; +static int (*bpf_sk_release)(struct bpf_sock *sk) = + (void *) BPF_FUNC_sk_release; +static int (*bpf_skb_vlan_push)(void *ctx, __be16 vlan_proto, __u16 vlan_tci) = + (void *) BPF_FUNC_skb_vlan_push; +static int (*bpf_skb_vlan_pop)(void *ctx) = + (void *) BPF_FUNC_skb_vlan_pop; +static int (*bpf_rc_pointer_rel)(void *ctx, int rel_x, int rel_y) = + (void *) BPF_FUNC_rc_pointer_rel; +static void (*bpf_spin_lock)(struct bpf_spin_lock *lock) = + (void *) 
BPF_FUNC_spin_lock; +static void (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = + (void *) BPF_FUNC_spin_unlock; +static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_sk_fullsock; +static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_tcp_sock; +static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = + (void *) BPF_FUNC_get_listener_sock; +static int (*bpf_skb_ecn_set_ce)(void *ctx) = + (void *) BPF_FUNC_skb_ecn_set_ce; +*/ + +/* llvm builtin functions that eBPF C program may use to + * emit BPF_LD_ABS and BPF_LD_IND instructions + */ +struct sk_buff; +unsigned long long load_byte(void *skb, + unsigned long long off) asm("llvm.bpf.load.byte"); +unsigned long long load_half(void *skb, + unsigned long long off) asm("llvm.bpf.load.half"); +unsigned long long load_word(void *skb, + unsigned long long off) asm("llvm.bpf.load.word"); + +/* a helper structure used by eBPF C program + * to describe map attributes to elf_bpf loader + */ +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; + unsigned int map_flags; + unsigned int inner_map_idx; + unsigned int numa_node; +}; + +#define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \ + struct ____btf_map_##name { \ + type_key key; \ + type_val value; \ + }; \ + struct ____btf_map_##name \ + __attribute__ ((section(".maps." #name), used)) \ + ____btf_map_##name = { } + +static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = + (void *) BPF_FUNC_skb_load_bytes; +static int (*bpf_skb_load_bytes_relative)(void *ctx, int off, void *to, int len, __u32 start_header) = + (void *) BPF_FUNC_skb_load_bytes_relative; +static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = + (void *) BPF_FUNC_skb_store_bytes; +static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flags) = + (void *) BPF_FUNC_l3_csum_replace; +static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) = + (void *) BPF_FUNC_l4_csum_replace; +static int (*bpf_csum_diff)(void *from, int from_size, void *to, int to_size, int seed) = + (void *) BPF_FUNC_csum_diff; +static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) = + (void *) BPF_FUNC_skb_under_cgroup; +static int (*bpf_skb_change_head)(void *, int len, int flags) = + (void *) BPF_FUNC_skb_change_head; +static int (*bpf_skb_pull_data)(void *, int len) = + (void *) BPF_FUNC_skb_pull_data; +static unsigned int (*bpf_get_cgroup_classid)(void *ctx) = + (void *) BPF_FUNC_get_cgroup_classid; +static unsigned int (*bpf_get_route_realm)(void *ctx) = + (void *) BPF_FUNC_get_route_realm; +static int (*bpf_skb_change_proto)(void *ctx, __be16 proto, __u64 flags) = + (void *) BPF_FUNC_skb_change_proto; +static int (*bpf_skb_change_type)(void *ctx, __u32 type) = + (void *) BPF_FUNC_skb_change_type; +static unsigned int (*bpf_get_hash_recalc)(void *ctx) = + (void *) BPF_FUNC_get_hash_recalc; +static unsigned long long (*bpf_get_current_task)(void *ctx) = + (void *) BPF_FUNC_get_current_task; +static int (*bpf_skb_change_tail)(void *ctx, __u32 len, __u64 flags) = + (void *) BPF_FUNC_skb_change_tail; +static long long (*bpf_csum_update)(void *ctx, __u32 csum) = + (void *) BPF_FUNC_csum_update; +static void (*bpf_set_hash_invalid)(void *ctx) = + (void *) BPF_FUNC_set_hash_invalid; +static int (*bpf_get_numa_node_id)(void) = + (void *) BPF_FUNC_get_numa_node_id; +static int (*bpf_probe_read_str)(void *ctx, 
__u32 size, + const void *unsafe_ptr) = + (void *) BPF_FUNC_probe_read_str; +static unsigned int (*bpf_get_socket_uid)(void *ctx) = + (void *) BPF_FUNC_get_socket_uid; +static unsigned int (*bpf_set_hash)(void *ctx, __u32 hash) = + (void *) BPF_FUNC_set_hash; +static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode, + unsigned long long flags) = + (void *) BPF_FUNC_skb_adjust_room; + +/* Scan the ARCH passed in from ARCH env variable (see Makefile) */ +#if defined(__TARGET_ARCH_x86) + #define bpf_target_x86 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_s930x) + #define bpf_target_s930x + #define bpf_target_defined +#elif defined(__TARGET_ARCH_arm64) + #define bpf_target_arm64 + #define bpf_target_defined +#elif defined(__TARGET_ARCH_mips) + #define bpf_target_mips + #define bpf_target_defined +#elif defined(__TARGET_ARCH_powerpc) + #define bpf_target_powerpc + #define bpf_target_defined +#elif defined(__TARGET_ARCH_sparc) + #define bpf_target_sparc + #define bpf_target_defined +#else + #undef bpf_target_defined +#endif + +/* Fall back to what the compiler says */ +#ifndef bpf_target_defined +#if defined(__x86_64__) + #define bpf_target_x86 +#elif defined(__s390x__) + #define bpf_target_s930x +#elif defined(__aarch64__) + #define bpf_target_arm64 +#elif defined(__mips__) + #define bpf_target_mips +#elif defined(__powerpc__) + #define bpf_target_powerpc +#elif defined(__sparc__) + #define bpf_target_sparc +#endif +#endif + +#if defined(bpf_target_x86) + +#define PT_REGS_PARM1(x) ((x)->di) +#define PT_REGS_PARM2(x) ((x)->si) +#define PT_REGS_PARM3(x) ((x)->dx) +#define PT_REGS_PARM4(x) ((x)->cx) +#define PT_REGS_PARM5(x) ((x)->r8) +#define PT_REGS_RET(x) ((x)->sp) +#define PT_REGS_FP(x) ((x)->bp) +#define PT_REGS_RC(x) ((x)->ax) +#define PT_REGS_SP(x) ((x)->sp) +#define PT_REGS_IP(x) ((x)->ip) + +#elif defined(bpf_target_s390x) + +#define PT_REGS_PARM1(x) ((x)->gprs[2]) +#define PT_REGS_PARM2(x) ((x)->gprs[3]) +#define PT_REGS_PARM3(x) ((x)->gprs[4]) +#define PT_REGS_PARM4(x) ((x)->gprs[5]) +#define PT_REGS_PARM5(x) ((x)->gprs[6]) +#define PT_REGS_RET(x) ((x)->gprs[14]) +#define PT_REGS_FP(x) ((x)->gprs[11]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->gprs[2]) +#define PT_REGS_SP(x) ((x)->gprs[15]) +#define PT_REGS_IP(x) ((x)->psw.addr) + +#elif defined(bpf_target_arm64) + +#define PT_REGS_PARM1(x) ((x)->regs[0]) +#define PT_REGS_PARM2(x) ((x)->regs[1]) +#define PT_REGS_PARM3(x) ((x)->regs[2]) +#define PT_REGS_PARM4(x) ((x)->regs[3]) +#define PT_REGS_PARM5(x) ((x)->regs[4]) +#define PT_REGS_RET(x) ((x)->regs[30]) +#define PT_REGS_FP(x) ((x)->regs[29]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->regs[0]) +#define PT_REGS_SP(x) ((x)->sp) +#define PT_REGS_IP(x) ((x)->pc) + +#elif defined(bpf_target_mips) + +#define PT_REGS_PARM1(x) ((x)->regs[4]) +#define PT_REGS_PARM2(x) ((x)->regs[5]) +#define PT_REGS_PARM3(x) ((x)->regs[6]) +#define PT_REGS_PARM4(x) ((x)->regs[7]) +#define PT_REGS_PARM5(x) ((x)->regs[8]) +#define PT_REGS_RET(x) ((x)->regs[31]) +#define PT_REGS_FP(x) ((x)->regs[30]) /* Works only with CONFIG_FRAME_POINTER */ +#define PT_REGS_RC(x) ((x)->regs[1]) +#define PT_REGS_SP(x) ((x)->regs[29]) +#define PT_REGS_IP(x) ((x)->cp0_epc) + +#elif defined(bpf_target_powerpc) + +#define PT_REGS_PARM1(x) ((x)->gpr[3]) +#define PT_REGS_PARM2(x) ((x)->gpr[4]) +#define PT_REGS_PARM3(x) ((x)->gpr[5]) +#define PT_REGS_PARM4(x) ((x)->gpr[6]) +#define PT_REGS_PARM5(x) ((x)->gpr[7]) +#define PT_REGS_RC(x) ((x)->gpr[3]) 
+#define PT_REGS_SP(x) ((x)->sp) +#define PT_REGS_IP(x) ((x)->nip) + +#elif defined(bpf_target_sparc) + +#define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0]) +#define PT_REGS_PARM2(x) ((x)->u_regs[UREG_I1]) +#define PT_REGS_PARM3(x) ((x)->u_regs[UREG_I2]) +#define PT_REGS_PARM4(x) ((x)->u_regs[UREG_I3]) +#define PT_REGS_PARM5(x) ((x)->u_regs[UREG_I4]) +#define PT_REGS_RET(x) ((x)->u_regs[UREG_I7]) +#define PT_REGS_RC(x) ((x)->u_regs[UREG_I0]) +#define PT_REGS_SP(x) ((x)->u_regs[UREG_FP]) + +/* Should this also be a bpf_target check for the sparc case? */ +#if defined(__arch64__) +#define PT_REGS_IP(x) ((x)->tpc) +#else +#define PT_REGS_IP(x) ((x)->pc) +#endif + +#endif + +#ifdef bpf_target_powerpc +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP +#elif bpf_target_sparc +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP +#else +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ \ + bpf_probe_read(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) +#define BPF_KRETPROBE_READ_RET_IP(ip, ctx) ({ \ + bpf_probe_read(&(ip), sizeof(ip), \ + (void *)(PT_REGS_FP(ctx) + sizeof(ip))); }) +#endif + +#endif diff --git a/src/contrib/bpf/parsing_helpers.h b/src/contrib/bpf/parsing_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..7bd2764f585db1455dfa03445621333b279ef7a2 --- /dev/null +++ b/src/contrib/bpf/parsing_helpers.h @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This file contains parsing functions that are used in the packetXX XDP + * programs. The functions are marked as __always_inline, and fully defined in + * this header file to be included in the BPF program. + * + * Each helper parses a packet header, including doing bounds checking, and + * returns the type of its contents if successful, and -1 otherwise. + * + * For Ethernet and IP headers, the content type is the type of the payload + * (h_proto for Ethernet, nexthdr for IPv6), for ICMP it is the ICMP type field. + * All return values are in host byte order. + * + * The versions of the functions included here are slightly expanded versions of + * the functions in the packet01 lesson. For instance, the Ethernet header + * parsing has support for parsing VLAN tags. + */ + +#ifndef __PARSING_HELPERS_H +#define __PARSING_HELPERS_H + +#include <stddef.h> +#include <linux/if_ether.h> +#include <linux/if_packet.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/udp.h> +#include <linux/tcp.h> + +/* Header cursor to keep track of current parsing position */ +struct hdr_cursor { + void *pos; +}; + +/* + * struct vlan_hdr - vlan header + * @h_vlan_TCI: priority and VLAN ID + * @h_vlan_encapsulated_proto: packet type ID or len + */ +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +/* + * Struct icmphdr_common represents the common part of the icmphdr and icmp6hdr + * structures. 
+ */ +struct icmphdr_common { + __u8 type; + __u8 code; + __sum16 cksum; +}; + +/* Allow users of header file to redefine VLAN max depth */ +#ifndef VLAN_MAX_DEPTH +#define VLAN_MAX_DEPTH 4 +#endif + +static __always_inline int proto_is_vlan(__u16 h_proto) +{ + return !!(h_proto == bpf_htons(ETH_P_8021Q) || + h_proto == bpf_htons(ETH_P_8021AD)); +} + +/* Notice, parse_ethhdr() will skip VLAN tags, by advancing nh->pos and returns + * next header EtherType, BUT the ethhdr pointer supplied still points to the + * Ethernet header. Thus, caller can look at eth->h_proto to see if this was a + * VLAN tagged packet. + */ +static __always_inline int parse_ethhdr(struct hdr_cursor *nh, void *data_end, + struct ethhdr **ethhdr) +{ + struct ethhdr *eth = nh->pos; + int hdrsize = sizeof(*eth); + struct vlan_hdr *vlh; + __u16 h_proto; + int i; + + /* Byte-count bounds check; check if current pointer + size of header + * is after data_end. + */ + if (nh->pos + hdrsize > data_end) + return -1; + + nh->pos += hdrsize; + *ethhdr = eth; + vlh = nh->pos; + h_proto = eth->h_proto; + + /* Use loop unrolling to avoid the verifier restriction on loops; + * support up to VLAN_MAX_DEPTH layers of VLAN encapsulation. + */ + #pragma unroll + for (i = 0; i < VLAN_MAX_DEPTH; i++) { + if (!proto_is_vlan(h_proto)) + break; + + if (vlh + 1 > data_end) + break; + + h_proto = vlh->h_vlan_encapsulated_proto; + vlh++; + } + + nh->pos = vlh; + return bpf_ntohs(h_proto); +} + +static __always_inline int parse_ip6hdr(struct hdr_cursor *nh, + void *data_end, + struct ipv6hdr **ip6hdr) +{ + struct ipv6hdr *ip6h = nh->pos; + + /* Pointer-arithmetic bounds check; pointer +1 points to after end of + * thing being pointed to. We will be using this style in the remainder + * of the tutorial. + */ + if (ip6h + 1 > data_end) + return -1; + + nh->pos = ip6h + 1; + *ip6hdr = ip6h; + + return ip6h->nexthdr; +} + +static __always_inline int parse_iphdr(struct hdr_cursor *nh, + void *data_end, + struct iphdr **iphdr) +{ + struct iphdr *iph = nh->pos; + int hdrsize; + + if (iph + 1 > data_end) + return -1; + + hdrsize = iph->ihl * 4; + + /* Variable-length IPv4 header, need to use byte-based arithmetic */ + if (nh->pos + hdrsize > data_end) + return -1; + + nh->pos += hdrsize; + *iphdr = iph; + + return iph->protocol; +} + +static __always_inline int parse_icmp6hdr(struct hdr_cursor *nh, + void *data_end, + struct icmp6hdr **icmp6hdr) +{ + struct icmp6hdr *icmp6h = nh->pos; + + if (icmp6h + 1 > data_end) + return -1; + + nh->pos = icmp6h + 1; + *icmp6hdr = icmp6h; + + return icmp6h->icmp6_type; +} + +static __always_inline int parse_icmphdr(struct hdr_cursor *nh, + void *data_end, + struct icmphdr **icmphdr) +{ + struct icmphdr *icmph = nh->pos; + + if (icmph + 1 > data_end) + return -1; + + nh->pos = icmph + 1; + *icmphdr = icmph; + + return icmph->type; +} + +static __always_inline int parse_icmphdr_common(struct hdr_cursor *nh, + void *data_end, + struct icmphdr_common **icmphdr) +{ + struct icmphdr_common *h = nh->pos; + + if (h + 1 > data_end) + return -1; + + nh->pos = h + 1; + *icmphdr = h; + + return h->type; +} + +/* + * parse_tcphdr: parse the udp header and return the length of the udp payload + */ +static __always_inline int parse_udphdr(struct hdr_cursor *nh, + void *data_end, + struct udphdr **udphdr) +{ + int len; + struct udphdr *h = nh->pos; + + if (h + 1 > data_end) + return -1; + + nh->pos = h + 1; + *udphdr = h; + + len = bpf_ntohs(h->len) - sizeof(struct udphdr); + if (len < 0) + return -1; + + return len; +} + +/* + * 
parse_tcphdr: parse and return the length of the tcp header + */ +static __always_inline int parse_tcphdr(struct hdr_cursor *nh, + void *data_end, + struct tcphdr **tcphdr) +{ + int len; + struct tcphdr *h = nh->pos; + + if (h + 1 > data_end) + return -1; + + len = h->doff * 4; + if ((void *) h + len > data_end) + return -1; + + nh->pos = h + 1; + *tcphdr = h; + + return len; +} + +#endif /* __PARSING_HELPERS_H */ diff --git a/src/libknot/Makefile.inc b/src/libknot/Makefile.inc index bc47e8404dc2bd153a9853f7f6a814b08ee08b3a..8fe9cf4280bae403272f62427aac85131fa1ab72 100644 --- a/src/libknot/Makefile.inc +++ b/src/libknot/Makefile.inc @@ -45,6 +45,8 @@ nobase_include_libknot_HEADERS = \ libknot/tsig-op.h \ libknot/tsig.h \ libknot/wire.h \ + libknot/xdp/af_xdp.h \ + libknot/xdp/bpf-user.h \ libknot/yparser/yparser.h \ libknot/yparser/ypformat.h \ libknot/yparser/ypschema.h \ @@ -70,6 +72,9 @@ libknot_la_SOURCES = \ libknot/rrtype/tsig.c \ libknot/tsig-op.c \ libknot/tsig.c \ + libknot/xdp/af_xdp.c \ + libknot/xdp/bpf-kernel.c \ + libknot/xdp/bpf-user.c \ libknot/yparser/yparser.c \ libknot/yparser/ypbody.c \ libknot/yparser/ypformat.c \ diff --git a/src/libknot/xdp/af_xdp.c b/src/libknot/xdp/af_xdp.c new file mode 100644 index 0000000000000000000000000000000000000000..2a14943586cbf594277b5c4aea7ef93355a34ff9 --- /dev/null +++ b/src/libknot/xdp/af_xdp.c @@ -0,0 +1,687 @@ +/* LATER: + * - XDP_USE_NEED_WAKEUP (optimization discussed in summer 2019) + */ + + + +#include "daemon/af_xdp.h" + + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + + +#ifdef KR_XDP_ETH_CRC +#include <zlib.h> +#endif + +#include <byteswap.h> + +#include <arpa/inet.h> +#include <netinet/in.h> +#include <linux/if_link.h> +#include <linux/filter.h> +//#include <linux/icmpv6.h> + +#include "contrib/ucw/lib.h" +#include "contrib/ucw/mempool.h" + +#include "lib/resolve.h" +#include "daemon/session.h" +#include "daemon/worker.h" + + +#include "daemon/kxsk/impl.h" + +// placate libclang :-/ +typedef uint64_t size_t; + +#define FRAME_SIZE 4096 +#define RX_BATCH_SIZE 64 + +/** The memory layout of each umem frame. */ +struct umem_frame { + union { uint8_t bytes[FRAME_SIZE]; struct { + + struct qr_task *task; + struct udpv4 udpv4; + + }; }; +}; + + +struct xsk_socket_info *the_socket = NULL; +struct config *the_config = NULL; + +/** Swap two bytes as a *constant* expression. ATM we assume we're LE, i.e. we do need to swap. */ +#define BS16(n) (((n) >> 8) + (((n) & 0xff) << 8)) +#define BS32 bswap_32 + +static struct xsk_umem_info *configure_xsk_umem(const struct xsk_umem_config *umem_config, + uint32_t frame_count) +{ + struct xsk_umem_info *umem = calloc(1, sizeof(*umem)); + if (!umem) return NULL; + + /* Allocate memory for the frames, aligned to a page boundary. */ + umem->frame_count = frame_count; + errno = posix_memalign((void **)&umem->frames, getpagesize(), FRAME_SIZE * frame_count); + if (errno) goto failed; + /* Initialize our "frame allocator". */ + umem->free_indices = malloc(frame_count * sizeof(umem->free_indices[0])); + if (!umem->free_indices) goto failed; + umem->free_count = frame_count; + for (uint32_t i = 0; i < frame_count; ++i) + umem->free_indices[i] = i; + + // NOTE: we don't need a fill queue (fq), but the API won't allow us to call + // with NULL - perhaps it doesn't matter that we don't utilize it later. 
+ errno = -xsk_umem__create(&umem->umem, umem->frames, FRAME_SIZE * frame_count, + &umem->fq, &umem->cq, umem_config); + if (errno) goto failed; + + return umem; +failed: + free(umem->free_indices); + free(umem->frames); + free(umem); + return NULL; +} + +static struct umem_frame *xsk_alloc_umem_frame(struct xsk_umem_info *umem) // TODO: confusing to use xsk_ +{ + if (unlikely(umem->free_count == 0)) { + fprintf(stderr, "[uxsk] no free frame!\n"); + return NULL; + } + uint32_t index = umem->free_indices[--umem->free_count]; + //kr_log_verbose("[uxsk] allocating frame %d\n", (int)index); + #ifndef NDEBUG + umem->free_indices[umem->free_count] = -1; + #endif + return umem->frames + index; +} +void *kr_xsk_alloc_wire(uint16_t *maxlen) +{ + struct umem_frame *uframe = xsk_alloc_umem_frame(the_socket->umem); + if (!uframe) return NULL; + *maxlen = MIN(UINT16_MAX, FRAME_SIZE - offsetof(struct umem_frame, udpv4.data) + - 4/*eth CRC*/); + return uframe->udpv4.data; +} + +static void xsk_dealloc_umem_frame(struct xsk_umem_info *umem, uint8_t *uframe_p) +// TODO: confusing to use xsk_ +{ + assert(umem->free_count < umem->frame_count); + ptrdiff_t diff = uframe_p - umem->frames->bytes; + size_t index = diff / FRAME_SIZE; + assert(index < umem->frame_count); + umem->free_indices[umem->free_count++] = index; +} + +void kr_xsk_deinit_global(void) +{ + if (!the_socket) + return; + kxsk_socket_stop(the_socket->iface, the_config->xsk_if_queue); + xsk_socket__delete(the_socket->xsk); + xsk_umem__delete(the_socket->umem->umem); + + kxsk_iface_free((struct kxsk_iface *)/*const-cast*/the_socket->iface, false); + //TODO: more memory +} + +/** Add some free frames into the RX fill queue (possibly zero, etc.) */ +int kxsk_umem_refill(const struct config *cfg, struct xsk_umem_info *umem) +{ + /* First find to_reserve: how many frames to move to the RX fill queue. + * Let's keep about as many frames ready for TX (free_count) as for RX (fq_ready), + * and don't fill the queue to more than a half. */ + const int fq_target = cfg->umem.fill_size / 2; + uint32_t fq_free = xsk_prod_nb_free(&umem->fq, fq_target); + if (fq_free <= fq_target) + return 0; + const int fq_ready = cfg->umem.fill_size - fq_free; + const int balance = (fq_ready + umem->free_count) / 2; + const int fq_want = MIN(balance, fq_target); // don't overshoot the target + const int to_reserve = fq_want - fq_ready; + kr_log_verbose("[uxsk] refilling %d frames TX->RX; TX = %d, RX = %d\n", + to_reserve, (int)umem->free_count, (int)fq_ready); + if (to_reserve <= 0) + return 0; + + /* Now really reserve the frames. */ + uint32_t idx; + int ret = xsk_ring_prod__reserve(&umem->fq, to_reserve, &idx); + if (ret != to_reserve) { + assert(false); + return ENOSPC; + } + for (int i = 0; i < to_reserve; ++i, ++idx) { + struct umem_frame *uframe = xsk_alloc_umem_frame(umem); + if (!uframe) { + assert(false); + return ENOSPC; + } + size_t offset = uframe->bytes - umem->frames->bytes; + *xsk_ring_prod__fill_addr(&umem->fq, idx) = offset; + } + xsk_ring_prod__submit(&umem->fq, to_reserve); + return 0; +} + +static struct xsk_socket_info * xsk_configure_socket(struct config *cfg, + struct xsk_umem_info *umem, const struct kxsk_iface *iface) +{ + /* Put a couple RX buffers into the fill queue. + * Even if we don't need them, it silences a dmesg line, + * and it avoids 100% CPU usage of ksoftirqd/i for each queue i! 
+ */ + errno = kxsk_umem_refill(cfg, umem); + if (errno) + return NULL; + + struct xsk_socket_info *xsk_info = calloc(1, sizeof(*xsk_info)); + if (!xsk_info) + return NULL; + xsk_info->iface = iface; + xsk_info->umem = umem; + + assert(cfg->xsk.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD); + errno = xsk_socket__create(&xsk_info->xsk, iface->ifname, + cfg->xsk_if_queue, umem->umem, &xsk_info->rx, + &xsk_info->tx, &cfg->xsk); + + return xsk_info; +} + + + +/* Two helper functions taken from Linux kernel 5.2, slightly modified. */ +static inline uint32_t from64to32(uint64_t x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. */ + x = (x & 0xffffffff) + (x >> 32); + return (uint32_t)x; +} +static inline uint16_t from32to16(uint32_t sum) +{ + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return sum; +} +/** Compute the checksum of the IPv4 header. + * + * Slightly inspired by Linux 5.2 csum_tcpudp_* and friends. + * This version only works on little endian; the result is in BE/network order. + * + * FIXME: this is wrong, apparently; use *_2() at least for now. + */ +static __be16 pkt_ipv4_checksum(const struct iphdr *h) +{ + int64_t s = 0; + s += (h->ihl << 8) + (h->version << 12) + h->tos; + s += (h->tot_len + h->id + h->frag_off) << 8; + s += (h->ttl << 8) + h->protocol; + s += h->saddr; + s += h->daddr; + uint16_t res_le = ~from32to16(from64to32(s)); + return BS16(res_le); +} +static void test_pkt_ipv4_checksum() +{ + // https://en.wikipedia.org/wiki/IPv4_header_checksum#Calculating_the_IPv4_header_checksum + const struct iphdr h1 = { + .version = 4, + .ihl = 5, + .tos = 0, + .tot_len = BS16(0x73), + .id = BS16(0), + .frag_off = BS16(0x4000), + .ttl = 0x40, + .protocol = 0x11, // UDP + .check = 0, // unused + .saddr = 0xc0a80001, + .daddr = 0xc0a800c7, + }; + const uint16_t c1 = 0xb861; + + uint16_t cc1 = BS16(pkt_ipv4_checksum(&h1)); // we work in native order here + if (cc1 == c1) + fprintf(stderr, "OK\n"); + else + fprintf(stderr, "0x%x != 0x%x\n", cc1, c1); +} + +static __be16 pkt_ipv4_checksum_2(const struct iphdr *h) +{ + const uint16_t *ha = (const uint16_t *)h; + uint32_t sum32 = 0; + for (int i = 0; i < 10; ++i) + if (i != 5) + sum32 += BS16(ha[i]); + return ~BS16(from32to16(sum32)); +} + +static void pkt_fill_headers(struct udpv4 *dst, struct udpv4 *template, int data_len) +{ + memcpy(dst, template, sizeof(*template)); + + const uint16_t udp_len = sizeof(dst->udp) + data_len; + dst->udp.len = BS16(udp_len); + + assert(dst->ipv4.ihl == 5); // header length 20 + dst->ipv4.tot_len = BS16(20 + udp_len); + dst->ipv4.check = pkt_ipv4_checksum_2(&dst->ipv4); + + // Ethernet checksum not needed, apparently. +#ifdef KR_XDP_ETH_CRC + /* Finally CRC32 over the whole ethernet frame; we use zlib here. 
*/ + uLong eth_crc = crc32(0L, Z_NULL, 0); + eth_crc = crc32(eth_crc, (const void *)dst, offsetof(struct udpv4, data) + data_len); + uint32_t eth_crc_be = BS32(eth_crc); + memcpy(dst->data + data_len, &eth_crc_be, sizeof(eth_crc_be)); + + return; // code below is broken/wrong, probably +#ifndef NDEBUG + fprintf(stderr, "%x\n", (uint32_t)eth_crc); + eth_crc = crc32(eth_crc, (const void *)&dst->data[data_len], 4); + fprintf(stderr, "%x\n", (uint32_t)eth_crc); + eth_crc = crc32(0L, Z_NULL, 0); + eth_crc = crc32(eth_crc, (const void *)dst, offsetof(struct udpv4, data) + data_len + 4); + fprintf(stderr, "%x\n", (uint32_t)eth_crc); + assert(eth_crc == 0xC704DD7B); +#endif +#endif +} + +static void pkt_send(struct xsk_socket_info *xsk, uint64_t addr, uint32_t len) +{ + uint32_t tx_idx; + int ret = xsk_ring_prod__reserve(&xsk->tx, 1, &tx_idx); + if (unlikely(ret != 1)) { + fprintf(stderr, "No more transmit slots, dropping the packet\n"); + return; + } + + *xsk_ring_prod__tx_desc(&xsk->tx, tx_idx) = (struct xdp_desc){ + .addr = addr, + .len = len, + }; + xsk_ring_prod__submit(&xsk->tx, 1); + xsk->kernel_needs_wakeup = true; +} +void kr_xsk_push(const struct sockaddr *src, const struct sockaddr *dst, + struct kr_request *req, struct qr_task *task, uint8_t eth_addrs[2][6]) +{ + kr_log_verbose("[uxsk] pushing a packet\n"); + assert(src->sa_family == AF_INET && dst->sa_family == AF_INET); + uint8_t *uframe_p = req->answer->wire - offsetof(struct umem_frame, udpv4.data); + const uint8_t *umem_mem_start = the_socket->umem->frames->bytes; + #ifndef NDEBUG + assert((uframe_p - (uint8_t *)NULL) % FRAME_SIZE == 0); + size_t offset = uframe_p - umem_mem_start; + assert(offset / FRAME_SIZE < the_socket->umem->frame_count); + #endif + struct umem_frame *uframe = (struct umem_frame *)uframe_p; + uframe->task = task; + + + + // Filling headers; testing version in pkt_fill_headers() + + // sockaddr* contents is already in network byte order + const struct sockaddr_in *src_v4 = (const struct sockaddr_in *)src; + const struct sockaddr_in *dst_v4 = (const struct sockaddr_in *)dst; + + const struct udpv4 *t = &the_config->pkt_template; + struct udpv4 *h = &uframe->udpv4; + + // UDP: struct udphdr + const uint16_t udp_len = sizeof(h->udp) + req->answer->size; + h->udp.len = BS16(udp_len); + h->udp.source = src_v4->sin_port; + h->udp.dest = dst_v4->sin_port; + h->udp.check = 0; + + // IPv4: struct iphdr + h->ipv4.ihl = t->ipv4.ihl; + h->ipv4.version = t->ipv4.version; + h->ipv4.tos = t->ipv4.tos; + assert(h->ipv4.ihl == 5); // header length 20 + h->ipv4.tot_len = BS16(20 + udp_len); + h->ipv4.id = t->ipv4.id; + h->ipv4.frag_off = t->ipv4.frag_off; + h->ipv4.ttl = t->ipv4.ttl; + h->ipv4.protocol = t->ipv4.protocol; + memcpy(&h->ipv4.saddr, &src_v4->sin_addr, sizeof(src_v4->sin_addr)); + memcpy(&h->ipv4.daddr, &dst_v4->sin_addr, sizeof(dst_v4->sin_addr)); + h->ipv4.check = pkt_ipv4_checksum_2(&h->ipv4); + + // Ethernet: struct ethhdr + memcpy(h->eth.h_dest, eth_addrs[1], sizeof(eth_addrs[1])); + memcpy(h->eth.h_source, eth_addrs[0], sizeof(eth_addrs[0])); + h->eth.h_proto = t->eth.h_proto; + uint32_t eth_len = offsetof(struct udpv4, data) + req->answer->size + 4/*CRC*/; + pkt_send(the_socket, h->bytes - umem_mem_start, eth_len); +} + +/** Periodical callback. */ +static void xsk_check(uv_check_t *handle) +{ + /* Trigger sending queued packets.
+ * LATER(opt.): the periodical epoll due to the uv_poll* stuff + * is probably enough to wake the kernel even for sending + * (though AFAIK it might be specific to driver and/or kernel version). */ + if (the_socket->kernel_needs_wakeup) { + bool is_ok = sendto(xsk_socket__fd(the_socket->xsk), NULL, 0, + MSG_DONTWAIT, NULL, 0) != -1; + const bool is_again = !is_ok && (errno == EWOULDBLOCK || errno == EAGAIN); + if (is_ok || is_again) { + the_socket->kernel_needs_wakeup = false; + // EAGAIN is unclear; we'll retry the syscall later, to be sure + } + if (!is_ok && !is_again) { + const uint64_t stamp_now = kr_now(); + static uint64_t stamp_last = 0; + if (stamp_now > stamp_last + 10*1000) { + kr_log_info("WARNING: sendto error (reported at most once per 10s)\n\t%s\n", + strerror(errno)); + stamp_last = stamp_now; + } + } + } + + /* Collect completed packets. */ + struct xsk_ring_cons *cq = &the_socket->umem->cq; + uint32_t idx_cq; + const uint32_t completed = xsk_ring_cons__peek(cq, UINT32_MAX, &idx_cq); + kr_log_verbose("."); + if (!completed) return; + for (int i = 0; i < completed; ++i, ++idx_cq) { + uint8_t *uframe_p = (uint8_t *)the_socket->umem->frames + + *xsk_ring_cons__comp_addr(cq, idx_cq) + - offsetof(struct umem_frame, udpv4); + const struct umem_frame *uframe = (struct umem_frame *)uframe_p; + qr_task_on_send(uframe->task, NULL, 0/*no error feedback*/); + xsk_dealloc_umem_frame(the_socket->umem, uframe_p); + } + xsk_ring_cons__release(cq, completed); + kr_log_verbose("[uxsk] completed %d frames; busy frames: %d\n", (int)completed, + the_socket->umem->frame_count - the_socket->umem->free_count); + //TODO: one uncompleted packet/batch is left until the next I/O :-/ + /* And feed frames into RX fill queue. */ + kxsk_umem_refill(the_config, the_socket->umem); +} + + +static void rx_desc(struct xsk_socket_info *xsi, const struct xdp_desc *desc) +{ + uint8_t *uframe_p = xsi->umem->frames->bytes + desc->addr; + const struct ethhdr *eth = (struct ethhdr *)uframe_p; + const struct iphdr *ipv4 = NULL; + const struct ipv6hdr *ipv6 = NULL; + const struct udphdr *udp; + + + // FIXME: length checks on multiple places + if (eth->h_proto == BS16(ETH_P_IP)) { + ipv4 = (struct iphdr *)(uframe_p + sizeof(struct ethhdr)); + kr_log_verbose("[kxsk] frame len %d, ipv4 len %d\n", + (int)desc->len, (int)BS16(ipv4->tot_len)); + // Any fragmentation stuff is bad for use, except for the DF flag + if (ipv4->version != 4 || (ipv4->frag_off & ~(1 << 14))) { + kr_log_info("[kxsk] weird IPv4 received: " + "version %d, frag_off %d\n", + (int)ipv4->version, (int)ipv4->frag_off); + goto free_frame; + } + if (ipv4->protocol != 0x11) // UDP + goto free_frame; + // FIXME ipv4->check (sensitive to ipv4->ihl), ipv4->tot_len, udp->len + udp = (struct udphdr *)(uframe_p + sizeof(struct ethhdr) + ipv4->ihl * 4); + + } else if (eth->h_proto == BS16(ETH_P_IPV6)) { + (void)ipv6; + goto free_frame; // TODO + + } else { + kr_log_verbose("[kxsk] frame with unknown h_proto %d (ignored)\n", + (int)BS16(eth->h_proto)); + goto free_frame; + } + + assert(eth && (!!ipv4 != !!ipv6) && udp); + uint8_t *udp_data = (uint8_t *)udp + sizeof(struct udphdr); + const uint16_t udp_data_len = BS16(udp->len) - sizeof(struct udphdr); + + // process the packet; ownership is passed on, but beware of holding frames + // LATER: filter the address-port combinations that we listen on? 
+ + union inaddr sa_peer; + if (ipv4) { + sa_peer.ip4.sin_family = AF_INET; + sa_peer.ip4.sin_port = udp->source; + memcpy(&sa_peer.ip4.sin_addr, &ipv4->saddr, sizeof(ipv4->saddr)); + } else { + sa_peer.ip6.sin6_family = AF_INET6; + sa_peer.ip6.sin6_port = udp->source; + memcpy(&sa_peer.ip6.sin6_addr, &ipv6->saddr, sizeof(ipv6->saddr)); + //sa_peer.ip6.sin6_scope_id = the_config->xsk_if_queue; + //sin6_flowinfo: probably completely useless here + } + + knot_pkt_t *kpkt = knot_pkt_new(udp_data, udp_data_len, &the_worker->pkt_pool); + int ret = kpkt == NULL ? kr_error(ENOMEM) : + worker_submit(xsi->session, &sa_peer.ip, (const uint8_t (*)[6])eth, kpkt); + if (ret) + kr_log_verbose("[kxsk] worker_submit() == %d: %s\n", ret, kr_strerror(ret)); + mp_flush(the_worker->pkt_pool.ctx); + + return; + +free_frame: + xsk_dealloc_umem_frame(xsi->umem, uframe_p); +} +// TODO: probably split up into generic part and kresd+UV part. +void kxsk_rx(uv_poll_t* handle, int status, int events) +{ + if (status < 0) { + kr_log_error("[kxsk] poll status %d: %s\n", status, uv_strerror(status)); + return; + } + if (events != UV_READABLE) { + kr_log_error("[kxsk] poll unexpected events: %d\n", events); + return; + } + + struct xsk_socket_info *xsi = handle->data; + assert(xsi == the_socket); // for now + + uint32_t idx_rx; + const size_t rcvd = xsk_ring_cons__peek(&xsi->rx, RX_BATCH_SIZE, &idx_rx); + kr_log_verbose("[kxsk] poll triggered, processing a batch of %d packets\n", + (int)rcvd); + if (!rcvd) + return; + for (int i = 0; i < rcvd; ++i, ++idx_rx) { + rx_desc(xsi, xsk_ring_cons__rx_desc(&xsi->rx, idx_rx)); + } + xsk_ring_cons__release(&xsi->rx, rcvd); +} + + +static struct config the_config_storage = { // static to get zeroed by default + .xsk_if_queue = 0, // defaults overridable by command-line -x eth3:0 + .umem_frame_count = 8192, + .umem = { + .fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .frame_size = FRAME_SIZE, // we need to know this value explicitly + .frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM, + }, + .xsk = { + .tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS, + .rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS, + .libbpf_flags = XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD, + .xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST, + }, + .pkt_template = { + .eth = { + //.h_dest = "\xd8\x58\xd7\x00\x74\x34", + //.h_source = "\x70\x85\xc2\x3a\xc7\x84", + // mirkwood -> knot-bench-player: + .h_dest = "\xa0\x36\x9f\x50\x2a\x9c", + .h_source = "\x3c\xfd\xfe\x2b\xcf\x02", + // doriath -> eriador + //.h_dest = "\x00\x15\x17\xf8\xd0\x4a", + //.h_source = "\xf0\x1f\xaf\xe2\x80\x0d", + //.h_source = "\x00\x1e\x67\xe3\xb1\x24", // rohan + .h_proto = BS16(ETH_P_IP), + }, + .ipv4 = { + .version = 4, + .ihl = 5, + .tos = 0, // default: best-effort DSCP + no ECN support + .tot_len = BS16(0), // to be overwritten + .id = BS16(0), // probably anything; details: RFC 6864 + .frag_off = BS16(0), // TODO: add the DF flag, probably (1 << 14) + .ttl = IPDEFTTL, + .protocol = 0x11, // UDP + .check = 0, // to be overwritten + }, + .udp = { + .source = BS16(5353), + .dest = BS16(5353), + .len = BS16(0), // to be overwritten + .check = BS16(0), // checksum is optional + }, + }, +}; + +int kr_xsk_init_global(uv_loop_t *loop, char *cmdarg) +{ + kxsk_alloc_hack = kr_xsk_alloc_wire; + if (!cmdarg) + return 0; + + /* Hard-coded configuration */ + const char + //sip_str[] = "192.168.8.71", + //dip_str[] = "192.168.8.1"; + sip_str[] = "192.168.100.8", + dip_str[] = "192.168.100.3"; + //sip_str[] = 
"217.31.193.167", + //dip_str[] = "217.31.193.166"; + the_config = &the_config_storage; + if (inet_pton(AF_INET, sip_str, &the_config->pkt_template.ipv4.saddr) != 1 + || inet_pton(AF_INET, dip_str, &the_config->pkt_template.ipv4.daddr) != 1) { + fprintf(stderr, "ERROR: failed to convert IPv4 address\n"); + exit(EXIT_FAILURE); + } + + char *colon = strchr(cmdarg, ':'); + if (colon) { + *colon = '\0'; // yes, modifying argv[i][j] isn't very nice + the_config->xsk_if_queue = atoi(colon + 1); + } + struct kxsk_iface *iface = kxsk_iface_new(cmdarg, + "./bpf-kernel.o" // FIXME: proper installation, etc. + ); + if (!iface) { + fprintf(stderr, "ERROR: Can't set up network interface %s: %s\n", + cmdarg, strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Some failed test + void *data = malloc(2048); + struct udpv4 *pkt = data; + pkt_fill_headers(pkt, &the_config->pkt_template, 0); + // */ + + /* This one is OK! + test_pkt_ipv4_checksum(); + return 0; + // */ + + /* Initialize shared packet_buffer for umem usage */ + struct xsk_umem_info *umem = + configure_xsk_umem(&the_config->umem, the_config->umem_frame_count); + if (umem == NULL) { + fprintf(stderr, "ERROR: Can't create umem \"%s\"\n", + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Open and configure the AF_XDP (xsk) socket */ + assert(!the_socket); + + the_socket = xsk_configure_socket(the_config, umem, iface); + if (!the_socket) { + fprintf(stderr, "ERROR, can't setup AF_XDP socket on %s:%d: %s\n", + iface->ifname, the_config->xsk_if_queue, strerror(errno)); + exit(EXIT_FAILURE); + } + + int ret = kxsk_socket_start(iface, the_config->xsk_if_queue, the_socket->xsk); + if (ret) { + fprintf(stderr, "ERROR, can't start listening on AF_XDP socket on %s:%d: %s\n", + iface->ifname, the_config->xsk_if_queue, strerror(ret)); + exit(EXIT_FAILURE); + } + + kr_log_verbose("[uxsk] busy frames: %d\n", + the_socket->umem->frame_count - the_socket->umem->free_count); + + + ret = uv_check_init(loop, &the_socket->check_handle); + if (!ret) ret = uv_check_start(&the_socket->check_handle, xsk_check); + + if (!ret) ret = uv_poll_init(loop, &the_socket->poll_handle, + xsk_socket__fd(the_socket->xsk)); + if (!ret) { + // beware: this sets poll_handle->data + struct session *s = the_socket->session = + session_new((uv_handle_t *)&the_socket->poll_handle, false); + assert(!session_flags(s)->outgoing); + + // TMP: because worker will pass this back as source address to us + struct sockaddr_in *ssa = (struct sockaddr_in *)session_get_sockname(s); + ssa->sin_family = AF_INET; + memcpy(&ssa->sin_addr, &the_config->pkt_template.ipv4.saddr, + sizeof(ssa->sin_addr)); + ssa->sin_port = the_config->pkt_template.udp.source; + + ret = s ? 
0 : kr_error(ENOMEM); + } + if (!ret) { + the_socket->poll_handle.data = the_socket; + ret = uv_poll_start(&the_socket->poll_handle, UV_READABLE, kxsk_rx); + } + return ret; +} + +#define SOL_XDP 283 +static void print_stats(struct xsk_socket *xsk) +{ + struct xdp_statistics stats; + socklen_t optlen = sizeof(stats); + if (getsockopt(xsk_socket__fd(xsk), SOL_XDP, XDP_STATISTICS, &stats, &optlen)) { + fprintf(stderr, "getsockopt: %s\n", strerror(errno)); + } else { + fprintf(stderr, "stats: RX drop %d, RX ID %d, TX ID %d\n", + (int)stats.rx_dropped, (int)stats.rx_invalid_descs, + (int)stats.tx_invalid_descs); + } +} + diff --git a/src/libknot/xdp/af_xdp.h b/src/libknot/xdp/af_xdp.h new file mode 100644 index 0000000000000000000000000000000000000000..94c9e60f6d18cff0c1623683551f7ec8de63cdd5 --- /dev/null +++ b/src/libknot/xdp/af_xdp.h @@ -0,0 +1,18 @@ + +#pragma once + +#include <stdint.h> +#include <uv.h> + +int kr_xsk_init_global(uv_loop_t *loop, char *cmdarg); +void kr_xsk_deinit_global(void); + +//void *kr_xsk_alloc_wire(uint16_t *maxlen); + +struct sockaddr; +struct kr_request; +struct qr_task; +/** Send req->answer via UDP, possibly not immediately. */ +void kr_xsk_push(const struct sockaddr *src, const struct sockaddr *dest, + struct kr_request *req, struct qr_task *task, uint8_t eth_addrs[2][6]); + diff --git a/src/libknot/xdp/bpf-kernel.c b/src/libknot/xdp/bpf-kernel.c new file mode 100644 index 0000000000000000000000000000000000000000..546a15f8d98d4a04e8a005597a3907dbc1aa9b53 --- /dev/null +++ b/src/libknot/xdp/bpf-kernel.c @@ -0,0 +1,65 @@ +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/ipv6.h> +#include <linux/udp.h> +#include "bpf_helpers.h" +#include "bpf_endian.h" +#include "parsing_helpers.h" + +/** Assume netdev has no more than 64 queues + * LATER: it might be better to detect this on startup time (per-device). */ +#define QUEUE_MAX 64 + +/** A set entry here means that the corresponding queue_id + * has an active AF_XDP socket bound to it. 
*/ +struct bpf_map_def SEC("maps") qidconf_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = QUEUE_MAX, +}; +struct bpf_map_def SEC("maps") xsks_map = { + .type = BPF_MAP_TYPE_XSKMAP, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = QUEUE_MAX, +}; + +SEC("xdp_redirect_udp") +int xdp_redirect_udp_func(struct xdp_md *ctx) +{ + struct ethhdr *eth; + struct iphdr *iphdr; + //struct ipv6hdr *ipv6hdr; + //struct udphdr *udphdr; + + void *data_end = (void *)(long)ctx->data_end; + struct hdr_cursor nh = { .pos = (void *)(long)ctx->data }; + + int ip_type; + switch (parse_ethhdr(&nh, data_end, &eth)) { + case ETH_P_IP: + ip_type = parse_iphdr(&nh, data_end, &iphdr); + break; + /* + case ETH_P_IPV6: + ip_type = parse_ip6hdr(&nh, data_end, &ipv6hdr); + break; + */ + default: + return XDP_PASS; + } + + if (ip_type != IPPROTO_UDP) + return XDP_PASS; + + int index = ctx->rx_queue_index; + int *qidconf = bpf_map_lookup_elem(&qidconf_map, &index); + if (!qidconf) + return XDP_ABORTED; + if (*qidconf) + return bpf_redirect_map(&xsks_map, index, 0); + return XDP_PASS; +} + diff --git a/src/libknot/xdp/bpf-user.c b/src/libknot/xdp/bpf-user.c new file mode 100644 index 0000000000000000000000000000000000000000..ce92dbca220b898304d98c8e18192637bbf49af9 --- /dev/null +++ b/src/libknot/xdp/bpf-user.c @@ -0,0 +1,185 @@ + +#include "daemon/kxsk/impl.h" + +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <bpf/bpf.h> +#include <net/if.h> + + +static int ensure_udp_prog(const struct kxsk_iface *iface, const char *prog_fname) +{ + int ret; + + uint32_t prog_id; + ret = bpf_get_link_xdp_id(iface->ifindex, &prog_id, 0); + if (ret) + return -abs(ret); + if (prog_id) + return bpf_prog_get_fd_by_id(prog_id); + + /* Use libbpf for extracting BPF byte-code from BPF-ELF object, and + * loading this into the kernel via bpf-syscall */ + int prog_fd; + struct bpf_object *obj; // TODO: leak or what? + ret = bpf_prog_load(prog_fname, BPF_PROG_TYPE_XDP, &obj, &prog_fd); + if (ret) { + fprintf(stderr, "[kxsk] failed loading BPF program (%s) (%d): %s\n", + prog_fname, ret, strerror(-ret)); + return -abs(ret); + } + + ret = bpf_set_link_xdp_fd(iface->ifindex, prog_fd, 0); + if (ret) { + fprintf(stderr, "bpf_set_link_xdp_fd() == %d\n", ret); + return -abs(ret); + } else { + fprintf(stderr, "[kxsk] loaded BPF program\n"); + } + + return prog_fd; +} + +/** Get FDs for the two maps and assign them into xsk_info-> fields. + * + * It's an almost precise copy of xsk_lookup_bpf_maps() from libbpf + * (version before they eliminated qidconf_map) + * Copyright by Intel, LGPL-2.1 or BSD-2-Clause.
*/ +static int get_bpf_maps(int prog_fd, struct kxsk_iface *iface) +{ + __u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info); + __u32 map_len = sizeof(struct bpf_map_info); + struct bpf_prog_info prog_info = {}; + struct bpf_map_info map_info; + int fd, err; + + err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_len); + if (err) + return err; + + num_maps = prog_info.nr_map_ids; + + map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids)); + if (!map_ids) + return -ENOMEM; + + memset(&prog_info, 0, prog_len); + prog_info.nr_map_ids = num_maps; + prog_info.map_ids = (__u64)(unsigned long)map_ids; + + err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_len); + if (err) + goto out_map_ids; + + for (i = 0; i < prog_info.nr_map_ids; ++i) { + if (iface->qidconf_map_fd >= 0 && iface->xsks_map_fd >= 0) + break; + + fd = bpf_map_get_fd_by_id(map_ids[i]); + if (fd < 0) + continue; + + err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len); + if (err) { + close(fd); + continue; + } + + if (!strcmp(map_info.name, "qidconf_map")) { + iface->qidconf_map_fd = fd; + continue; + } + + if (!strcmp(map_info.name, "xsks_map")) { + iface->xsks_map_fd = fd; + continue; + } + + close(fd); + } + + if (iface->qidconf_map_fd < 0 || iface->xsks_map_fd < 0) { + err = -ENOENT; + close(iface->qidconf_map_fd); + close(iface->xsks_map_fd); + iface->qidconf_map_fd = iface->xsks_map_fd = -1; + goto out_map_ids; + } + + err = 0; // success! + +out_map_ids: + free(map_ids); + return err; +} +static void unget_bpf_maps(struct kxsk_iface *iface) +{ + close(iface->qidconf_map_fd); + close(iface->xsks_map_fd); + iface->qidconf_map_fd = iface->xsks_map_fd = -1; +} + +int kxsk_socket_start(const struct kxsk_iface *iface, int queue_id, struct xsk_socket *xsk) +{ + int fd = xsk_socket__fd(xsk); + int err = bpf_map_update_elem(iface->xsks_map_fd, &queue_id, &fd, 0); + if (err) + return err; + + int qid = true; + err = bpf_map_update_elem(iface->qidconf_map_fd, &queue_id, &qid, 0); + if (err) + bpf_map_delete_elem(iface->xsks_map_fd, &queue_id); + return err; +} +int kxsk_socket_stop(const struct kxsk_iface *iface, int queue_id) +{ + int qid = false; + int err = bpf_map_update_elem(iface->qidconf_map_fd, &queue_id, &qid, 0); + // Clearing the second map doesn't seem important, but why not. 
+ bpf_map_delete_elem(iface->xsks_map_fd, &queue_id); + return err; +} + +struct kxsk_iface * kxsk_iface_new(const char *ifname, const char *prog_fname) +{ + struct kxsk_iface *iface = malloc(sizeof(*iface)); + if (!iface) { + errno = ENOMEM; + return NULL; + } + iface->ifname = ifname; // we strdup it later + iface->ifindex = if_nametoindex(ifname); + if (!iface->ifindex) { + free(iface); + return NULL; + } + iface->qidconf_map_fd = iface->xsks_map_fd = -1; + + int ret = ensure_udp_prog(iface, prog_fname); + if (ret >= 0) + ret = get_bpf_maps(ret, iface); + + if (ret < 0) { + errno = abs(ret); + free(iface); + return NULL; + } // else + + iface->ifname = strdup(iface->ifname); + return iface; +} +int kxsk_iface_free(struct kxsk_iface *iface, bool unload_bpf) +{ + unget_bpf_maps(iface); + if (unload_bpf) { + int ret = bpf_set_link_xdp_fd(iface->ifindex, -1, 0); + if (ret) return ret; + } + free((char *)/*const-cast*/iface->ifname); + free(iface); + return 0; +} + diff --git a/src/libknot/xdp/bpf-user.h b/src/libknot/xdp/bpf-user.h new file mode 100644 index 0000000000000000000000000000000000000000..90a8c4ec30d3340df664370206a8aff24397e791 --- /dev/null +++ b/src/libknot/xdp/bpf-user.h @@ -0,0 +1,102 @@ + +#pragma once + +#include <bpf/xsk.h> + +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/udp.h> + +#include <uv.h> // LATER: split kresd-specific stuff + +struct udpv4 { + union { uint8_t bytes[1]; struct { + + struct ethhdr eth; // no VLAN support; CRC at the "end" of .data! + struct iphdr ipv4; + struct udphdr udp; + uint8_t data[]; + + } __attribute__((packed)); }; +}; + + +/** Data around one network interface. */ +struct kxsk_iface { + const char *ifname; + int ifindex; /**< computed from ifname */ + + /* File-descriptors to BPF maps for the program running on the interface. */ + int qidconf_map_fd; + int xsks_map_fd; +}; + + +struct config { + int xsk_if_queue; + + struct xsk_umem_config umem; /**< For xsk_umem__create() from libbpf. */ + uint32_t umem_frame_count; + + struct xsk_socket_config xsk; /**< For xsk_socket__create() from libbpf. */ + + struct udpv4 pkt_template; +}; + +struct xsk_umem_info { + /** Fill queue: passing memory frames to kernel - ready to receive. */ + struct xsk_ring_prod fq; + /** Completion queue: passing memory frames from kernel - after send finishes. */ + struct xsk_ring_cons cq; + /** Handle internal to libbpf. */ + struct xsk_umem *umem; + + struct umem_frame *frames; /**< The memory frames. TODO: (uint8_t *frammem) might be more practical. */ + uint32_t frame_count; + uint32_t free_count; /**< The number of free frames. */ + uint32_t *free_indices; /**< Stack of indices of the free frames. */ +}; +struct xsk_socket_info { + /** Receive queue: passing arrived packets from kernel. */ + struct xsk_ring_cons rx; + /** Transmit queue: passing packets to kernel for sending. */ + struct xsk_ring_prod tx; + /** Information about memory frames for all the passed packets. */ + struct xsk_umem_info *umem; + /** Handle internal to libbpf. */ + struct xsk_socket *xsk; + + bool kernel_needs_wakeup; + + const struct kxsk_iface *iface; + + /* kresd-specific stuff */ + uv_check_t check_handle; + uv_poll_t poll_handle; + struct session *session; /**< mock session, to minimize kresd changes for now */ +}; + + +/* eBPF stuff (user-space part), implemented in ./bpf-user.c */ + +/** Ensure the BPF program and maps are set up. On failure return NULL + errno. 
+ * + * Note: if one is loaded on the interface already, we assume it's ours. + * LATER: it might be possible to check, e.g. by naming our maps unusually. + */ +struct kxsk_iface * kxsk_iface_new(const char *ifname, const char *prog_fname); + +/** Undo kxsk_iface_new(). It's always freed, even if some problems happen. + * + * Unloading the BPF program is optional, as keeping it only adds some overhead, + * and in case of multi-process it isn't easy to find that we're the last instance. + */ +int kxsk_iface_free(struct kxsk_iface *iface, bool unload_bpf); + +/** Activate this AF_XDP socket through the BPF maps. */ +int kxsk_socket_start(const struct kxsk_iface *iface, int queue_id, struct xsk_socket *xsk); + +/** Deactivate this AF_XDP socket through the BPF maps. */ +int kxsk_socket_stop(const struct kxsk_iface *iface, int queue_id); +
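
The bpf-user.h comments above describe the intended lifecycle: kxsk_iface_new() loads (or reuses) the XDP program on an interface and looks up its two maps, kxsk_socket_start()/kxsk_socket_stop() toggle redirection for one RX queue, and kxsk_iface_free() releases the map FDs. A minimal caller could look like the sketch below; it is illustrative only (not part of the patch) and assumes the header is reachable as "libknot/xdp/bpf-user.h", that an AF_XDP socket (struct xsk_socket *) was already created with libbpf's xsk_socket__create() as af_xdp.c does, and that "eth0" and run_queue0() are hypothetical names.

#include <errno.h>
#include "libknot/xdp/bpf-user.h"

static int run_queue0(struct xsk_socket *xsk)
{
	/* Load or reuse the XDP program on the interface and find its two maps. */
	struct kxsk_iface *iface = kxsk_iface_new("eth0", "./bpf-kernel.o");
	if (!iface)
		return -errno;

	/* Publish the socket in xsks_map and mark queue 0 active in qidconf_map. */
	int ret = kxsk_socket_start(iface, 0, xsk);
	if (ret == 0) {
		/* ... RX/TX processing on the socket happens here (see af_xdp.c) ... */
		kxsk_socket_stop(iface, 0);
	}

	/* Drop our map FDs; keep the BPF program loaded (unload_bpf = false). */
	kxsk_iface_free(iface, false);
	return ret;
}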