tcp-handler.c 10.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*  Copyright (C) 2011 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

17
#include <unistd.h>
Marek Vavrusa's avatar
Marek Vavrusa committed
18
#include <fcntl.h>
19
#include <errno.h>
20 21 22
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
23 24
#include <netinet/tcp.h>
#include <netinet/in.h>
25
#include <stdio.h>
26
#include <stdlib.h>
Daniel Salzman's avatar
Daniel Salzman committed
27
#include <urcu.h>
28 29 30
#ifdef HAVE_SYS_UIO_H			// struct iovec (OpenBSD)
#include <sys/uio.h>
#endif // HAVE_SYS_UIO_H
31 32 33
#ifdef HAVE_CAP_NG_H
#include <cap-ng.h>
#endif /* HAVE_CAP_NG_H */
34

35
#include "dnssec/random.h"
Daniel Salzman's avatar
Daniel Salzman committed
36
#include "knot/server/tcp-handler.h"
37
#include "knot/common/fdset.h"
38
#include "knot/common/log.h"
Daniel Salzman's avatar
Daniel Salzman committed
39
#include "knot/nameserver/process_query.h"
40
#include "libknot/processing/overlay.h"
41
#include "contrib/macros.h"
Daniel Salzman's avatar
Daniel Salzman committed
42
#include "contrib/mempattern.h"
43 44
#include "contrib/net.h"
#include "contrib/sockaddr.h"
45
#include "contrib/time.h"
46
#include "contrib/ucw/mempool.h"
Marek Vavrusa's avatar
Marek Vavrusa committed
47

48
/*! \brief TCP context data. */
49
typedef struct tcp_context {
50
	struct knot_overlay overlay;/*!< Query processing overlay. */
51 52 53 54
	server_t *server;           /*!< Name server structure. */
	struct iovec iov[2];        /*!< TX/RX buffers. */
	unsigned client_threshold;  /*!< Index of first TCP client. */
	timev_t last_poll_time;     /*!< Time of the last socket poll. */
55
	timev_t throttle_end;       /*!< End of accept() throttling. */
56
	fdset_t set;                /*!< Set of server/client sockets. */
Daniel Salzman's avatar
Daniel Salzman committed
57
	unsigned thread_id;         /*!< Thread identifier. */
58
} tcp_context_t;
59

60 61 62
/*
 * TCP accept() throttling parameters.
 */
63 64
#define TCP_THROTTLE_LO 0 /*!< Base throttle time in seconds, always applied. */
#define TCP_THROTTLE_HI 2 /*!< Exclusive upper bound of the random extra throttle time in seconds. */
65 66 67

/*! \brief Calculate TCP throttle time (random). */
static inline int tcp_throttle() {
68
	return TCP_THROTTLE_LO + (dnssec_random_uint16_t() % TCP_THROTTLE_HI);
69
}
70

71
/*! \brief Sweep TCP connection. */
72
static enum fdset_sweep_state tcp_sweep(fdset_t *set, int i, void *data)
73
{
74
	UNUSED(data);
75 76
	assert(set && i < set->n && i >= 0);
	int fd = set->pfd[i].fd;
Jan Včelák's avatar
Jan Včelák committed
77

78
	/* Best-effort, name and shame. */
79 80
	struct sockaddr_storage ss;
	socklen_t len = sizeof(struct sockaddr_storage);
81 82 83
	if (getpeername(fd, (struct sockaddr*)&ss, &len) == 0) {
		char addr_str[SOCKADDR_STRLEN] = {0};
		sockaddr_tostr(addr_str, sizeof(addr_str), &ss);
Jan Včelák's avatar
Jan Včelák committed
84
		log_notice("TCP, terminated inactive client, address '%s'", addr_str);
85
	}
86

87
	close(fd);
88

89
	return FDSET_SWEEP;
90 91
}

92 93 94
/*!
 * \brief TCP event handler function.
 */
95
static int tcp_handle(tcp_context_t *tcp, int fd,
96
                      struct iovec *rx, struct iovec *tx)
97
{
98
	/* Create query processing parameter. */
99 100
	struct sockaddr_storage ss;
	memset(&ss, 0, sizeof(struct sockaddr_storage));
101
	struct process_query_param param = {0};
102 103
	param.socket = fd;
	param.remote = &ss;
104
	param.server = tcp->server;
105
	param.thread_id = tcp->thread_id;
106 107 108
	rx->iov_len = KNOT_WIRE_MAX_PKTSIZE;
	tx->iov_len = KNOT_WIRE_MAX_PKTSIZE;

109 110 111 112 113 114 115
	/* Receive peer name. */
	socklen_t addrlen = sizeof(struct sockaddr_storage);
	if (getpeername(fd, (struct sockaddr *)&ss, &addrlen) < 0) {
		;
	}

	/* Timeout. */
116
	rcu_read_lock();
117 118
	conf_val_t *val = &conf()->cache.srv_tcp_reply_timeout;
	struct timeval tmout = { conf_int(val), 0 };
119
	rcu_read_unlock();
120

121
	/* Receive data. */
Jan Včelák's avatar
Jan Včelák committed
122
	struct timeval recv_tmout = tmout;
123
	int ret = net_dns_tcp_recv(fd, rx->iov_base, rx->iov_len, &recv_tmout);
124 125
	if (ret <= 0) {
		if (ret == KNOT_EAGAIN) {
126
			char addr_str[SOCKADDR_STRLEN] = {0};
127
			sockaddr_tostr(addr_str, sizeof(addr_str), &ss);
Jan Včelák's avatar
Jan Včelák committed
128 129
			log_warning("TCP, connection timed out, address '%s'",
			            addr_str);
130
		}
Marek Vavrusa's avatar
Marek Vavrusa committed
131
		return KNOT_ECONNREFUSED;
132 133
	} else {
		rx->iov_len = ret;
134
	}
135

136
	knot_mm_t *mm = tcp->overlay.mm;
137

138
	/* Initialize processing overlay. */
139 140 141 142 143 144 145 146 147 148 149 150
	ret = knot_overlay_init(&tcp->overlay, mm);
	if (ret != KNOT_EOK) {
		return ret;
	}
	ret = knot_overlay_add(&tcp->overlay, NS_PROC_QUERY, &param);
	if (ret != KNOT_EOK) {
		return ret;
	}

	/* Create packets. */
	knot_pkt_t *ans = knot_pkt_new(tx->iov_base, tx->iov_len, mm);
	knot_pkt_t *query = knot_pkt_new(rx->iov_base, rx->iov_len, mm);
151

152
	/* Input packet. */
153
	(void) knot_pkt_parse(query, 0);
154
	int state = knot_overlay_consume(&tcp->overlay, query);
155

156
	/* Resolve until NOOP or finished. */
157
	ret = KNOT_EOK;
158 159
	while (state & (KNOT_STATE_PRODUCE|KNOT_STATE_FAIL)) {
		state = knot_overlay_produce(&tcp->overlay, ans);
160

161
		/* Send, if response generation passed and wasn't ignored. */
162
		if (ans->size > 0 && !(state & (KNOT_STATE_FAIL|KNOT_STATE_NOOP))) {
Jan Včelák's avatar
Jan Včelák committed
163
			struct timeval send_tmout = tmout;
164
			if (net_dns_tcp_send(fd, ans->wire, ans->size, &send_tmout) != ans->size) {
165
				ret = KNOT_ECONNREFUSED;
166
				break;
167
			}
Marek Vavrusa's avatar
Marek Vavrusa committed
168 169 170
		}
	}

Marek Vavrusa's avatar
Marek Vavrusa committed
171
	/* Reset after processing. */
172 173
	knot_overlay_finish(&tcp->overlay);
	knot_overlay_deinit(&tcp->overlay);
174 175 176 177

	/* Cleanup. */
	knot_pkt_free(&query);
	knot_pkt_free(&ans);
Marek Vavrusa's avatar
Marek Vavrusa committed
178

179
	return ret;
180 181
}

182
int tcp_accept(int fd)
183
{
184
	/* Accept incoming connection. */
185
	int incoming = net_accept(fd, NULL);
186

187
	/* Evaluate connection. */
188
	if (incoming >= 0) {
189 190
#ifdef SO_RCVTIMEO
		struct timeval tv;
191
		rcu_read_lock();
192 193
		conf_val_t *val = &conf()->cache.srv_tcp_idle_timeout;
		tv.tv_sec = conf_int(val);
194
		rcu_read_unlock();
195
		tv.tv_usec = 0;
196
		if (setsockopt(incoming, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) {
Jan Včelák's avatar
Jan Včelák committed
197 198
			log_warning("TCP, failed to set up watchdog timer"
			            ", fd %d", incoming);
199
		}
200
#endif
Marek Vavrusa's avatar
Marek Vavrusa committed
201
	}
202

203
	return incoming;
204 205
}

206
/*!
 * \brief Accept a client on master socket 'i' and start tracking it.
 *
 * The new socket is added to the descriptor set for POLLIN and its watchdog
 * is armed with the configured handshake timeout.
 *
 * \param tcp  TCP context.
 * \param i    Index of the master socket within tcp->set.
 *
 * \return KNOT_EOK on success, the negative result of tcp_accept() or
 *         fdset_add() otherwise.
 */
static int tcp_event_accept(tcp_context_t *tcp, unsigned i)
{
	int listener = tcp->set.pfd[i].fd;
	int client = tcp_accept(listener);
	if (client < 0) {
		return client;
	}

	/* Track the connection; on failure the socket must not be leaked. */
	int slot = fdset_add(&tcp->set, client, POLLIN, NULL);
	if (slot < 0) {
		close(client);
		return slot; /* Contains errno. */
	}

	/* Arm the watchdog with the handshake timeout. */
	rcu_read_lock();
	conf_val_t *hshake = &conf()->cache.srv_tcp_hshake_timeout;
	fdset_set_watchdog(&tcp->set, slot, conf_int(hshake));
	rcu_read_unlock();

	return KNOT_EOK;
}
Jan Včelák's avatar
Jan Včelák committed
230

231 232 233
/*!
 * \brief Serve one query on client socket 'i'.
 *
 * The per-query memory pool is flushed after every attempt. When the query
 * was handled successfully, the watchdog of the socket is re-armed with the
 * configured idle timeout.
 *
 * \param tcp  TCP context.
 * \param i    Index of the client socket within tcp->set.
 *
 * \return Result of tcp_handle().
 */
static int tcp_event_serve(tcp_context_t *tcp, unsigned i)
{
	struct iovec *rx = &tcp->iov[0];
	struct iovec *tx = &tcp->iov[1];
	int result = tcp_handle(tcp, tcp->set.pfd[i].fd, rx, tx);

	/* Release everything allocated while handling this query. */
	mp_flush(tcp->overlay.mm->ctx);

	if (result != KNOT_EOK) {
		return result;
	}

	/* The client was active, restart its idle timer. */
	rcu_read_lock();
	conf_val_t *idle = &conf()->cache.srv_tcp_idle_timeout;
	fdset_set_watchdog(&tcp->set, i, conf_int(idle));
	rcu_read_unlock();

	return result;
}
Jan Včelák's avatar
Jan Včelák committed
249

250 251 252 253 254
/*!
 * \brief Poll all sockets of the context once and dispatch pending events.
 *
 * Master sockets (index below client_threshold) accept new clients unless
 * accepting is throttled; client sockets are served and removed from the
 * set when serving fails or the socket reports an error/hang-up.
 *
 * \param tcp  TCP context.
 *
 * \return Number of poll() events left unprocessed (or the non-positive
 *         poll() result when there was nothing to process).
 */
static int tcp_wait_for_events(tcp_context_t *tcp)
{
	fdset_t *set = &tcp->set;

	/* Block for at most one sweep interval. */
	int nfds = poll(set->pfd, set->n, TCP_SWEEP_INTERVAL * 1000);

	/* Remember when the poll returned. */
	time_now(&tcp->last_poll_time);

	/* Throttled either by a running back-off period or by the client limit. */
	bool is_throttled = true;
	if (tcp->last_poll_time.tv_sec >= tcp->throttle_end.tv_sec) {
		/* Configuration limit, infer maximal pool size. */
		rcu_read_lock();
		conf_val_t *val = &conf()->cache.srv_max_tcp_clients;
		unsigned max_per_set = MAX(conf_int(val) / conf_tcp_threads(conf()), 1);
		rcu_read_unlock();
		/* Master sockets do not count towards the client limit. */
		is_throttled = (set->n - tcp->client_threshold) >= max_per_set;
	}

	/* Walk the set; 'i' only advances when the current entry is kept. */
	for (unsigned i = 0; nfds > 0 && i < set->n; ) {
		const int fd = set->pfd[i].fd;
		const short revents = set->pfd[i].revents;
		const bool is_client = (i >= tcp->client_threshold);
		bool drop = false;

		if (revents & (POLLERR|POLLHUP|POLLNVAL)) {
			/* Only client sockets are dropped on error. */
			drop = is_client;
			--nfds;
		} else if (revents & (POLLIN)) {
			if (is_client) {
				drop = (tcp_event_serve(tcp, i) != KNOT_EOK);
			} else if (!is_throttled && tcp_event_accept(tcp, i) == KNOT_EBUSY) {
				/* Back off from accepting for a random while. */
				time_now(&tcp->throttle_end);
				tcp->throttle_end.tv_sec += tcp_throttle();
			}
			--nfds;
		}

		if (drop) {
			fdset_remove(set, i);
			close(fd);
		} else {
			++i;
		}
	}

	return nfds;
}
304

305
/*!
 * \brief TCP worker thread routine.
 *
 * Sets up the per-thread context (memory pool, packet buffers, descriptor
 * set) and then loops: reload interfaces when requested, poll and serve
 * sockets, and periodically sweep inactive clients. The loop ends when the
 * thread is cancelled or a reload leaves no interfaces.
 *
 * \param thread  Thread descriptor; thread->data must be the I/O handler.
 *
 * \retval KNOT_EOK     the loop terminated normally.
 * \retval KNOT_EINVAL  'thread' or its data is NULL.
 * \retval KNOT_ENOMEM  a packet buffer could not be allocated.
 */
int tcp_master(dthread_t *thread)
{
	if (thread == NULL || thread->data == NULL) {
		return KNOT_EINVAL;
	}

	iohandler_t *handler = (iohandler_t *)thread->data;
	unsigned *iostate = &handler->thread_state[dt_get_id(thread)];

	int ret = KNOT_EOK;
	ref_t *ref = NULL;
	tcp_context_t tcp;
	memset(&tcp, 0, sizeof(tcp_context_t));

	/* Memory pool backing per-query allocations. */
	knot_mm_t mm;
	mm_ctx_mempool(&mm, 16 * MM_DEFAULT_BLKSIZE);

	/* Fill in the answering context. */
	tcp.server = handler->server;
	tcp.thread_id = handler->thread_id[dt_get_id(thread)];
	tcp.overlay.mm = &mm;

	/* Size the descriptor set for the bound sockets. */
	conf_val_t val = conf_get(conf(), C_SRV, C_LISTEN);
	fdset_init(&tcp.set, conf_val_count(&val) + CONF_XFERS);

	/* Allocate the RX and TX packet buffers. */
	for (struct iovec *buf = tcp.iov; buf < tcp.iov + 2; ++buf) {
		buf->iov_len = KNOT_WIRE_MAX_PKTSIZE;
		buf->iov_base = malloc(buf->iov_len);
		if (buf->iov_base == NULL) {
			ret = KNOT_ENOMEM;
			goto finish;
		}
	}

	/* Schedule the first sweep one interval from now. */
	timev_t next_sweep = {0};
	time_now(&next_sweep);
	next_sweep.tv_sec += TCP_SWEEP_INTERVAL;

	for (;;) {
		/* Handle a pending reload request. */
		if (unlikely(*iostate & ServerReload)) {
			*iostate &= ~ServerReload;

			/* Cancel client connections. */
			for (unsigned c = tcp.client_threshold; c < tcp.set.n; ++c) {
				close(tcp.set.pfd[c].fd);
			}

			/* Swap in the current interfaces. */
			ref_release(ref);
			ref = server_set_ifaces(handler->server, &tcp.set, IO_TCP, tcp.thread_id);
			if (tcp.set.n == 0) {
				break; /* Terminate on zero interfaces. */
			}

			/* Everything in the set now is a master socket. */
			tcp.client_threshold = tcp.set.n;
		}

		/* Check for cancellation. */
		if (dt_is_cancelled(thread)) {
			break;
		}

		/* Serve client requests. */
		tcp_wait_for_events(&tcp);

		/* Sweep inactive clients once the interval has elapsed. */
		if (tcp.last_poll_time.tv_sec >= next_sweep.tv_sec) {
			fdset_sweep(&tcp.set, &tcp_sweep, NULL);
			time_now(&next_sweep);
			next_sweep.tv_sec += TCP_SWEEP_INTERVAL;
		}
	}

finish:
	free(tcp.iov[0].iov_base);
	free(tcp.iov[1].iov_base);
	mp_delete(mm.ctx);
	fdset_clear(&tcp.set);
	ref_release(ref);

	return ret;
}