From 5b6ed75906e989ef513f89030dd3f4f3ab5d9911 Mon Sep 17 00:00:00 2001
From: Libor Peltan <libor.peltan@nic.cz>
Date: Tue, 15 Mar 2022 19:14:36 +0100
Subject: [PATCH] notify: implemented slave-selective retry

---
 src/knot/ctl/commands.c                |  3 +-
 src/knot/events/handlers/dnssec.c      |  6 ++-
 src/knot/events/handlers/load.c        |  4 +-
 src/knot/events/handlers/notify.c      | 42 ++++++++++++---
 src/knot/events/handlers/refresh.c     |  2 +-
 src/knot/events/handlers/update.c      |  2 +-
 src/knot/events/replan.c               |  4 ++
 src/knot/zone/zone.c                   | 14 +++++
 src/knot/zone/zone.h                   | 16 ++++++
 src/knot/zone/zonedb-load.c            |  2 +
 src/libknot/dynarray.h                 | 10 +++-
 tests-extra/tests/notify/retry/test.py | 73 ++++++++++++++++++++++++++
 12 files changed, 163 insertions(+), 15 deletions(-)
 create mode 100644 tests-extra/tests/notify/retry/test.py

diff --git a/src/knot/ctl/commands.c b/src/knot/ctl/commands.c
index 9bdc2192c0..aaca1e9b48 100644
--- a/src/knot/ctl/commands.c
+++ b/src/knot/ctl/commands.c
@@ -447,6 +447,7 @@ static int zone_retransfer(zone_t *zone, _unused_ ctl_args_t *args)
 
 static int zone_notify(zone_t *zone, _unused_ ctl_args_t *args)
 {
+	zone_notifailed_clear(zone); // FIXME need mutex for zone->notifailed
 	return schedule_trigger(zone, args, ZONE_EVENT_NOTIFY, true);
 }
 
@@ -796,7 +797,7 @@ static int zone_txn_commit(zone_t *zone, _unused_ ctl_args_t *args)
 	free(zone->control_update);
 	zone->control_update = NULL;
 
-	zone_events_schedule_now(zone, ZONE_EVENT_NOTIFY);
+	zone_schedule_notify(zone, 0); // FIXME need mutex ?
 
 	return KNOT_EOK;
 }
diff --git a/src/knot/events/handlers/dnssec.c b/src/knot/events/handlers/dnssec.c
index 2f2e1f906a..8263b0d168 100644
--- a/src/knot/events/handlers/dnssec.c
+++ b/src/knot/events/handlers/dnssec.c
@@ -55,9 +55,11 @@ void event_dnssec_reschedule(conf_t *conf, zone_t *zone,
 
 	zone_events_schedule_at(zone,
 		ZONE_EVENT_DNSSEC, refresh_at ? (time_t)refresh_at : ignore,
-		ZONE_EVENT_DS_CHECK, refresh->plan_ds_check ? now : ignore,
-		ZONE_EVENT_NOTIFY, zone_changed ? now : ignore
+		ZONE_EVENT_DS_CHECK, refresh->plan_ds_check ? now : ignore
 	);
+	if (zone_changed) {
+		zone_schedule_notify(zone, 0);
+	}
 }
 
 int event_dnssec(conf_t *conf, zone_t *zone)
diff --git a/src/knot/events/handlers/load.c b/src/knot/events/handlers/load.c
index 8b15625057..f9001c54e4 100644
--- a/src/knot/events/handlers/load.c
+++ b/src/knot/events/handlers/load.c
@@ -1,4 +1,4 @@
-/*  Copyright (C) 2021 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
+/*  Copyright (C) 2022 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -346,7 +346,7 @@ load_end:
 	replan_from_timers(conf, zone);
 
 	if (!zone_timers_serial_notified(&zone->timers, new_serial)) {
-		zone_events_schedule_now(zone, ZONE_EVENT_NOTIFY);
+		zone_schedule_notify(zone, 0);
 	}
 
 	return KNOT_EOK;
diff --git a/src/knot/events/handlers/notify.c b/src/knot/events/handlers/notify.c
index 5bf2f7c1f7..45be77883b 100644
--- a/src/knot/events/handlers/notify.c
+++ b/src/knot/events/handlers/notify.c
@@ -16,6 +16,7 @@
 
 #include <assert.h>
 
+#include "contrib/openbsd/siphash.h"
 #include "knot/common/log.h"
 #include "knot/conf/conf.h"
 #include "knot/query/query.h"
@@ -23,6 +24,15 @@
 #include "knot/zone/zone.h"
 #include "libknot/errcode.h"
 
+static notifailed_rmt_hash notifailed_hash(conf_val_t *rmt_id)
+{
+	SIPHASH_KEY zero_key = { 0, 0 };
+	SIPHASH_CTX ctx;
+	SipHash24_Init(&ctx, &zero_key);
+	SipHash24_Update(&ctx, rmt_id->data, rmt_id->len);
+	return SipHash24_End(&ctx);
+}
+
 /*!
  * \brief NOTIFY message processing data.
  */
@@ -77,7 +87,7 @@ static const knot_layer_api_t NOTIFY_API = {
 	       (reused), fmt, ## __VA_ARGS__)
 
 static int send_notify(conf_t *conf, zone_t *zone, const knot_rrset_t *soa,
-                       const conf_remote_t *slave, int timeout)
+                       const conf_remote_t *slave, int timeout, bool retry)
 {
 	struct notify_data data = {
 		.zone = zone->name,
@@ -107,20 +117,22 @@ static int send_notify(conf_t *conf, zone_t *zone, const knot_rrset_t *soa,
 
 	int ret = knot_requestor_exec(&requestor, req, timeout);
 
+	const char *log_retry = retry ? "retry, " : "";
+
 	if (ret == KNOT_EOK && knot_pkt_ext_rcode(req->resp) == 0) {
 		NOTIFY_OUT_LOG(LOG_INFO, zone->name, dst,
 		               requestor.layer.flags & KNOT_REQUESTOR_REUSED,
-		               "serial %u", knot_soa_serial(soa->rrs.rdata));
+		               "%sserial %u", log_retry, knot_soa_serial(soa->rrs.rdata));
 		zone->timers.last_notified_serial = (knot_soa_serial(soa->rrs.rdata) | LAST_NOTIFIED_SERIAL_VALID);
 	} else if (knot_pkt_ext_rcode(req->resp) == 0) {
 		NOTIFY_OUT_LOG(LOG_WARNING, zone->name, dst,
 		               requestor.layer.flags & KNOT_REQUESTOR_REUSED,
-		               "failed (%s)", knot_strerror(ret));
+		               "%sfailed (%s)", log_retry, knot_strerror(ret));
 	} else {
 		NOTIFY_OUT_LOG(LOG_WARNING, zone->name, dst,
 		               requestor.layer.flags & KNOT_REQUESTOR_REUSED,
-		               "server responded with error '%s'",
-		               knot_pkt_ext_rcode_name(req->resp));
+		               "%sserver responded with error '%s'",
+		               log_retry, knot_pkt_ext_rcode_name(req->resp));
 	}
 
 	knot_request_free(req, NULL);
@@ -143,11 +155,20 @@ int event_notify(conf_t *conf, zone_t *zone)
 	int timeout = conf->cache.srv_tcp_remote_io_timeout;
 	knot_rrset_t soa = node_rrset(zone->contents->apex, KNOT_RRTYPE_SOA);
 
+	// in case of re-try, NOTIFY only failed remotes
+	bool retry = (zone->notifailed.size > 0);
+
 	// send NOTIFY to each remote, use working address
 	conf_val_t notify = conf_zone_get(conf, C_NOTIFY, zone->name);
 	conf_mix_iter_t iter;
 	conf_mix_iter_init(conf, &notify, &iter);
 	while (iter.id->code == KNOT_EOK) {
+		notifailed_rmt_hash rmt_hash = notifailed_hash(iter.id);
+		if (retry && notifailed_rmt_dynarray_bsearch(&zone->notifailed, &rmt_hash) == NULL) {
+			conf_mix_iter_next(&iter);
+			continue;
+		}
+
 		conf_val_t addr = conf_id_get(conf, C_RMT, C_ADDR, iter.id);
 		size_t addr_count = conf_val_count(&addr);
 
@@ -155,18 +176,27 @@ int event_notify(conf_t *conf, zone_t *zone)
 
 		for (int i = 0; i < addr_count; i++) {
 			conf_remote_t slave = conf_remote(conf, iter.id, i);
-			ret = send_notify(conf, zone, &soa, &slave, timeout);
+			ret = send_notify(conf, zone, &soa, &slave, timeout, retry);
 			if (ret == KNOT_EOK) {
 				break;
 			}
 		}
 
+
 		if (ret != KNOT_EOK) {
 			failed = true;
+			notifailed_rmt_dynarray_add(&zone->notifailed, &rmt_hash);
+		} else {
+			notifailed_rmt_dynarray_remove(&zone->notifailed, &rmt_hash);
 		}
 
 		conf_mix_iter_next(&iter);
 	}
 
+	if (failed) {
+		notifailed_rmt_dynarray_sort_dedup(&zone->notifailed);
+		zone_events_schedule_at(zone, ZONE_EVENT_NOTIFY, time(NULL) + 10); // FIXME exponential backoff of NOTIFY retries
+	}
+
 	return failed ? KNOT_ERROR : KNOT_EOK;
 }
diff --git a/src/knot/events/handlers/refresh.c b/src/knot/events/handlers/refresh.c
index b610094afa..bff5494f98 100644
--- a/src/knot/events/handlers/refresh.c
+++ b/src/knot/events/handlers/refresh.c
@@ -1353,7 +1353,7 @@ int event_refresh(conf_t *conf, zone_t *zone)
 	/* Reschedule events. */
 	replan_from_timers(conf, zone);
 	if (trctx.send_notify) {
-		zone_events_schedule_at(zone, ZONE_EVENT_NOTIFY, time(NULL) + 1);
+		zone_schedule_notify(zone, 1);
 	}
 
 	return ret;
diff --git a/src/knot/events/handlers/update.c b/src/knot/events/handlers/update.c
index b998eb57d3..f337eb5bee 100644
--- a/src/knot/events/handlers/update.c
+++ b/src/knot/events/handlers/update.c
@@ -211,7 +211,7 @@ static void process_requests(conf_t *conf, zone_t *zone, list_t *requests)
 	              "%.02f seconds", old_serial, new_serial,
 	              time_diff_ms(&t_start, &t_end) / 1000.0);
 
-	zone_events_schedule_at(zone, ZONE_EVENT_NOTIFY, time(NULL) + 1);
+	zone_schedule_notify(zone, 1);
 }
 
 static int remote_forward(conf_t *conf, knot_request_t *request, conf_remote_t *remote)
diff --git a/src/knot/events/replan.c b/src/knot/events/replan.c
index da91518cde..e69e67fd7c 100644
--- a/src/knot/events/replan.c
+++ b/src/knot/events/replan.c
@@ -15,6 +15,7 @@
  */
 
 #include <assert.h>
+#include <time.h>
 
 #include "knot/dnssec/kasp/kasp_db.h"
 #include "knot/events/replan.h"
@@ -47,6 +48,8 @@ static void replan_ddns(zone_t *zone, zone_t *old_zone)
 
 /*!
  * \brief Replan events that are already planned for the old zone.
+ *
+ * \notice Preserves notifailed.
  */
 static void replan_from_zone(zone_t *zone, zone_t *old_zone)
 {
@@ -193,6 +196,7 @@ void replan_load_current(conf_t *conf, zone_t *zone, zone_t *old_zone)
 
 void replan_load_updated(zone_t *zone, zone_t *old_zone)
 {
+	zone_notifailed_clear(zone);
 	replan_from_zone(zone, old_zone);
 
 	// other events will cascade from load
diff --git a/src/knot/zone/zone.c b/src/knot/zone/zone.c
index e8f0afeadc..e2c973f840 100644
--- a/src/knot/zone/zone.c
+++ b/src/knot/zone/zone.c
@@ -17,6 +17,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/stat.h>
+#include <time.h>
 #include <urcu.h>
 
 #include "knot/common/log.h"
@@ -43,6 +44,8 @@
 #define JOURNAL_LOCK_RW pthread_mutex_lock(JOURNAL_LOCK_MUTEX);
 #define JOURNAL_UNLOCK_RW pthread_mutex_unlock(JOURNAL_LOCK_MUTEX);
 
+knot_dynarray_define(notifailed_rmt, notifailed_rmt_hash, DYNARRAY_VISIBILITY_NORMAL);
+
 static void free_ddns_queue(zone_t *zone)
 {
 	ptrnode_t *node, *nxt;
@@ -359,6 +362,17 @@ bool zone_journal_has_zij(zone_t *zone)
 	return exists && zij;
 }
 
+void zone_notifailed_clear(zone_t *zone)
+{
+	notifailed_rmt_dynarray_free(&zone->notifailed);
+}
+
+void zone_schedule_notify(zone_t *zone, time_t delay)
+{
+	zone_notifailed_clear(zone);
+	zone_events_schedule_at(zone, ZONE_EVENT_NOTIFY, time(NULL) + delay);
+}
+
 zone_contents_t *zone_switch_contents(zone_t *zone, zone_contents_t *new_contents)
 {
 	if (zone == NULL) {
diff --git a/src/knot/zone/zone.h b/src/knot/zone/zone.h
index 4eafe5fd3f..c0e082ed28 100644
--- a/src/knot/zone/zone.h
+++ b/src/knot/zone/zone.h
@@ -27,6 +27,7 @@
 #include "knot/zone/contents.h"
 #include "knot/zone/timers.h"
 #include "libknot/dname.h"
+#include "libknot/dynarray.h"
 #include "libknot/packet/pkt.h"
 
 struct zone_update;
@@ -48,6 +49,12 @@ typedef enum {
 	ZONE_XFR_FROZEN     = 1 << 7, /*!< Outgoing AXFR/IXFR temporarily disabled. */
 } zone_flag_t;
 
+/*!
+ * \brief Track unsuccessful NOTIFY trgets.
+ */
+typedef uint64_t notifailed_rmt_hash;
+knot_dynarray_declare(notifailed_rmt, notifailed_rmt_hash, DYNARRAY_VISIBILITY_NORMAL, 4);
+
 /*!
  * \brief Structure for holding DNS zone.
  */
@@ -74,6 +81,9 @@ typedef struct zone
 	zone_timers_t timers;      //!< Persistent zone timers.
 	zone_events_t events;      //!< Zone events timers.
 
+	/*! \brief Track unsuccessful NOTIFY targets. */
+	notifailed_rmt_dynarray_t notifailed;
+
 	/*! \brief DDNS queue and lock. */
 	pthread_mutex_t ddns_lock;
 	size_t ddns_queue_size;
@@ -166,6 +176,12 @@ int zone_flush_journal(conf_t *conf, zone_t *zone, bool verbose);
 
 bool zone_journal_has_zij(zone_t *zone);
 
+/*!
+ * \brief Clear failed_notify list before planning new NOTIFY.
+ */
+void zone_notifailed_clear(zone_t *zone);
+void zone_schedule_notify(zone_t *zone, time_t delay);
+
 /*!
  * \brief Atomically switch the content of the zone.
  */
diff --git a/src/knot/zone/zonedb-load.c b/src/knot/zone/zonedb-load.c
index eeff90d1f2..2bdb231554 100644
--- a/src/knot/zone/zonedb-load.c
+++ b/src/knot/zone/zonedb-load.c
@@ -96,6 +96,8 @@ static void replan_events(conf_t *conf, zone_t *zone, zone_t *old_zone)
 		replan_load_updated(zone, old_zone);
 	} else {
 		zone->zonefile = old_zone->zonefile;
+		memcpy(&zone->notifailed, &old_zone->notifailed, sizeof(zone->notifailed));
+		memset(&old_zone->notifailed, 0, sizeof(zone->notifailed));
 		replan_load_current(conf, zone, old_zone);
 	}
 }
diff --git a/src/libknot/dynarray.h b/src/libknot/dynarray.h
index e9f699ef78..7ea66f930e 100644
--- a/src/libknot/dynarray.h
+++ b/src/libknot/dynarray.h
@@ -52,6 +52,12 @@
 	visibility ntype *prefix ## _dynarray_arr(prefix ## _dynarray_t *dynarray); \
 	visibility ntype *prefix ## _dynarray_add(prefix ## _dynarray_t *dynarray, \
 	                                        ntype const *to_add); \
+	visibility void prefix ## _dynarray_remove(prefix ## _dynarray_t *dynarray, \
+	                                        ntype const *to_remove); \
+	visibility void prefix ## _dynarray_sort(prefix ## _dynarray_t *dynarray); \
+	visibility ntype *prefix ## _dynarray_bsearch(prefix ## _dynarray_t *dynarray, \
+	                                        const ntype *bskey); \
+	visibility void prefix ## _dynarray_sort_dedup(prefix ## _dynarray_t *dynarray); \
 	visibility void prefix ## _dynarray_free(prefix ## _dynarray_t *dynarray);
 
 #define knot_dynarray_foreach(prefix, ntype, ptr, array) \
@@ -134,8 +140,8 @@
 		} /* TODO enable lowering capacity, take care of capacity going back to initial! */ \
 	} \
 	\
-	_unused_ \
-	static int prefix ## _dynarray_memb_cmp(const void *a, const void *b) { \
+	static int prefix ## _dynarray_memb_cmp(const void *a, const void *b) \
+	{ \
 		return memcmp(a, b, sizeof(ntype)); \
 	} \
 	\
diff --git a/tests-extra/tests/notify/retry/test.py b/tests-extra/tests/notify/retry/test.py
new file mode 100644
index 0000000000..5ffb932766
--- /dev/null
+++ b/tests-extra/tests/notify/retry/test.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+'''Test for NOTIFY retry selectively for failed servers'''
+
+from dnstest.test import Test
+
+t = Test(tsig=True)
+
+master = t.server("knot")
+slave1 = t.server("knot")
+slave2 = t.server("knot")
+slave3 = t.server("knot")
+
+zone = t.zone_rnd(1, records=300)
+
+t.link(zone, master, slave1)
+t.link(zone, master, slave2)
+t.link(zone, master, slave3)
+
+t.start()
+
+serial = master.zone_wait(zone)
+slave1.zone_wait(zone)
+slave2.zone_wait(zone)
+slave3.zone_wait(zone)
+
+# disable notify to slave1 and slave2
+
+master.disable_notify = True
+slave1.gen_confile()
+slave3.gen_confile()
+slave1.reload()
+slave3.reload()
+t.sleep(3)
+
+master.random_ddns(zone)
+master.zone_wait(zone, serial)
+slave2.zone_wait(zone, serial)
+
+t.sleep(1)
+
+# check that slave1 and slave2 haven't been notified
+
+slave1.zone_wait(zone, serial, equal=True, greater=False)
+slave3.zone_wait(zone, serial, equal=True, greater=False)
+
+# check that slave1 has been re-notified
+
+master.disable_notify = False
+slave1.gen_confile()
+slave1.reload()
+
+slave1.zone_wait(zone, serial)
+
+# change port on slave3 and check that slave3 has been re-notified upon master restart
+
+slave2.stop()
+
+slave3.port = slave2.port
+master.gen_confile()
+master.reload()
+
+t.sleep(5)
+
+slave3.gen_confile()
+slave3.stop()
+slave3.start()
+
+t.sleep(2)
+
+slave3.zone_wait(zone, serial)
+
+t.end()
-- 
GitLab