From 5b383a2bb79809f93285a8b42804eacc82786a93 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vladim=C3=ADr=20=C4=8Cun=C3=A1t?= <vladimir.cunat@nic.cz>
Date: Thu, 9 Nov 2017 17:32:40 +0100
Subject: [PATCH] add packet cache - only direct NAME+TYPE hit ATM

Used for answers that are BOGUS, or negative (NODATA/NXDOMAIN)
and either insecure or unvalidated.

FIXME: review, opt-out, etc.
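
The packet is stored as a single 'E' entry under its exact NAME+TYPE
key, roughly (a sketch; struct entry_h in lib/cache.c is authoritative):

    struct entry_h eh;       /* time, ttl, rank; is_packet = true */
    uint16_t pkt_len;        /* length of the stored wire format */
    uint8_t  wire[pkt_len];  /* the message, re-parsed on a cache hit */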
---
 lib/cache.c   | 533 +++++++++++++++++++++++++++++++++++++-------------
 lib/resolve.h |   7 +-
 2 files changed, 400 insertions(+), 140 deletions(-)

diff --git a/lib/cache.c b/lib/cache.c
index d1fa02105..dd69eea83 100644
--- a/lib/cache.c
+++ b/lib/cache.c
@@ -162,7 +162,7 @@ int kr_cache_clear(struct kr_cache *cache)
  * 'E' entry (exact hit):
  *	- ktype == NS: multiple chained entry_h, based on has_* : 1 flags;
  *		TODO: NSEC3 chain descriptors (iff nsec3_cnt > 0)
- *	- is_negative: uint16_t length, otherwise opaque ATM;
+ *	- is_packet: uint16_t length, otherwise opaque ATM;
  *	- otherwise RRset + its RRSIG set (possibly empty).
  * */
 struct entry_h {
@@ -170,7 +170,7 @@ struct entry_h {
 	uint32_t ttl;	/**< TTL at inception moment.  Assuming it fits into int32_t ATM. */
 	uint8_t  rank;	/**< See enum kr_rank */
 
-	bool is_negative : 1;	/**< Negative-answer packet for insecure/bogus name. */
+	bool is_packet : 1;	/**< Negative-answer packet for insecure/bogus name. */
 
 	unsigned nsec1_pos : 2;	/**< Only used for NS ktype. */
 	unsigned nsec3_cnt : 2;	/**< Only used for NS ktype. */
@@ -200,13 +200,16 @@ static struct entry_h * entry_h_consistent(knot_db_val_t data, uint16_t ktype)
 	const struct entry_h *eh = data.data;
 	bool ok = true;
 
+	ok = ok && (!kr_rank_test(eh->rank, KR_RANK_BOGUS)
+		    || eh->is_packet);
+
 	switch (ktype) {
 	case KNOT_RRTYPE_NSEC:
-		ok = ok && !(eh->is_negative || eh->has_ns || eh->has_cname
+		ok = ok && !(eh->is_packet || eh->has_ns || eh->has_cname
 				|| eh->has_dname);
 		break;
 	default:
-		if (eh->is_negative)
+		if (eh->is_packet)
 			ok = ok && !kr_rank_test(eh->rank, KR_RANK_SECURE);
 	}
 
@@ -432,7 +435,7 @@ static int entry_h_len(const knot_db_val_t val)
 	const void *d = eh->data; /* iterates over the data in entry */
 	const void *data_bound = val.data + val.len;
 	if (d >= data_bound) return kr_error(EILSEQ);
-	if (!eh->is_negative) { /* Positive RRset + its RRsig set (may be empty). */
+	if (!eh->is_packet) { /* Positive RRset + its RRsig set (may be empty). */
 		int sets = 2;
 		while (sets-- > 0) {
 			if (d + 1 > data_bound) return kr_error(EILSEQ);
@@ -511,10 +514,10 @@ static int entry_h_seek(knot_db_val_t *val, uint16_t type)
 		int len = entry_h_len(*val);
 		if (len < 0 || len > val->len) {
 			return kr_error(len < 0 ? len : EILSEQ);
+			// LATER: recovery, perhaps via removing the entry?
 		}
 		val->data += len;
 		val->len -= len;
-		// LATER: recovery, perhaps via removing the entry?
 	}
 	return kr_ok();
 }
@@ -622,15 +625,12 @@ static bool check_dname_for_lf(const knot_dname_t *n)
 	return knot_dname_size(n) == strlen((const char *)n) + 1;
 }
 
-
-/** TODO */
-static knot_db_val_t key_exact_type(struct key *k, uint16_t type)
+/** Like key_exact_type() but omits a couple of checks that don't hold for the pkt cache. */
+static knot_db_val_t key_exact_type_maypkt(struct key *k, uint16_t type)
 {
+	assert(!knot_rrtype_is_metatype(type));
 	switch (type) {
-	/* Sanity check: forbidden types represented in other way(s). */
-	case KNOT_RRTYPE_RRSIG:
-	case KNOT_RRTYPE_NSEC:
-	case KNOT_RRTYPE_NSEC3:
+	case KNOT_RRTYPE_RRSIG: /* no RRSIG query caching, at least for now */
 		assert(false);
 		return (knot_db_val_t){};
 	/* xNAME lumped into NS. */
@@ -650,6 +650,19 @@ static knot_db_val_t key_exact_type(struct key *k, uint16_t type)
 	return (knot_db_val_t){ k->buf + 1, name_len + 4 };
 }
 
+/** Form an exact-hit key; additionally forbids types represented in other ways (NSEC*). */
+static knot_db_val_t key_exact_type(struct key *k, uint16_t type)
+{
+	switch (type) {
+	/* Sanity check: forbidden types represented in other way(s). */
+	case KNOT_RRTYPE_NSEC:
+	case KNOT_RRTYPE_NSEC3:
+		assert(false);
+		return (knot_db_val_t){};
+	}
+	return key_exact_type_maypkt(k, type);
+}
+
 /** TODO
  * \param add_wildcard Act as if the name was extended by "*."
  */
@@ -762,7 +775,6 @@ static const char * find_leq_NSEC1(struct kr_cache *cache, const struct kr_query
 		*exact_match = is_exact;
 	}
 	const struct entry_h *eh = entry_h_consistent(val, KNOT_RRTYPE_NSEC);
-	void *eh_data_bound = val.data + val.len;
 	if (!eh) {
 		/* This might be just finding something else than NSEC1 entry,
 		 * in case we searched before the very first one in the zone. */
@@ -873,7 +885,7 @@ int cache_lmdb_peek(kr_layer_t *ctx, knot_pkt_t *pkt)
 	struct kr_query *qry = req->current_query;
 	struct kr_cache *cache = &req->ctx->cache;
 
-	if (ctx->state & (KR_STATE_FAIL|KR_STATE_DONE) || (qry->flags.NO_CACHE)
+	if (ctx->state & (KR_STATE_FAIL|KR_STATE_DONE) || qry->flags.NO_CACHE
 	    || qry->sclass != KNOT_CLASS_IN) {
 		return ctx->state; /* Already resolved/failed or already tried, etc. */
 	}
@@ -883,7 +895,6 @@ int cache_lmdb_peek(kr_layer_t *ctx, knot_pkt_t *pkt)
 	 * as CNAME chains need more iterations to get fetched. */
 	qry->flags.NO_CACHE = true;
 
-
 	struct key k_storage, *k = &k_storage;
 	if (!check_dname_for_lf(qry->sname)) {
 		return ctx->state;
@@ -902,7 +913,7 @@ int cache_lmdb_peek(kr_layer_t *ctx, knot_pkt_t *pkt)
 	if (qry->stype == KNOT_RRTYPE_RRSIG) {
 		return ctx->state; /* LATER: some other behavior for this STYPE? */
 	}
-	knot_db_val_t key = key_exact_type(k, qry->stype);
+	knot_db_val_t key = key_exact_type_maypkt(k, qry->stype);
 	knot_db_val_t val = { };
 	ret = cache_op(cache, read, &key, &val, 1);
 	switch (ret) {
@@ -915,7 +926,7 @@ int cache_lmdb_peek(kr_layer_t *ctx, knot_pkt_t *pkt)
 					ret, strerror(abs(ret)));
 			return ctx->state;
 		}
-		VERBOSE_MSG(qry, "=> satisfied from cache (direct positive hit)\n");
+		VERBOSE_MSG(qry, "=> satisfied from cache (direct hit)\n");
 		return KR_STATE_DONE;
 	case (-abs(ENOENT)):
 		break;
@@ -1019,7 +1030,7 @@ int cache_lmdb_peek(kr_layer_t *ctx, knot_pkt_t *pkt)
 		case 1: {
  			/* find a previous-or-equal name+NSEC in cache covering
 			 * the QNAME, checking TTL etc. */
-			
+
 			ans.nsec_v = 1;
 			//nsec_leq()
 			knot_db_val_t key = key_NSEC1(k, qry->sname, false);
@@ -1203,7 +1214,7 @@ int cache_lmdb_peek(kr_layer_t *ctx, knot_pkt_t *pkt)
 		void *eh_data_bound = val.data + val.len;
 
 		int32_t new_ttl = get_new_ttl(eh, qry->creation_time.tv_sec);
-		if (new_ttl < 0 || eh->rank < lowest_rank || eh->is_negative) {
+		if (new_ttl < 0 || eh->rank < lowest_rank || eh->is_packet) {
 			return ctx->state;
 		}
 		ret = entry2answer(&ans, AR_SOA, eh, eh_data_bound,
@@ -1258,11 +1269,182 @@ int cache_lmdb_peek(kr_layer_t *ctx, knot_pkt_t *pkt)
 	qry->flags.EXPIRING = expiring;
 	qry->flags.CACHED = true;
 	qry->flags.NO_MINIMIZE = true;
-	
+
 	return KR_STATE_DONE;
 }
 
 
+/** Prepare space to insert an entry.
+ *
+ * Some checks are performed (rank, TTL); the current entry set in the cache
+ * is copied with a hole ready for the new entry (an old entry of the same
+ * type is cut out).
+ *
+ * \param val_new_entry The only changing parameter; ->len is read, ->data written.
+ * Beware: the entry_h at val_new_entry->data is zeroed, and in some cases
+ * some of its flags are pre-set - the caller must not blindly overwrite them.
+ * All flags except is_packet are sensitive in this way.
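+ *
+ * Resulting value layout, as a rough sketch (not byte-exact):
+ *
+ *   | orig. data before entry | new entry (val_new_entry->len) | orig. data after |
+ *   ^-- val.data                ^-- val_new_entry->data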
+ */
+static int entry_h_splice(
+	knot_db_val_t *val_new_entry, uint8_t rank,
+	const knot_db_val_t key, const uint16_t ktype, const uint16_t type,
+	const knot_dname_t *owner/*log only*/,
+	const struct kr_query *qry, struct kr_cache *cache)
+{
+	/* Find the whole entry-set and the particular entry within. */
+	knot_db_val_t val_orig_all = { }, val_orig_entry = { };
+	const struct entry_h *eh_orig = NULL;
+	if (!kr_rank_test(rank, KR_RANK_SECURE) || ktype == KNOT_RRTYPE_NS) {
+		int ret = cache_op(cache, read, &key, &val_orig_all, 1);
+		if (ret) val_orig_all = (knot_db_val_t){ };
+		val_orig_entry = val_orig_all;
+		switch (entry_h_seek(&val_orig_entry, type)) {
+		case 0:
+			ret = entry_h_len(val_orig_entry);
+			if (ret >= 0) {
+				val_orig_entry.len = ret;
+				eh_orig = entry_h_consistent(val_orig_entry, type);
+				if (eh_orig) {
+					break;
+				}
+			} /* otherwise fall through */
+		default:
+			val_orig_entry = val_orig_all = (knot_db_val_t){};
+		case -ENOENT:
+			val_orig_entry.len = 0;
+			break;
+		};
+	}
+
+	if (!kr_rank_test(rank, KR_RANK_SECURE) && eh_orig) {
+		/* If equal rank was accepted, spoofing a *single* answer would be
+		 * enough to e.g. override NS record in AUTHORITY section.
+		 * This way they would have to hit the first answer
+		 * (whenever TTL nears expiration). */
+		int32_t old_ttl = get_new_ttl(eh_orig, qry->creation_time.tv_sec);
+		if (old_ttl > 0 && !is_expiring(old_ttl, eh_orig->ttl)
+		    && rank <= eh_orig->rank) {
+			WITH_VERBOSE {
+				VERBOSE_MSG(qry, "=> not overwriting ");
+				kr_rrtype_print(type, "", " ");
+				kr_dname_print(owner, "", "\n");
+			}
+			return kr_error(EEXIST);
+		}
+	}
+
+	/* Obtain new storage from LMDB.
+	 * Note: this does NOT invalidate val_orig_all.data. */
+	ssize_t storage_size = val_orig_all.len - val_orig_entry.len
+				+ val_new_entry->len;
+	assert(storage_size > 0);
+	knot_db_val_t val = { .len = storage_size, .data = NULL };
+	int ret = cache_op(cache, write, &key, &val, 1);
+	if (ret || !val.data || !val.len) {
+		assert(ret); /* otherwise "succeeding" but `val` is bad */
+		VERBOSE_MSG(qry, "=> failed LMDB write, ret = %d\n", ret);
+		return kr_error(ret ? ret : ENOSPC);
+	}
+
+	/* Write original data before entry, if any. */
+	const ssize_t len_before = val_orig_entry.data - val_orig_all.data;
+	assert(len_before >= 0);
+	if (len_before) {
+		memcpy(val.data, val_orig_all.data, len_before);
+	}
+	/* Write original data after entry, if any. */
+	const ssize_t len_after = val_orig_all.len - val_orig_entry.len;
+	assert(len_after >= 0);
+	if (len_after) {
+		memcpy(val.data + len_before + val_new_entry->len,
+			val_orig_entry.data + val_orig_entry.len, len_after);
+	}
+
+	val_new_entry->data = val.data + len_before;
+	{
+		struct entry_h *eh = val_new_entry->data;
+		memset(eh, 0, offsetof(struct entry_h, data));
+		/* If (len_before == 0 && ktype == KNOT_RRTYPE_NS), the flags set
+		 * below would otherwise land in uninitialized memory, and the
+		 * caller couldn't zero *eh after return without overwriting them. */
+	}
+	/* The multi-entry type needs adjusting the flags. */
+	if (ktype == KNOT_RRTYPE_NS) {
+		struct entry_h *eh = val.data;
+		if (val_orig_all.len) {
+			const struct entry_h *eh0 = val_orig_all.data;
+			/* ENTRY_H_FLAGS */
+			eh->nsec1_pos = eh0->nsec1_pos;
+			eh->nsec3_cnt = eh0->nsec3_cnt;
+			eh->has_ns    = eh0->has_ns;
+			eh->has_cname = eh0->has_cname;
+			eh->has_dname = eh0->has_dname;
+		}
+		/* we just added/replaced some type */
+		switch (type) {
+		case KNOT_RRTYPE_NS:
+			eh->has_ns = true;  break;
+		case KNOT_RRTYPE_CNAME:
+			eh->has_cname = true;  break;
+		case KNOT_RRTYPE_DNAME:
+			eh->has_dname = true;  break;
+		default:
+			assert(false);
+		}
+	}
+	return kr_ok();
+}
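+/* A typical call pattern, as a sketch (it mirrors the two call sites below):
+ *
+ *	knot_db_val_t v = { .data = NULL, .len = new_entry_len };
+ *	ret = entry_h_splice(&v, rank, key, ktype, type, owner, qry, cache);
+ *	if (ret) return ...;            // e.g. EEXIST isn't a real error here
+ *	struct entry_h *eh = v.data;    // zeroed; NS-type flags may be pre-set
+ *	eh->time = qry->timestamp.tv_sec;
+ *	// ... fill ttl, rank and data, without clobbering pre-set flags
+ */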
+
+
+static const uint32_t DEFAULT_MAXTTL = 15 * 60,
+	DEFAULT_NOTTL = 5; /* Short-time "no data" retention to avoid bursts */
+
+static inline uint32_t limit_ttl(uint32_t ttl)
+{
+	/* @todo Configurable limit */
+	return (ttl > DEFAULT_MAXTTL) ? DEFAULT_MAXTTL : ttl;
+}
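+/* E.g. limit_ttl(86400) caps a one-day TTL to DEFAULT_MAXTTL (15 minutes). */
+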
+/** Compute TTL for a packet: generally the minimum TTL over its records,
+ * though negative answers take the SOA-derived TTL instead. */
+static uint32_t packet_ttl(const knot_pkt_t *pkt, bool is_negative)
+{
+	bool has_ttl = false;
+	uint32_t ttl = UINT32_MAX;
+	/* Find minimum entry TTL in the packet or SOA minimum TTL. */
+	for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) {
+		const knot_pktsection_t *sec = knot_pkt_section(pkt, i);
+		for (unsigned k = 0; k < sec->count; ++k) {
+			const knot_rrset_t *rr = knot_pkt_rr(sec, k);
+			if (is_negative) {
+				/* Use SOA minimum TTL for negative answers. */
+				if (rr->type == KNOT_RRTYPE_SOA) {
+					return limit_ttl(MIN(knot_rrset_ttl(rr), knot_soa_minimum(&rr->rrs)));
+				} else {
+					continue; /* for negative answers, only the SOA matters */
+				}
+			}
+			if (knot_rrtype_is_metatype(rr->type)) {
+				continue; /* Skip metatypes. */
+			}
+			/* Find minimum TTL in the record set */
+			knot_rdata_t *rd = rr->rrs.data;
+			for (uint16_t j = 0; j < rr->rrs.rr_count; ++j) {
+				if (knot_rdata_ttl(rd) < ttl) {
+					ttl = limit_ttl(knot_rdata_ttl(rd));
+					has_ttl = true;
+				}
+				rd = kr_rdataset_next(rd);
+			}
+		}
+	}
+	/* Get default if no valid TTL present */
+	if (!has_ttl) {
+		ttl = DEFAULT_NOTTL;
+	}
+	return limit_ttl(ttl);
+}
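+/* Worked example (illustrative values): an NXDOMAIN whose SOA has TTL 3600
+ * and SOA-minimum 300 yields packet_ttl(pkt, true) == 300; an answer with
+ * no usable TTL source falls back to DEFAULT_NOTTL (5 s). */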
+
+
 static int stash_rrset(const ranked_rr_array_t *arr, int arr_i, uint32_t min_ttl,
 			const struct kr_query *qry, struct kr_cache *cache);
 
@@ -1272,14 +1454,19 @@ int cache_lmdb_stash(kr_layer_t *ctx, knot_pkt_t *pkt)
 	struct kr_query *qry = req->current_query;
 	struct kr_cache *cache = &req->ctx->cache;
 
-	if (!qry || ctx->state & KR_STATE_FAIL || qry->flags.CACHED) {
+	uint16_t pkt_type = knot_pkt_qtype(pkt);
+	const bool type_bad = knot_rrtype_is_metatype(pkt_type)
+				|| pkt_type == KNOT_RRTYPE_RRSIG;
+	/* Note: we cache even in KR_STATE_FAIL.  For example, a BOGUS answer
+	 * can go into the +cd cache even without a +cd request. */
+	if (!qry || qry->flags.CACHED || type_bad || qry->sclass != KNOT_CLASS_IN) {
 		return ctx->state;
 	}
 	/* Do not cache truncated answers, at least for now.  LATER */
 	if (knot_wire_get_tc(pkt->wire)) {
 		return ctx->state;
 	}
-
+	/* Stash individual records. */
 	const uint32_t min_ttl = MAX(DEFAULT_MINTTL, req->ctx->cache.ttl_min);
 	ranked_rr_array_t *selected[] = kr_request_selected(req);
 	int ret = 0;
@@ -1292,15 +1479,91 @@ int cache_lmdb_stash(kr_layer_t *ctx, knot_pkt_t *pkt)
 				continue;
 				/* TODO: probably safe to break but maybe not worth it */
 			}
-			int ret = stash_rrset(arr, i, min_ttl, qry, cache);
+			ret = stash_rrset(arr, i, min_ttl, qry, cache);
 			if (ret) goto finally;
+			/* LATER(optim.): maybe filter out some type-rank combinations
+			 * that won't be useful as separate RRsets. */
 		}
 	}
+
+	/* In some cases, stash also the packet. */
+	const bool is_negative = kr_response_classify(pkt)
+				& (PKT_NODATA|PKT_NXDOMAIN);
+	const bool want_pkt = qry->flags.DNSSEC_BOGUS
+		|| (is_negative && qry->flags.DNSSEC_INSECURE);
+	if (!want_pkt || !knot_wire_get_aa(pkt->wire)) {
+		goto finally;
+	}
+
+	const uint16_t pkt_size = pkt->size;
+	knot_db_val_t val_new_entry = {
+		.data = NULL,
+		.len = offsetof(struct entry_h, data) + sizeof(pkt_size) + pkt->size,
+	};
+
+	/* Compute rank.  If cd bit is set or we got answer via non-validated
+	 * forwarding, make the rank bad; otherwise it depends on flags.
+	 * TODO: probably make validator attempt validation even with +cd. */
+	uint8_t rank = KR_RANK_AUTH;
+	const bool risky_vldr = is_negative && qry->flags.FORWARD && qry->flags.CNAME;
+		/* ^^ CNAME'ed NXDOMAIN answer in forwarding mode can contain
+		 * unvalidated records; original commit: d6e22f476. */
+	if (knot_wire_get_cd(req->answer->wire) || qry->flags.STUB || risky_vldr) {
+		kr_rank_set(&rank, KR_RANK_OMIT);
+	} else {
+		if (qry->flags.DNSSEC_BOGUS) {
+			kr_rank_set(&rank, KR_RANK_BOGUS);
+		} else if (qry->flags.DNSSEC_INSECURE) {
+			kr_rank_set(&rank, KR_RANK_INSECURE);
+		} else assert(false);
+	}
+
+	const knot_dname_t *owner = knot_pkt_qname(pkt); /* qname can't be compressed */
+	WITH_VERBOSE {
+		VERBOSE_MSG(qry, "=> stashing packet: rank 0%0.2o, ", rank);
+		kr_rrtype_print(pkt_type, "", " ");
+		kr_dname_print(owner, "", " ");
+		kr_log_verbose("(%d B)\n", (int)val_new_entry.len);
+	}
+
+	// TODO: nothing exists under NXDOMAIN
+#if 0
+	if (knot_wire_get_rcode(pkt->wire) == KNOT_RCODE_NXDOMAIN
+	 /* && !qry->flags.DNSSEC_INSECURE */ ) {
+		pkt_type = KNOT_RRTYPE_NS;
+	}
+#endif
+
+	/* Construct the key under which the pkt will be stored. */
+	struct key k_storage, *k = &k_storage;
+	knot_db_val_t key;
+	ret = kr_dname_lf(k->buf, owner, NULL);
+	if (ret) {
+		assert(!ret);
+		goto finally;
+	}
+	key = key_exact_type_maypkt(k, pkt_type);
+
+	/* Prepare raw memory for the new entry and fill it. */
+	ret = entry_h_splice(&val_new_entry, rank, key, k->type, pkt_type,
+				owner, qry, cache);
+	if (ret) goto finally; /* some returns (e.g. EEXIST) aren't really errors */
+	assert(val_new_entry.data);
+	struct entry_h *eh = val_new_entry.data;
+	eh->time = qry->timestamp.tv_sec;
+	eh->ttl  = packet_ttl(pkt, is_negative);
+	eh->rank = rank;
+	eh->is_packet = true;
+	memcpy(eh->data, &pkt_size, sizeof(pkt_size));
+	memcpy(eh->data + sizeof(pkt_size), pkt->wire, pkt_size);
+
 finally:
 	kr_cache_sync(cache);
-	return ret ? ret : ctx->state;
+	return ctx->state; /* we ignore cache-stashing errors */
 }
 
+
+
 /** It's simply inside of cycle taken out to decrease indentation.
  * \return kr_ok() or KR_STATE_FAIL */
 static int stash_rrset(const ranked_rr_array_t *arr, int arr_i, uint32_t min_ttl,
@@ -1310,7 +1573,7 @@ static int stash_rrset(const ranked_rr_array_t *arr, int arr_i, uint32_t min_ttl
 	if (entry->cached) {
 		return kr_ok();
 	}
-	knot_rrset_t *rr = entry->rr;
+	const knot_rrset_t *rr = entry->rr;
 	if (!rr) {
 		assert(false);
 		return KR_STATE_FAIL;
@@ -1323,7 +1586,6 @@ static int stash_rrset(const ranked_rr_array_t *arr, int arr_i, uint32_t min_ttl
 		return kr_ok();
 	}
 
-
 	WITH_VERBOSE {
 		VERBOSE_MSG(qry, "=> considering to stash ");
 		kr_rrtype_print(rr->type, "", " ");
@@ -1333,7 +1595,7 @@ static int stash_rrset(const ranked_rr_array_t *arr, int arr_i, uint32_t min_ttl
 	switch (rr->type) {
 	case KNOT_RRTYPE_RRSIG:
 	case KNOT_RRTYPE_NSEC3:
-		// for now; FIXME
+		// for now; LATER NSEC3
 		return kr_ok();
 	default:
 		break;
@@ -1388,49 +1650,21 @@ static int stash_rrset(const ranked_rr_array_t *arr, int arr_i, uint32_t min_ttl
 		key = key_exact_type(k, rr->type);
 	}
 
-	/* Find the whole entry-set and the particular entry within. */
-	knot_db_val_t val_orig_all = { }, val_orig_entry = { };
-	const struct entry_h *eh_orig = NULL;
-	if (!kr_rank_test(entry->rank, KR_RANK_SECURE) || k->type == KNOT_RRTYPE_NS) {
-		ret = cache_op(cache, read, &key, &val_orig_all, 1);
-		if (ret) val_orig_all = (knot_db_val_t){ };
-		val_orig_entry = val_orig_all;
-		switch (entry_h_seek(&val_orig_entry, rr->type)) {
-		case 0:
-			ret = entry_h_len(val_orig_entry);
-			if (ret >= 0) {
-				val_orig_entry.len = ret;
-				eh_orig = entry_h_consistent(val_orig_entry, rr->type);
-				if (eh_orig) {
-					break;
-				}
-			} /* otherwise fall through */
-		default:
-			val_orig_entry = val_orig_all = (knot_db_val_t){};
-		case -ENOENT:
-			val_orig_entry.len = 0;
-			break;
-		};
-	}
+	/* Compute materialized sizes of the new data. */
+	const knot_rdataset_t *rds_sigs = rr_sigs ? &rr_sigs->rrs : NULL;
+	const int rr_ssize = rdataset_dematerialize_size(&rr->rrs);
+	knot_db_val_t val_new_entry = {
+		.data = NULL,
+		.len = offsetof(struct entry_h, data) + rr_ssize
+			+ rdataset_dematerialize_size(rds_sigs),
+	};
 
-	if (!kr_rank_test(entry->rank, KR_RANK_SECURE) && eh_orig) {
-		/* If equal rank was accepted, spoofing a *single* answer would be
-		 * enough to e.g. override NS record in AUTHORITY section.
-		 * This way they would have to hit the first answer
-		 * (whenever TTL nears expiration). */
-		int32_t old_ttl = get_new_ttl(eh_orig, qry->creation_time.tv_sec);
-		if (old_ttl > 0 && !is_expiring(old_ttl, eh_orig->ttl)
-		    && entry->rank <= eh_orig->rank) {
-			WITH_VERBOSE {
-				VERBOSE_MSG(qry, "=> not overwriting ");
-				kr_rrtype_print(rr->type, "", " ");
-				kr_dname_print(rr->owner, "", "\n");
-			}
-			return kr_ok();
-		}
-	}
+	/* Prepare raw memory for the new entry. */
+	ret = entry_h_splice(&val_new_entry, entry->rank, key, k->type, rr->type,
+				rr->owner, qry, cache);
+	if (ret) return kr_ok(); /* some returns (e.g. EEXIST) aren't really errors */
+	assert(val_new_entry.data);
 
-	const knot_rdataset_t *rds_sigs = rr_sigs ? &rr_sigs->rrs : NULL;
 	/* Compute TTL, just in case they weren't equal. */
 	uint32_t ttl = -1;
 	const knot_rdataset_t *rdatasets[] = { &rr->rrs, rds_sigs, NULL };
@@ -1443,31 +1677,12 @@ static int stash_rrset(const ranked_rr_array_t *arr, int arr_i, uint32_t min_ttl
 		}
 	} /* TODO: consider expirations of RRSIGs as well, just in case. */
 	ttl = MAX(ttl, min_ttl);
-	/* Compute materialized sizes of the new data, and combine with remaining size. */
-	const int rr_ssize = rdataset_dematerialize_size(&rr->rrs);
-	const int entry_size = offsetof(struct entry_h, data) + rr_ssize
-		+ rdataset_dematerialize_size(rds_sigs);
-	size_t storage_size = val_orig_all.len - val_orig_entry.len + entry_size;
-	/* Obtain new storage from LMDB.  Note: this does NOT invalidate val_orig.data. */
-	knot_db_val_t val = { .len = storage_size, .data = NULL };
-	ret = cache_op(cache, write, &key, &val, 1);
-	if (ret || !val.data || !val.len) {
-		assert(ret); /* otherwise "succeeding" but `val` is bad */
-		return kr_ok();
-	}
-	/* Write original data before entry, if any. */
-	const ssize_t len_before = val_orig_entry.data - val_orig_all.data;
-	assert(len_before >= 0);
-	if (len_before) {
-		memcpy(val.data, val_orig_all.data, len_before);
-	}
+
 	/* Write the entry itself. */
-	struct entry_h *eh = val.data + len_before;
-	*eh = (struct entry_h){
-		.time = qry->timestamp.tv_sec,
-		.ttl  = ttl,
-		.rank = entry->rank,
-	};
+	struct entry_h *eh = val_new_entry.data;
+	eh->time = qry->timestamp.tv_sec;
+	eh->ttl  = ttl;
+	eh->rank = entry->rank;
 	if (rdataset_dematerialize(&rr->rrs, eh->data)
 	    || rdataset_dematerialize(rds_sigs, eh->data + rr_ssize)) {
 		/* minimize the damage from incomplete write; TODO: better */
@@ -1475,54 +1690,25 @@ static int stash_rrset(const ranked_rr_array_t *arr, int arr_i, uint32_t min_ttl
 		eh->rank = 0;
 		assert(false);
 	}
-	/* Write original data after entry, if any. */
-	const ssize_t len_after = val_orig_all.len - val_orig_entry.len;
-	assert(len_after >= 0);
-	if (len_after) {
-		memcpy(val.data + len_before + entry_size,
-			val_orig_entry.data + val_orig_entry.len, len_after);
-	}
-	/* The multi-entry type needs adjusting the flags. */
-	if (k->type == KNOT_RRTYPE_NS) {
-		eh = val.data;
-		if (val_orig_all.len) {
-			const struct entry_h *eh0 = val_orig_all.data;
-			/* ENTRY_H_FLAGS */
-			eh->nsec1_pos = eh0->nsec1_pos;
-			eh->nsec3_cnt = eh0->nsec3_cnt;
-			eh->has_ns    = eh0->has_ns;
-			eh->has_cname = eh0->has_cname;
-			eh->has_dname = eh0->has_dname;
-		}
-		/* we just added/replaced some type */
-		switch (rr->type) {
-		case KNOT_RRTYPE_NS:
-			eh->has_ns = true;  break;
-		case KNOT_RRTYPE_CNAME:
-			eh->has_cname = true;  break;
-		case KNOT_RRTYPE_DNAME:
-			eh->has_dname = true;  break;
-		default:
-			assert(false);
-		}
-	}
+
 	WITH_VERBOSE {
 		VERBOSE_MSG(qry, "=> stashed rank: 0%0.2o, ", entry->rank);
 		kr_rrtype_print(rr->type, "", " ");
 		kr_dname_print(rr->owner, "", " ");
-		int sigs = rr_sigs ? rr_sigs->rrs.rr_count : 0;
-		kr_log_verbose("(%d B total, incl. %d RRSIGs)\n", (int)storage_size, sigs);
+		kr_log_verbose("(%d B total, incl. %d RRSIGs)\n",
+				(int)val_new_entry.len,
+				(int)(rr_sigs ? rr_sigs->rrs.rr_count : 0)
+				);
 	}
 	return kr_ok();
 }
 
 
+static int answer_simple_hit(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
+		const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl)
 #define CHECK_RET(ret) do { \
 	if ((ret) < 0) { assert(false); return kr_error((ret)); } \
 } while (false)
-
-static int answer_simple_hit(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
-		const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl)
 {
 	struct kr_request *req = ctx->req;
 	struct kr_query *qry = req->current_query;
@@ -1545,9 +1731,78 @@ static int answer_simple_hit(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
 	qry->flags.EXPIRING = is_expiring(eh->ttl, new_ttl);
 	qry->flags.CACHED = true;
 	qry->flags.NO_MINIMIZE = true;
+	qry->flags.DNSSEC_INSECURE = kr_rank_test(eh->rank, KR_RANK_INSECURE);
+	if (qry->flags.DNSSEC_INSECURE) {
+		qry->flags.DNSSEC_WANT = false;
+	}
 	return kr_ok();
 }
+#undef CHECK_RET
+
+static int answer_from_pkt(kr_layer_t *ctx, knot_pkt_t *pkt, uint16_t type,
+		const struct entry_h *eh, const void *eh_bound, uint32_t new_ttl)
+{
+	struct kr_request *req = ctx->req;
+	struct kr_query *qry = req->current_query;
+
+	uint16_t pkt_len;
+	memcpy(&pkt_len, eh->data, sizeof(pkt_len));
+		/* TODO: more length checks somewhere, maybe entry_h_consistent */
+	if (pkt_len > pkt->max_size) {
+		return kr_error(ENOENT);
+	}
+
+	/* Copy answer and reparse it, but keep the original message id. */
+	uint16_t msgid = knot_wire_get_id(pkt->wire);
+	knot_pkt_clear(pkt);
+	memcpy(pkt->wire, eh->data + sizeof(pkt_len), pkt_len);
+	pkt->size = pkt_len;
+	int ret = knot_pkt_parse(pkt, 0);
+	if (ret != KNOT_EOK) {
+		return kr_error(ret);
+	}
+	knot_wire_set_id(pkt->wire, msgid);
 
+	/* Rank-related fixups.  Add rank into the additional field. */
+	if (kr_rank_test(eh->rank, KR_RANK_INSECURE)) {
+		qry->flags.DNSSEC_INSECURE = true;
+		qry->flags.DNSSEC_WANT = false;
+	}
+	for (size_t i = 0; i < pkt->rrset_count; ++i) {
+		assert(!pkt->rr[i].additional);
+		uint8_t *rr_rank = mm_alloc(&pkt->mm, sizeof(*rr_rank));
+		if (!rr_rank) {
+			return kr_error(ENOMEM);
+		}
+		*rr_rank = eh->rank;
+		pkt->rr[i].additional = rr_rank;
+	}
+
+	/* Adjust TTL in records.  We know that no RR has expired yet. */
+	const uint32_t drift = eh->ttl - new_ttl;
+	for (knot_section_t i = KNOT_ANSWER; i <= KNOT_ADDITIONAL; ++i) {
+		const knot_pktsection_t *sec = knot_pkt_section(pkt, i);
+		for (unsigned k = 0; k < sec->count; ++k) {
+			const knot_rrset_t *rr = knot_pkt_rr(sec, k);
+			knot_rdata_t *rd = rr->rrs.data;
+			for (uint16_t j = 0; j < rr->rrs.rr_count; ++j) {
+				knot_rdata_set_ttl(rd, knot_rdata_ttl(rd) - drift);
+				rd = kr_rdataset_next(rd);
+			}
+		}
+	}
+
+	/* Finishing touches. TODO: perhaps factor out */
+	qry->flags.EXPIRING = is_expiring(eh->ttl, new_ttl);
+	qry->flags.CACHED = true;
+	qry->flags.NO_MINIMIZE = true;
+	qry->flags.DNSSEC_INSECURE = kr_rank_test(eh->rank, KR_RANK_INSECURE);
+	qry->flags.DNSSEC_BOGUS = kr_rank_test(eh->rank, KR_RANK_BOGUS);
+	if (qry->flags.DNSSEC_INSECURE || qry->flags.DNSSEC_BOGUS) {
+		qry->flags.DNSSEC_WANT = false;
+	}
+	return kr_ok();
+}
 
 /** TODO: description; see the single call site for now. */
 static int found_exact_hit(kr_layer_t *ctx, knot_pkt_t *pkt, knot_db_val_t val,
@@ -1569,19 +1824,20 @@ static int found_exact_hit(kr_layer_t *ctx, knot_pkt_t *pkt, knot_db_val_t val,
 	int32_t new_ttl = get_new_ttl(eh, qry->creation_time.tv_sec);
 	if (new_ttl < 0 || eh->rank < lowest_rank) {
 		/* Positive record with stale TTL or bad rank.
-		 * It's unlikely that we find a negative one,
+		 * LATER(optim.): It's unlikely that we find a negative one,
 		 * so we might theoretically skip all the cache code. */
 		return kr_error(ENOENT);
 	}
 
-	if (eh->is_negative) {
-		// insecure zones might have a negative-answer packet here
-		//FIXME
-		assert(false);
+	if (eh->is_packet) {
+		/* Note: we answer here immediately, even if it's (theoretically)
+		 * possible that we could generate a higher-security negative proof.
+		 * The rank is high enough, so we take it and save the search time. */
+		return answer_from_pkt  (ctx, pkt, qry->stype, eh, eh_bound, new_ttl);
+	} else {
+		return answer_simple_hit(ctx, pkt, qry->stype, eh, eh_bound, new_ttl);
 	}
-	return answer_simple_hit(ctx, pkt, qry->stype, eh, eh_bound, new_ttl);
 }
-#undef CHECK_RET
 
 int kr_cache_peek_exact(struct kr_cache *cache, const knot_dname_t *name, uint16_t type,
 			struct kr_cache_p *peek)
@@ -1655,7 +1911,7 @@ static knot_db_val_t closest_NS(kr_layer_t *ctx, struct key *k)
 			/* Check consistency, find any type.
 			 * "break;" goes to shortening by another label */
 			const struct entry_h *eh = entry_h_consistent(val, KNOT_RRTYPE_NS),
-			      	*eh_orig = eh;
+				*eh_orig = eh;
 			const knot_db_val_t val_orig = val;
 			assert(eh);
 			if (!eh) break; // do something about EILSEQ?
@@ -1694,7 +1950,7 @@ static knot_db_val_t closest_NS(kr_layer_t *ctx, struct key *k)
 					assert(false);
 					break;
 				}
-				if (eh->is_negative) continue;
+				if (eh->is_packet) continue;
 				int32_t new_ttl = get_new_ttl(eh, qry->creation_time.tv_sec);
 				if (new_ttl < 0) continue;
 				if (type != KNOT_RRTYPE_NS && eh->rank < rank_min) {
@@ -1738,7 +1994,6 @@ static uint8_t get_lowest_rank(const struct kr_request *req, const struct kr_que
 	const bool allow_unverified = knot_wire_get_cd(req->answer->wire)
 					|| qry->flags.STUB;
 	/* TODO: move rank handling into the iterator (DNSSEC_* flags)? */
-	uint8_t rank  = 0;
 	uint8_t lowest_rank = KR_RANK_INITIAL | KR_RANK_AUTH;
 	if (qry->flags.NONAUTH) {
 		lowest_rank = KR_RANK_INITIAL;
diff --git a/lib/resolve.h b/lib/resolve.h
index 845442b57..4c04505f9 100644
--- a/lib/resolve.h
+++ b/lib/resolve.h
@@ -91,18 +91,23 @@
  *   https://tools.ietf.org/html/rfc4035#section-4.3
  */
 enum kr_rank {
+	/* Initial-like states.  No validation has been attempted (yet). */
 	KR_RANK_INITIAL = 0, /**< Did not attempt to validate. It's assumed
 					compulsory to validate (or prove insecure). */
 	KR_RANK_OMIT,        /**< Do not attempt to validate.
 					(And don't consider it a validation failure.) */
 	KR_RANK_TRY,         /**< Attempt to validate, but failures are non-fatal. */
 
+	/* Failure states.  These have a higher numeric value because they carry more information. */
 	KR_RANK_INDET = 4,   /**< Unable to determine whether it should be secure. */
 	KR_RANK_BOGUS,       /**< Ought to be secure but isn't. */
 	KR_RANK_MISMATCH,
 	KR_RANK_MISSING,     /**< Unable to obtain a good signature. */
 
-	KR_RANK_INSECURE = 8, /**< Proven to be insecure. */
+	/** Proven to be insecure, i.e. we have a chain of trust from TAs
+	 * that cryptographically denies the existence of a positive chain
+	 * of trust from the TAs to the record. */
+	KR_RANK_INSECURE = 8,
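+	/* Note (a sketch of intended use): ranks are compared numerically,
+	 * e.g. eh->rank < lowest_rank in lib/cache.c, while individual levels
+	 * are tested via kr_rank_test(rank, KR_RANK_*). */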
 
 	/** Authoritative data flag; the chain of authority was "verified".
 	 *  Even if not set, only in-bailiwick stuff is acceptable,
-- 
GitLab