Commit 1bc09711 authored by Tony Finch, committed by Daniel Salzman

trie: support for copy-on-write transactions

A COW transaction allows a trie to be used for reading concurrently
while a modified version of the trie is being prepared. The change
can be committed by swapping the new trie root in place of the old one.

Internally, this feature uses one-bit reference counts to identify which
parts of the trie are shared between the old and new versions, which
parts are new-only (so can be mutated) and which parts are old-only
(and will be free()d after commit).
parent e7b7e187
......@@ -441,6 +441,7 @@ tests/contrib/test_dynarray.c
tests/contrib/test_heap.c
tests/contrib/test_net.c
tests/contrib/test_net_shortwrite.c
tests/contrib/test_qp-cow.c
tests/contrib/test_qp-trie.c
tests/contrib/test_siphash.c
tests/contrib/test_sockaddr.c
......
/* Copyright (C) 2016 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
Copyright (C) 2018 Tony Finch <dot@dotat.at>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -36,14 +37,20 @@ typedef uint bitmap_t; /*!< Bit-maps, using the range of 1<<0 to 1<<16 (inclusiv
typedef char static_assert_pointer_fits_in_word
[sizeof(word) >= sizeof(uintptr_t) ? 1 : -1];
#define KEYLENBITS 31
/*! \brief trie keys have lengths
*
* 32 bits are enough for key lengths; probably even 16 bits would be.
* However, a 32 bit length means the alignment will be a multiple of 4,
* allowing us to stash flags in the bottom bits of a pointer to a key.
* However, a 32 bit length means the alignment will be a multiple of
* 4, allowing us to stash the COW and BRANCH flags in the bottom bits
* of a pointer to a key.
*
* We need to steal one bit from the length to keep the COW
* state of key allocations.
*/
typedef struct {
uint32_t len;
uint32_t cow:1, len:KEYLENBITS;
char chars[];
} tkey_t;
......@@ -117,6 +124,7 @@ struct trie {
*/
enum {
TSHIFT_BRANCH = 0,
TSHIFT_COW,
TSHIFT_BMP,
TOP_BMP = TSHIFT_BMP + TWIDTH_BMP,
TSHIFT_INDEX = TOP_BMP,
......@@ -135,8 +143,11 @@ typedef char static_assert_bmp_fits
/*! \brief is this node a branch or a leaf? */
#define TFLAG_BRANCH (BIG1 << TSHIFT_BRANCH)
/*! \brief copy-on-write flag, used in both leaves and branches */
#define TFLAG_COW (BIG1 << TSHIFT_COW)
/*! \brief for extracting pointer to key */
#define TMASK_LEAF (~(word)(TFLAG_BRANCH))
#define TMASK_LEAF (~(word)(TFLAG_BRANCH | TFLAG_COW))
/*! \brief mask for extracting nibble index */
#define TMASK_INDEX TMASK(TWIDTH_INDEX, TSHIFT_INDEX)
......@@ -150,9 +161,12 @@ typedef char static_assert_bmp_fits
/*! \brief Initialize a new leaf, copying the key, and returning failure code. */
static int mkleaf(node_t *leaf, const char *key, uint32_t len, knot_mm_t *mm)
{
if (unlikely((word)len > (BIG1 << KEYLENBITS)))
return KNOT_ENOMEM;
tkey_t *lkey = mm_alloc(mm, sizeof(tkey_t) + len);
if (unlikely(!lkey))
return KNOT_ENOMEM;
lkey->cow = 0;
lkey->len = len;
memcpy(lkey->chars, key, len);
word i = (uintptr_t)lkey;
......@@ -380,26 +394,10 @@ trie_val_t* trie_get_try(trie_t *tbl, const char *key, uint32_t len)
return tvalp(t);
}
int trie_del(trie_t *tbl, const char *key, uint32_t len, trie_val_t *val)
static int del_found(trie_t *tbl, node_t *t, node_t *p, bitmap_t b, trie_val_t *val)
{
assert(tbl);
if (!tbl->weight)
return KNOT_ENOENT;
node_t *t = &tbl->root; // current and parent node
node_t *p = NULL;
bitmap_t b = 0;
while (isbranch(t)) {
__builtin_prefetch(twigs(t));
b = twigbit(t, key, len);
if (!hastwig(t, b))
return KNOT_ENOENT;
p = t;
t = twig(t, twigoff(t, b));
}
tkey_t *lkey = tkey(t);
if (key_cmp(key, len, lkey->chars, lkey->len) != 0)
return KNOT_ENOENT;
mm_free(&tbl->mm, lkey);
assert(!tkey(t)->cow);
mm_free(&tbl->mm, tkey(t));
if (val != NULL)
*val = *tvalp(t); // we return trie_val_t directly when deleting
--tbl->weight;
......@@ -431,6 +429,28 @@ int trie_del(trie_t *tbl, const char *key, uint32_t len, trie_val_t *val)
return KNOT_EOK;
}
/*!
 * \brief Remove a key from the trie.
 *
 * Walks from the root to the only leaf that could hold \a key, compares
 * the stored key against the requested one and, on a match, hands the
 * leaf, its parent branch and the parent's twig bit over to del_found().
 *
 * \return KNOT_ENOENT when the trie is empty or the key is not present,
 *         otherwise whatever del_found() returns.
 */
int trie_del(trie_t *tbl, const char *key, uint32_t len, trie_val_t *val)
{
	assert(tbl);
	if (tbl->weight == 0) {
		return KNOT_ENOENT;
	}

	node_t *parent = NULL;       // branch that owns `node`, NULL at the root
	node_t *node = &tbl->root;   // node currently being inspected
	bitmap_t bit = 0;            // twig bit of `node` inside `parent`
	while (isbranch(node)) {
		__builtin_prefetch(twigs(node));
		bit = twigbit(node, key, len);
		if (!hastwig(node, bit)) {
			return KNOT_ENOENT;
		}
		parent = node;
		node = twig(node, twigoff(node, bit));
	}

	// Reaching a leaf only proves the branching nibbles agree; check the whole key.
	tkey_t *leaf_key = tkey(node);
	const bool same_key = key_cmp(key, len, leaf_key->chars, leaf_key->len) == 0;
	return same_key ? del_found(tbl, node, parent, bit, val) : KNOT_ENOENT;
}
/*!
* \brief Stack of nodes, storing a path down a trie.
*
......@@ -538,7 +558,8 @@ static int ns_find_branch(nstack_t *ns, const char *key, uint32_t len,
tkey_t *lkey = tkey(ns->stack[ns->len-1]);
// Find index of the first char that differs.
size_t bytei = 0;
for (bytei = 0; bytei < MIN(len,lkey->len); bytei++) {
uint32_t klen = lkey->len;
for (bytei = 0; bytei < MIN(len,klen); bytei++) {
if (key[bytei] != lkey->chars[bytei])
break;
}
......@@ -548,7 +569,7 @@ static int ns_find_branch(nstack_t *ns, const char *key, uint32_t len,
index = TMAX_INDEX;
goto success;
}
if (likely(bytei < MIN(len,lkey->len))) {
if (likely(bytei < MIN(len,klen))) {
byte k2 = (byte)lkey->chars[bytei];
byte k1 = (byte)key[bytei];
if (((k1 ^ k2) & 0xf0) == 0)
......@@ -719,7 +740,7 @@ int trie_get_leq(trie_t *tbl, const char *key, uint32_t len, trie_val_t **val)
// Now we re-do the first "non-matching" step in the trie
// but try the previous child if key was less (it may not exist)
int i = hastwig(t, b)
? twigoff(t, b) - (kbit < tbit)
? (int)twigoff(t, b) - (kbit < tbit)
: (int)twigoff(t, b) - 1 /* twigoff returns successor when !hastwig */;
if (i >= 0) {
ERR_RETURN(ns_longer(ns));
......@@ -735,7 +756,12 @@ success:
}
}
trie_val_t* trie_get_ins(trie_t *tbl, const char *key, uint32_t len)
/* see below */
static int cow_pushdown(trie_cow_t *cow, nstack_t *ns);
/*! \brief implementation of trie_get_ins() and trie_get_cow() */
static trie_val_t* cow_get_ins(trie_cow_t *cow, trie_t *tbl,
const char *key, uint32_t len)
{
assert(tbl);
// First leaf in an empty tbl?
......@@ -755,6 +781,8 @@ trie_val_t* trie_get_ins(trie_t *tbl, const char *key, uint32_t len)
bitmap_t tbit, kbit;
if (unlikely(ns_find_branch(ns, key, len, &idiff, &tbit, &kbit)))
return NULL;
if (unlikely(cow && cow_pushdown(cow, ns) != KNOT_EOK))
return NULL;
node_t *t = ns->stack[ns->len - 1];
if (idiff == TMAX_INDEX) // the same key was already present
return tvalp(t);
......@@ -801,6 +829,11 @@ err_leaf:
}
}
/*! \brief Non-COW insertion entry point: runs cow_get_ins() without a COW context. */
trie_val_t* trie_get_ins(trie_t *tbl, const char *key, uint32_t len)
{
	trie_cow_t *no_cow = NULL;
	return cow_get_ins(no_cow, tbl, key, len);
}
/*! \brief Apply a function to every trie_val_t*, in order; a recursive solution. */
static int apply_nodes(node_t *t, int (*f)(trie_val_t *, void *), void *d)
{
......@@ -878,3 +911,283 @@ trie_val_t* trie_it_val(trie_it_t *it)
assert(!isbranch(t));
return tvalp(t);
}
/*!\file
*
* \section About copy-on-write
*
* In these notes I'll use the term "object" to refer to either the
* twig array of a branch, or the application's data that is referred
* to by a leaf's trie_val_t pointer. Note that for COW we don't care
* about trie node_t structs themselves, but the objects that they
* point to.
*
* \subsection COW states
*
* During a COW transaction an object can be in one of three states:
* shared, only in the old trie, or only in the new trie. When a
* transaction is rolled back, the only-new objects are freed; when a
* transaction is committed the new trie takes the place of the old
* one and only-old objects are freed.
*
* \subsection branch marks and regions
*
* A branch object can be marked by setting the COW flag in the first
* element of its twig array. Marked branches partition the trie into
* regions; an object's state depends on its region.
*
* The unmarked branch objects between a trie's root and the marked
* branches (excluding the marked branches themselves) are exclusively
* owned: either old-only (if you started from the old root) or
* new-only (if you started from the new root).
*
* Marked branch objects, and all objects reachable from marked branch
* objects, are in the shared region accessible from both old and new
* roots. All branch objects below a marked branch must be unmarked.
* (That is, there is at most one marked branch object on any path
* from the root of a trie.)
*
* Branch nodes in the new-only region can be modified in place, in
* the same way as an original qp trie. Branch nodes in the old-only
* or shared regions must not be modified.
*
* \subsection app object states
*
* The app objects reachable from the new-only and old-only regions
* explicitly record their state in a way determined by the
* application. (These app objects are reachable from the old and new
* roots by traversing only unmarked branch objects.)
*
* The app objects reachable from marked branch objects are implicitly
* shared, but their state field has an indeterminate value. If an app
* object was previously touched by a rolled-back transaction it may
* be marked shared or old-only; if it was previously touched by a
* committed transaction it may be marked shared or new-only.
*
* \subsection key states
*
* The memory allocated for tkey_t objects also needs to track its
* sharing state. They have a "cow" flag to mark when they are shared.
* Keys are relatively lazily copied (to make them exclusive) when
* their leaf node is touched by a COW mutation.
*
* [An alternative technique might be to copy them more eagerly, in
* cow_pushdown(), which would avoid the need for a flag bit at the
* cost of more allocator churn in a transaction.]
*
* \subsection outside COW
*
* When a COW transaction is not in progress, there are no marked
* branch objects, so everything is exclusively owned. When a COW
* transaction is finished (committed or rolled back), the branch
* marks are removed. Since they are in the shared region, this branch
* cleanup is visible to both old and new tries.
*
* However the state of app objects is not clean between COW
* transactions. When a COW transaction is committed, we traverse the
* old-only region to find old-only app objects that should be freed
* (and vice versa for rollback). In general, there will be app
* objects that are only reachable from the new-only region, and that
* have a mixture of shared and new states.
*/
/*! \brief Trie copy-on-write state
 *
 * Context of a single COW transaction; created by trie_cow() and released
 * by trie_cow_commit() or trie_cow_rollback().
 */
struct trie_cow {
/*! the trie the transaction was started from */
trie_t *old;
/*! the clone allocated by trie_cow(); returned by trie_cow_new() */
trie_t *new;
/*! callback that mark_cow() invokes for leaves to mark the app object shared */
trie_cb *mark_shared;
/*! opaque user data passed as the last argument of mark_shared */
void *d;
};
/*! \brief Tell whether \a t is a branch whose twig array carries the COW flag.
 *
 * The flag is looked up in the first element of the twig array; leaves are
 * never reported as marked.
 */
static bool cow_marked(node_t *t)
{
	if (!isbranch(t)) {
		return false;
	}
	return (twigs(t)->i & TFLAG_COW) != 0;
}
/*! \brief Tell whether \a t is a leaf whose key allocation is flagged as shared. */
static bool cow_key(node_t *t)
{
	if (isbranch(t)) {
		return false;
	}
	return tkey(t)->cow != 0;
}
/*! \brief Drop the COW flag from a branch object.
 *
 * \a t must be a branch; the flag lives in the first element of its twig array.
 */
static void clear_cow(node_t *t)
{
	assert(isbranch(t));
	node_t *object = twigs(t);
	object->i = object->i & ~TFLAG_COW;
}
/*! \brief mark a node as shared
 *
 * For branches this sets the COW flag in the first element of the twig
 * array (in COW terminology, the branch object). For leaves it flags the
 * key allocation as shared and then lets the application mark its own
 * object through the transaction's mark_shared callback.
 *
 * \param cow  COW context supplying the mark_shared callback and its user
 *             data; a NULL callback is tolerated and simply not invoked
 * \param t    node to mark, either a branch or a leaf
 */
static void mark_cow(trie_cow_t *cow, node_t *t)
{
	if (isbranch(t)) {
		node_t *object = twigs(t);
		object->i |= TFLAG_COW;
		return;
	}

	tkey_t *lkey = tkey(t);
	// The key must be flagged even when the application has nothing to mark,
	// otherwise cow_pushdown() would not copy it before mutation.
	lkey->cow = 1;
	if (cow->mark_shared != NULL) {
		trie_val_t *valp = tvalp(t);
		cow->mark_shared(*valp, lkey->chars, lkey->len, cow->d);
	}
}
/*! \brief Push the exclusive COW region down by one node.
 *
 * Gives branch \a t a private copy of its twig array, allocated from the
 * new trie's memory context. All children are marked as shared first and
 * the mark on the original twig array is cleared before copying, so
 * neither the original nor the copy carries it.
 *
 * \return KNOT_EOK on success, KNOT_ENOMEM if the copy cannot be allocated
 *         (nothing is modified in that case).
 */
static int cow_pushdown_one(trie_cow_t *cow, node_t *t)
{
	const uint count = branch_weight(t);
	const size_t bytes = sizeof(node_t) * count;

	node_t *copy = mm_alloc(&cow->new->mm, bytes);
	if (copy == NULL) {
		return KNOT_ENOMEM;
	}

	/* every child becomes the top of a shared region */
	for (uint ci = 0; ci < count; ++ci) {
		mark_cow(cow, twig(t, ci));
	}

	/* unmark first, so that the old array and its copy both end up unmarked */
	clear_cow(t);
	memcpy(copy, twigs(t), bytes);
	t->p = copy;
	return KNOT_EOK;
}
/*! \brief push exclusive COW region to cover a whole node stack
 *
 * Walks the node stack from its first entry to its last. Every marked
 * branch met on the way gets a private twig array via cow_pushdown_one(),
 * and every leaf whose key is flagged as shared gets a private key copy.
 * Stack entries are rewritten as needed so that they point into the
 * copied twig arrays.
 *
 * \param cow  COW context; allocations come from cow->new->mm
 * \param ns   node stack to process, updated in place
 * \return KNOT_EOK on success, KNOT_ENOMEM if any copy could not be made
 *         (any failure of cow_pushdown_one() is reported as KNOT_ENOMEM)
 */
static int cow_pushdown(trie_cow_t *cow, nstack_t *ns)
{
/* twig arrays of the previous stack entry after / before its pushdown;
   both NULL when the previous entry was not pushed down */
node_t *new_twigs = NULL;
node_t *old_twigs = NULL;
for (uint i = 0; i < ns->len; i++) {
/* if we did a pushdown on the previous iteration, this stack
   entry still points into the parent's old twig array; move it
   to the same offset within the parent's new twig array */
if (new_twigs != old_twigs)
ns->stack[i] = new_twigs + (ns->stack[i] - old_twigs);
if (cow_marked(ns->stack[i])) {
/* remember the array before and after copying, for the fix-up
   of the next stack entry */
old_twigs = twigs(ns->stack[i]);
if (cow_pushdown_one(cow, ns->stack[i]))
return KNOT_ENOMEM;
new_twigs = twigs(ns->stack[i]);
} else {
/* no pushdown here, so the next entry needs no fix-up */
new_twigs = NULL;
old_twigs = NULL;
/* ensure key is exclusively owned */
if (cow_key(ns->stack[i])) {
/* keep a copy of the leaf so the old key and the p member
   stay reachable after mkleaf() rewrites the node */
node_t oleaf = *ns->stack[i];
tkey_t *okey = tkey(&oleaf);
if(mkleaf(ns->stack[i], okey->chars, okey->len,
&cow->new->mm))
return KNOT_ENOMEM;
/* NOTE(review): presumably mkleaf() resets the p member, which
   appears to hold the leaf's value - put the original back */
ns->stack[i]->p = oleaf.p;
/* this leaf now uses its own copy, so the original key
   allocation is no longer flagged as shared */
okey->cow = 0;
}
}
}
return KNOT_EOK;
}
/*! \brief Begin a COW transaction on \a old.
 *
 * Allocates a clone of the trie header and a COW context from the old
 * trie's memory context. The clone starts with the same root and weight
 * as \a old; if the trie is not empty, the root is marked as shared.
 *
 * \return the new COW context, or NULL if either allocation failed.
 */
trie_cow_t* trie_cow(trie_t *old, trie_cb *mark_shared, void *d)
{
	knot_mm_t *mm = &old->mm;
	trie_t *clone = mm_alloc(mm, sizeof(trie_t));
	trie_cow_t *ctx = mm_alloc(mm, sizeof(trie_cow_t));
	if (clone == NULL || ctx == NULL) {
		mm_free(mm, clone);
		mm_free(mm, ctx);
		return NULL;
	}

	clone->mm = old->mm;
	clone->root = old->root;
	clone->weight = old->weight;

	*ctx = (trie_cow_t){
		.old = old,
		.new = clone,
		.mark_shared = mark_shared,
		.d = d,
	};

	/* an empty trie has no root object to share */
	if (old->weight != 0) {
		mark_cow(ctx, &old->root);
	}
	return ctx;
}
/*! \brief Return the new (modifiable) trie of a COW transaction. */
trie_t* trie_cow_new(trie_cow_t *cow)
{
	assert(cow);
	trie_t *fresh = cow->new;
	return fresh;
}
/*! \brief COW variant of trie_get_ins(): runs cow_get_ins() on the transaction's new trie. */
trie_val_t* trie_get_cow(trie_cow_t *cow, const char *key, uint32_t len)
{
	trie_t *target = cow->new;
	return cow_get_ins(cow, target, key, len);
}
/*! \brief Delete a key from the new trie of a COW transaction.
 *
 * Looks the key up in cow->new, makes the path to its leaf exclusively
 * owned via cow_pushdown() and then removes the leaf with del_found().
 *
 * \return KNOT_ENOENT if the trie is empty or the key is not present;
 *         otherwise the result of del_found(). Failures of
 *         ns_find_branch() and cow_pushdown() go through ERR_RETURN
 *         (NOTE(review): presumably it returns their error code - confirm
 *         against the macro definition).
 */
int trie_del_cow(trie_cow_t *cow, const char *key, uint32_t len, trie_val_t *val)
{
trie_t *tbl = cow->new;
// Nothing can be deleted from an empty trie.
if (unlikely(!tbl->weight))
return KNOT_ENOENT;
{ // Intentionally un-indented; until end of function, to bound cleanup attr.
// Find the branching-point
__attribute__((cleanup(ns_cleanup)))
nstack_t ns_local;
ns_init(&ns_local, tbl);
nstack_t *ns = &ns_local;
index_t idiff;
bitmap_t tbit, kbit;
ERR_RETURN(ns_find_branch(ns, key, len, &idiff, &tbit, &kbit));
// Any other idiff means the key differs from the leaf that was found.
if (idiff != TMAX_INDEX)
return KNOT_ENOENT;
// Copy the shared parts of the path before mutating; this may rewrite
// the stack entries, so t and p are only read afterwards.
ERR_RETURN(cow_pushdown(cow, ns));
node_t *t = ns->stack[ns->len - 1];
// A stack holding a single entry has no parent branch.
node_t *p = ns->len >= 2 ? ns->stack[ns->len - 2] : NULL;
return del_found(tbl, t, p, p ? twigbit(p, key, len) : 0, val);
}
}
/*! \brief clean up after a COW transaction, recursively
 *
 * Walks the exclusively owned region below \a t (the old trie on commit,
 * the new trie on rollback):
 *  - a marked branch is the top of the shared region: its mark is reset
 *    and the walk stops there;
 *  - an unmarked branch is exclusively owned: its children are cleaned up
 *    and its twig array is freed;
 *  - a leaf is handed to the application callback, then its key is either
 *    unflagged (if flagged as shared) or freed (if exclusively owned).
 *
 * \param cow  COW context; memory is released through cow->new->mm
 * \param t    node to clean up
 * \param cb   application callback deciding what to do with each value;
 *             a NULL callback is tolerated and simply not invoked
 * \param d    user data passed to \a cb
 */
static void cow_cleanup(trie_cow_t *cow, node_t *t, trie_cb *cb, void *d)
{
	if (cow_marked(t)) {
		// we have hit the shared region, so just reset the mark
		clear_cow(t);
		return;
	}

	if (isbranch(t)) {
		// traverse and free the exclusive region
		uint cc = branch_weight(t);
		for (uint ci = 0; ci < cc; ++ci) {
			cow_cleanup(cow, twig(t, ci), cb, d);
		}
		mm_free(&cow->new->mm, twigs(t));
		return;
	}

	tkey_t *lkey = tkey(t);
	// application must decide how to clean up its values; the key is
	// still valid here because it is released only after the callback
	if (cb != NULL) {
		trie_val_t *valp = tvalp(t);
		cb(*valp, lkey->chars, lkey->len, d);
	}
	// clean up exclusively-owned keys, keep (and unflag) shared ones
	if (lkey->cow) {
		lkey->cow = 0;
	} else {
		mm_free(&cow->new->mm, lkey);
	}
}
/*! \brief Finish a COW transaction by keeping the new trie.
 *
 * Cleans up the old trie with cow_cleanup() (skipped when it is empty),
 * then frees the old trie header and the COW context.
 *
 * \return the new trie.
 */
trie_t* trie_cow_commit(trie_cow_t *cow, trie_cb *cb, void *d)
{
	trie_t *kept = cow->new;
	trie_t *dropped = cow->old;

	if (dropped->weight != 0) {
		cow_cleanup(cow, &dropped->root, cb, d);
	}
	mm_free(&kept->mm, dropped);
	mm_free(&kept->mm, cow);
	return kept;
}
/*! \brief Finish a COW transaction by keeping the old trie.
 *
 * Cleans up the new trie with cow_cleanup() (skipped when it is empty),
 * then frees the new trie header and the COW context.
 *
 * \return the old trie.
 */
trie_t* trie_cow_rollback(trie_cow_t *cow, trie_cb *cb, void *d)
{
	trie_t *kept = cow->old;
	trie_t *dropped = cow->new;

	if (dropped->weight != 0) {
		cow_cleanup(cow, &dropped->root, cb, d);
	}
	mm_free(&kept->mm, dropped);
	mm_free(&kept->mm, cow);
	return kept;
}
/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
Copyright (C) 2018 Tony Finch <dot@dotat.at>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
......@@ -39,6 +40,20 @@ typedef struct trie trie_t;
/*! \brief Opaque type for holding a QP-trie iterator. */
typedef struct trie_it trie_it_t;
/*! \brief Callback for performing actions on a trie leaf
*
* Used during copy-on-write transactions
*
* \param val The value of the element to be altered
* \param key The key of the element to be altered
* \param len The length of key
* \param d Additional user data
*/
typedef void trie_cb(trie_val_t val, const char *key, size_t len, void *d);
/*! \brief Opaque type for holding the copy-on-write state for a QP-trie. */
typedef struct trie_cow trie_cow_t;
/*! \brief Create a trie instance. */
trie_t* trie_create(knot_mm_t *mm);
......@@ -110,3 +125,93 @@ const char* trie_it_key(trie_it_t *it, size_t *len);
/*! \brief Return pointer to the value of the current element (writable). */
trie_val_t* trie_it_val(trie_it_t *it);
/*! \brief Start a COW transaction
*
* A copy-on-write transaction starts by obtaining a write lock (in
* your application code) followed by a call to trie_cow(). This
* creates a shared clone of the trie and saves both old and new roots
* in the COW context.
*
* During the COW transaction, you call trie_get_cow() or
* trie_del_cow() as necessary. These calls ensure that the relevant
* parts of the (new) trie are copied so that they can be modified
* freely.
*
* Your trie_val_t objects must be able to distinguish their
* reachability, either shared, or old-only, or new-only. Before a COW
* transaction the reachability of your objects is indeterminate.
* During a transaction, any trie_val_t objects that might be affected
* (because they are adjacent to a trie_get_cow() or trie_del_cow())
* are first marked as shared using the callback you pass to
* trie_cow().
*
* When the transaction is complete, to commit, call trie_cow_new() to
* get the new root, swap the old and new trie roots (e.g. with
* rcu_xchg_pointer()), wait for readers to finish with the old trie
* (e.g. using synchronize_rcu()), then call trie_cow_commit(). For a
* rollback, you can just call trie_cow_rollback() without waiting
* since that doesn't conflict with readers. After trie_cow_commit()
* or trie_cow_rollback() have finished, you can release your write
* lock.
*
* Concurrent reading of the old trie is allowed during a transaction
* provided that it is known when all readers have finished with the
* old version, e.g. using rcu_read_lock() and rcu_read_unlock().
* There must be only one write transaction at a time.
*
* \param old the old trie
* \param mark_shared callback to mark a leaf as shared
* \param d extra data for the callback
* \return a pointer to a COW context,
* or NULL if there was a failure
*/
trie_cow_t* trie_cow(trie_t *old, trie_cb *mark_shared, void *d);
/*! \brief get the new trie from a COW context */
trie_t* trie_cow_new(trie_cow_t *cow);
/*! \brief variant of trie_get_ins() for use during COW transactions
*
* As necessary, this copies the path from the root of the trie to the
* leaf, so that it is no longer shared. Any leaves adjacent to this
* path are marked as shared using the mark_shared callback passed to
* trie_cow().
*
* It is your responsibility to COW your trie_val_t objects. If you copy an
* object you must change the original's reachability from shared to old-only.
* New objects (including copies) must have new-only reachability.
*/
trie_val_t* trie_get_cow(trie_cow_t *cow, const char *key, uint32_t len);
/*!
* \brief variant of trie_del() for use during COW transactions
*
* The mark_shared callback is invoked as necessary, in the same way
* as trie_get_cow().
*
* Returns KNOT_EOK if the key was removed or KNOT_ENOENT if not found.
* If val!=NULL and deletion succeeded, the *val is set to the deleted
* value pointer.
*/
int trie_del_cow(trie_cow_t *cow, const char *key, uint32_t len, trie_val_t *val);
/*! \brief clean up the old trie after committing a COW transaction
*
* Your callback is invoked for any trie_val_t objects that might need
* cleaning up; you must free any objects you have marked as old-only
* and retain objects with shared reachability.
*
* The cow object is free()d, and the new trie root is returned.
*/
trie_t* trie_cow_commit(trie_cow_t *cow, trie_cb *cb, void *d);
/*! \brief clean up the new trie after rolling back a COW transaction
*
* Your callback is invoked for any trie_val_t objects that might need
* cleaning up; you must free any objects you have marked as new-only
* and retain objects with shared reachability.
*
* The cow object is free()d, and the old trie root is returned.
*/
trie_t* trie_cow_rollback(trie_cow_t *cow, trie_cb *cb, void *d);
......@@ -7,6 +7,7 @@
/contrib/test_heap
/contrib/test_net
/contrib/test_net_shortwrite
/contrib/test_qp-cow
/contrib/test_qp-trie
/contrib/test_siphash
/contrib/test_sockaddr
......
......@@ -50,6 +50,7 @@ check_PROGRAMS = \
contrib/test_net \
contrib/test_net_shortwrite \
contrib/test_qp-trie \
contrib/test_qp-cow \
contrib/test_siphash \
contrib/test_sockaddr \
contrib/test_string \
......
/* Copyright (C) 2018 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
Copyright (C) 2018 Tony Finch <dot@dotat.at>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <err.h>
#include <unistd.h>
#include "contrib/qp-trie/trie.h"
#include "contrib/macros.h"
#include "contrib/string.h"