summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorJozsef Kadlecsik <kadlec@blackhole.kfki.hu>2011-02-01 15:38:36 +0100
committerPatrick McHardy <kaber@trash.net>2011-02-01 15:38:36 +0100
commit6c027889696a7a694b0e2f6e3cabadefec7553b6 (patch)
treebfdb7bbdb8153ac15c45fe86928d4b02ce3fe766 /include
parentnetfilter: ipset; bitmap:port set type support (diff)
downloadlinux-6c027889696a7a694b0e2f6e3cabadefec7553b6.tar.xz
linux-6c027889696a7a694b0e2f6e3cabadefec7553b6.zip
netfilter: ipset: hash:ip set type support
The module implements the hash:ip type support in four flavours: for IPv4 or IPv6, both without and with timeout support. All the hash types are based on the "array hash" or ahash structure and functions as a good compromise between minimal memory footprint and speed. The hashing uses arrays to resolve clashes. The hash table is resized (doubled) when searching becomes too long. Resizing can be triggered by userspace add commands only and those are serialized by the nfnl mutex. During resizing the set is read-locked, so the only possible concurrent operations are the kernel side readers. Those are protected by RCU locking. Because of the four flavours and the other hash types, the functions are implemented in general forms in the ip_set_ahash.h header file and the real functions are generated before compiling by macro expansion. Thus the dereferencing of low-level functions and void pointer arguments could be avoided: the low-level functions are inlined, the function arguments are pointers of type-specific structures. Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> Signed-off-by: Patrick McHardy <kaber@trash.net>
Diffstat (limited to 'include')
-rw-r--r--include/linux/netfilter/ipset/ip_set_ahash.h1074
-rw-r--r--include/linux/netfilter/ipset/ip_set_hash.h26
2 files changed, 1100 insertions, 0 deletions
diff --git a/include/linux/netfilter/ipset/ip_set_ahash.h b/include/linux/netfilter/ipset/ip_set_ahash.h
new file mode 100644
index 000000000000..ec9d9bea1e37
--- /dev/null
+++ b/include/linux/netfilter/ipset/ip_set_ahash.h
@@ -0,0 +1,1074 @@
+#ifndef _IP_SET_AHASH_H
+#define _IP_SET_AHASH_H
+
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#include <linux/netfilter/ipset/ip_set_timeout.h>
+
+/* Hashing which uses arrays to resolve clashing. The hash table is resized
+ * (doubled) when searching becomes too long.
+ * Internally jhash is used with the assumption that the size of the
+ * stored data is a multiple of sizeof(u32). If storage supports timeout,
+ * the timeout field must be the last one in the data structure - that field
+ * is ignored when computing the hash key.
+ *
+ * Readers and resizing
+ *
+ * Resizing can be triggered by userspace command only, and those
+ * are serialized by the nfnl mutex. During resizing the set is
+ * read-locked, so the only possible concurrent operations are
+ * the kernel side readers. Those must be protected by proper RCU locking.
+ */
+
+/* Number of elements to store in an initial array block */
+#define AHASH_INIT_SIZE 4
+/* Max number of elements to store in an array block */
+#define AHASH_MAX_SIZE (3*4)
+
+/* A hash bucket */
+struct hbucket {
+ void *value; /* the array of the values */
+ u8 size; /* size of the array */
+ u8 pos; /* position of the first free entry */
+};
+
+/* The hash table: the table size stored here in order to make resizing easy */
+struct htable {
+ u8 htable_bits; /* size of hash table == 2^htable_bits */
+ struct hbucket bucket[0]; /* hashtable buckets */
+};
+
+#define hbucket(h, i) &((h)->bucket[i])
+
+/* Book-keeping of the prefixes added to the set */
+struct ip_set_hash_nets {
+ u8 cidr; /* the different cidr values in the set */
+ u32 nets; /* number of elements per cidr */
+};
+
+/* The generic ip_set hash structure */
+struct ip_set_hash {
+ struct htable *table; /* the hash table */
+ u32 maxelem; /* max elements in the hash */
+ u32 elements; /* current element (vs timeout) */
+ u32 initval; /* random jhash init value */
+ u32 timeout; /* timeout value, if enabled */
+ struct timer_list gc; /* garbage collection when timeout enabled */
+#ifdef IP_SET_HASH_WITH_NETMASK
+ u8 netmask; /* netmask value for subnets to store */
+#endif
+#ifdef IP_SET_HASH_WITH_NETS
+ struct ip_set_hash_nets nets[0]; /* book-keeping of prefixes */
+#endif
+};
+
+/* Compute htable_bits from the user input parameter hashsize */
+static u8
+htable_bits(u32 hashsize)
+{
+ /* Assume that hashsize == 2^htable_bits */
+ u8 bits = fls(hashsize - 1);
+ if (jhash_size(bits) != hashsize)
+ /* Round up to the first 2^n value */
+ bits = fls(hashsize);
+
+ return bits;
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+
+#define SET_HOST_MASK(family) (family == AF_INET ? 32 : 128)
+
+/* Network cidr size book keeping when the hash stores different
+ * sized networks */
+static void
+add_cidr(struct ip_set_hash *h, u8 cidr, u8 host_mask)
+{
+ u8 i;
+
+ ++h->nets[cidr-1].nets;
+
+ pr_debug("add_cidr added %u: %u\n", cidr, h->nets[cidr-1].nets);
+
+ if (h->nets[cidr-1].nets > 1)
+ return;
+
+ /* New cidr size */
+ for (i = 0; i < host_mask && h->nets[i].cidr; i++) {
+ /* Add in increasing prefix order, so larger cidr first */
+ if (h->nets[i].cidr < cidr)
+ swap(h->nets[i].cidr, cidr);
+ }
+ if (i < host_mask)
+ h->nets[i].cidr = cidr;
+}
+
+static void
+del_cidr(struct ip_set_hash *h, u8 cidr, u8 host_mask)
+{
+ u8 i;
+
+ --h->nets[cidr-1].nets;
+
+ pr_debug("del_cidr deleted %u: %u\n", cidr, h->nets[cidr-1].nets);
+
+ if (h->nets[cidr-1].nets != 0)
+ return;
+
+ /* All entries with this cidr size deleted, so cleanup h->cidr[] */
+ for (i = 0; i < host_mask - 1 && h->nets[i].cidr; i++) {
+ if (h->nets[i].cidr == cidr)
+ h->nets[i].cidr = cidr = h->nets[i+1].cidr;
+ }
+ h->nets[i - 1].cidr = 0;
+}
+#endif
+
+/* Destroy the hashtable part of the set */
+static void
+ahash_destroy(struct htable *t)
+{
+ struct hbucket *n;
+ u32 i;
+
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ if (n->size)
+ /* FIXME: use slab cache */
+ kfree(n->value);
+ }
+
+ ip_set_free(t);
+}
+
+/* Calculate the actual memory size of the set data */
+static size_t
+ahash_memsize(const struct ip_set_hash *h, size_t dsize, u8 host_mask)
+{
+ u32 i;
+ struct htable *t = h->table;
+ size_t memsize = sizeof(*h)
+ + sizeof(*t)
+#ifdef IP_SET_HASH_WITH_NETS
+ + sizeof(struct ip_set_hash_nets) * host_mask
+#endif
+ + jhash_size(t->htable_bits) * sizeof(struct hbucket);
+
+ for (i = 0; i < jhash_size(t->htable_bits); i++)
+ memsize += t->bucket[i].size * dsize;
+
+ return memsize;
+}
+
+/* Flush a hash type of set: destroy all elements */
+static void
+ip_set_hash_flush(struct ip_set *set)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t = h->table;
+ struct hbucket *n;
+ u32 i;
+
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ if (n->size) {
+ n->size = n->pos = 0;
+ /* FIXME: use slab cache */
+ kfree(n->value);
+ }
+ }
+#ifdef IP_SET_HASH_WITH_NETS
+ memset(h->nets, 0, sizeof(struct ip_set_hash_nets)
+ * SET_HOST_MASK(set->family));
+#endif
+ h->elements = 0;
+}
+
+/* Destroy a hash type of set */
+static void
+ip_set_hash_destroy(struct ip_set *set)
+{
+ struct ip_set_hash *h = set->data;
+
+ if (with_timeout(h->timeout))
+ del_timer_sync(&h->gc);
+
+ ahash_destroy(h->table);
+ kfree(h);
+
+ set->data = NULL;
+}
+
+#define HKEY(data, initval, htable_bits) \
+(jhash2((u32 *)(data), sizeof(struct type_pf_elem)/sizeof(u32), initval) \
+ & jhash_mask(htable_bits))
+
+#endif /* _IP_SET_AHASH_H */
+
+#define CONCAT(a, b, c) a##b##c
+#define TOKEN(a, b, c) CONCAT(a, b, c)
+
+/* Type/family dependent function prototypes */
+
+#define type_pf_data_equal TOKEN(TYPE, PF, _data_equal)
+#define type_pf_data_isnull TOKEN(TYPE, PF, _data_isnull)
+#define type_pf_data_copy TOKEN(TYPE, PF, _data_copy)
+#define type_pf_data_zero_out TOKEN(TYPE, PF, _data_zero_out)
+#define type_pf_data_netmask TOKEN(TYPE, PF, _data_netmask)
+#define type_pf_data_list TOKEN(TYPE, PF, _data_list)
+#define type_pf_data_tlist TOKEN(TYPE, PF, _data_tlist)
+
+#define type_pf_elem TOKEN(TYPE, PF, _elem)
+#define type_pf_telem TOKEN(TYPE, PF, _telem)
+#define type_pf_data_timeout TOKEN(TYPE, PF, _data_timeout)
+#define type_pf_data_expired TOKEN(TYPE, PF, _data_expired)
+#define type_pf_data_timeout_set TOKEN(TYPE, PF, _data_timeout_set)
+
+#define type_pf_elem_add TOKEN(TYPE, PF, _elem_add)
+#define type_pf_add TOKEN(TYPE, PF, _add)
+#define type_pf_del TOKEN(TYPE, PF, _del)
+#define type_pf_test_cidrs TOKEN(TYPE, PF, _test_cidrs)
+#define type_pf_test TOKEN(TYPE, PF, _test)
+
+#define type_pf_elem_tadd TOKEN(TYPE, PF, _elem_tadd)
+#define type_pf_del_telem TOKEN(TYPE, PF, _ahash_del_telem)
+#define type_pf_expire TOKEN(TYPE, PF, _expire)
+#define type_pf_tadd TOKEN(TYPE, PF, _tadd)
+#define type_pf_tdel TOKEN(TYPE, PF, _tdel)
+#define type_pf_ttest_cidrs TOKEN(TYPE, PF, _ahash_ttest_cidrs)
+#define type_pf_ttest TOKEN(TYPE, PF, _ahash_ttest)
+
+#define type_pf_resize TOKEN(TYPE, PF, _resize)
+#define type_pf_tresize TOKEN(TYPE, PF, _tresize)
+#define type_pf_flush ip_set_hash_flush
+#define type_pf_destroy ip_set_hash_destroy
+#define type_pf_head TOKEN(TYPE, PF, _head)
+#define type_pf_list TOKEN(TYPE, PF, _list)
+#define type_pf_tlist TOKEN(TYPE, PF, _tlist)
+#define type_pf_same_set TOKEN(TYPE, PF, _same_set)
+#define type_pf_kadt TOKEN(TYPE, PF, _kadt)
+#define type_pf_uadt TOKEN(TYPE, PF, _uadt)
+#define type_pf_gc TOKEN(TYPE, PF, _gc)
+#define type_pf_gc_init TOKEN(TYPE, PF, _gc_init)
+#define type_pf_variant TOKEN(TYPE, PF, _variant)
+#define type_pf_tvariant TOKEN(TYPE, PF, _tvariant)
+
+/* Flavour without timeout */
+
+/* Get the ith element from the array block n */
+#define ahash_data(n, i) \
+ ((struct type_pf_elem *)((n)->value) + (i))
+
+/* Add an element to the hash table when resizing the set:
+ * we spare the maintenance of the internal counters. */
+static int
+type_pf_elem_add(struct hbucket *n, const struct type_pf_elem *value)
+{
+ if (n->pos >= n->size) {
+ void *tmp;
+
+ if (n->size >= AHASH_MAX_SIZE)
+ /* Trigger rehashing */
+ return -EAGAIN;
+
+ tmp = kzalloc((n->size + AHASH_INIT_SIZE)
+ * sizeof(struct type_pf_elem),
+ GFP_ATOMIC);
+ if (!tmp)
+ return -ENOMEM;
+ if (n->size) {
+ memcpy(tmp, n->value,
+ sizeof(struct type_pf_elem) * n->size);
+ kfree(n->value);
+ }
+ n->value = tmp;
+ n->size += AHASH_INIT_SIZE;
+ }
+ type_pf_data_copy(ahash_data(n, n->pos++), value);
+ return 0;
+}
+
+/* Resize a hash: create a new hash table with doubling the hashsize
+ * and inserting the elements to it. Repeat until we succeed or
+ * fail due to memory pressures. */
+static int
+type_pf_resize(struct ip_set *set, bool retried)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t, *orig = h->table;
+ u8 htable_bits = orig->htable_bits;
+ const struct type_pf_elem *data;
+ struct hbucket *n, *m;
+ u32 i, j;
+ int ret;
+
+retry:
+ ret = 0;
+ htable_bits++;
+ pr_debug("attempt to resize set %s from %u to %u, t %p\n",
+ set->name, orig->htable_bits, htable_bits, orig);
+ if (!htable_bits)
+ /* In case we have plenty of memory :-) */
+ return -IPSET_ERR_HASH_FULL;
+ t = ip_set_alloc(sizeof(*t)
+ + jhash_size(htable_bits) * sizeof(struct hbucket));
+ if (!t)
+ return -ENOMEM;
+ t->htable_bits = htable_bits;
+
+ read_lock_bh(&set->lock);
+ for (i = 0; i < jhash_size(orig->htable_bits); i++) {
+ n = hbucket(orig, i);
+ for (j = 0; j < n->pos; j++) {
+ data = ahash_data(n, j);
+ m = hbucket(t, HKEY(data, h->initval, htable_bits));
+ ret = type_pf_elem_add(m, data);
+ if (ret < 0) {
+ read_unlock_bh(&set->lock);
+ ahash_destroy(t);
+ if (ret == -EAGAIN)
+ goto retry;
+ return ret;
+ }
+ }
+ }
+
+ rcu_assign_pointer(h->table, t);
+ read_unlock_bh(&set->lock);
+
+ /* Give time to other readers of the set */
+ synchronize_rcu_bh();
+
+ pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
+ orig->htable_bits, orig, t->htable_bits, t);
+ ahash_destroy(orig);
+
+ return 0;
+}
+
+/* Add an element to a hash and update the internal counters when succeeded,
+ * otherwise report the proper error code. */
+static int
+type_pf_add(struct ip_set *set, void *value, u32 timeout)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t;
+ const struct type_pf_elem *d = value;
+ struct hbucket *n;
+ int i, ret = 0;
+ u32 key;
+
+ if (h->elements >= h->maxelem)
+ return -IPSET_ERR_HASH_FULL;
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ key = HKEY(value, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++)
+ if (type_pf_data_equal(ahash_data(n, i), d)) {
+ ret = -IPSET_ERR_EXIST;
+ goto out;
+ }
+
+ ret = type_pf_elem_add(n, value);
+ if (ret != 0)
+ goto out;
+
+#ifdef IP_SET_HASH_WITH_NETS
+ add_cidr(h, d->cidr, HOST_MASK);
+#endif
+ h->elements++;
+out:
+ rcu_read_unlock_bh();
+ return ret;
+}
+
+/* Delete an element from the hash: swap it with the last element
+ * and free up space if possible.
+ */
+static int
+type_pf_del(struct ip_set *set, void *value, u32 timeout)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t = h->table;
+ const struct type_pf_elem *d = value;
+ struct hbucket *n;
+ int i;
+ struct type_pf_elem *data;
+ u32 key;
+
+ key = HKEY(value, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i);
+ if (!type_pf_data_equal(data, d))
+ continue;
+ if (i != n->pos - 1)
+ /* Not last one */
+ type_pf_data_copy(data, ahash_data(n, n->pos - 1));
+
+ n->pos--;
+ h->elements--;
+#ifdef IP_SET_HASH_WITH_NETS
+ del_cidr(h, d->cidr, HOST_MASK);
+#endif
+ if (n->pos + AHASH_INIT_SIZE < n->size) {
+ void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+ * sizeof(struct type_pf_elem),
+ GFP_ATOMIC);
+ if (!tmp)
+ return 0;
+ n->size -= AHASH_INIT_SIZE;
+ memcpy(tmp, n->value,
+ n->size * sizeof(struct type_pf_elem));
+ kfree(n->value);
+ n->value = tmp;
+ }
+ return 0;
+ }
+
+ return -IPSET_ERR_EXIST;
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+
+/* Special test function which takes into account the different network
+ * sizes added to the set */
+static int
+type_pf_test_cidrs(struct ip_set *set, struct type_pf_elem *d, u32 timeout)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t = h->table;
+ struct hbucket *n;
+ const struct type_pf_elem *data;
+ int i, j = 0;
+ u32 key;
+ u8 host_mask = SET_HOST_MASK(set->family);
+
+ pr_debug("test by nets\n");
+ for (; j < host_mask && h->nets[j].cidr; j++) {
+ type_pf_data_netmask(d, h->nets[j].cidr);
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i);
+ if (type_pf_data_equal(data, d))
+ return 1;
+ }
+ }
+ return 0;
+}
+#endif
+
+/* Test whether the element is added to the set */
+static int
+type_pf_test(struct ip_set *set, void *value, u32 timeout)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t = h->table;
+ struct type_pf_elem *d = value;
+ struct hbucket *n;
+ const struct type_pf_elem *data;
+ int i;
+ u32 key;
+
+#ifdef IP_SET_HASH_WITH_NETS
+ /* If we test an IP address and not a network address,
+ * try all possible network sizes */
+ if (d->cidr == SET_HOST_MASK(set->family))
+ return type_pf_test_cidrs(set, d, timeout);
+#endif
+
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i);
+ if (type_pf_data_equal(data, d))
+ return 1;
+ }
+ return 0;
+}
+
+/* Reply a HEADER request: fill out the header part of the set */
+static int
+type_pf_head(struct ip_set *set, struct sk_buff *skb)
+{
+ const struct ip_set_hash *h = set->data;
+ struct nlattr *nested;
+ size_t memsize;
+
+ read_lock_bh(&set->lock);
+ memsize = ahash_memsize(h, with_timeout(h->timeout)
+ ? sizeof(struct type_pf_telem)
+ : sizeof(struct type_pf_elem),
+ set->family == AF_INET ? 32 : 128);
+ read_unlock_bh(&set->lock);
+
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested)
+ goto nla_put_failure;
+ NLA_PUT_NET32(skb, IPSET_ATTR_HASHSIZE,
+ htonl(jhash_size(h->table->htable_bits)));
+ NLA_PUT_NET32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem));
+#ifdef IP_SET_HASH_WITH_NETMASK
+ if (h->netmask != HOST_MASK)
+ NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, h->netmask);
+#endif
+ NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
+ htonl(atomic_read(&set->ref) - 1));
+ NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize));
+ if (with_timeout(h->timeout))
+ NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(h->timeout));
+ ipset_nest_end(skb, nested);
+
+ return 0;
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+/* Reply a LIST/SAVE request: dump the elements of the specified set */
+static int
+type_pf_list(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct ip_set_hash *h = set->data;
+ const struct htable *t = h->table;
+ struct nlattr *atd, *nested;
+ const struct hbucket *n;
+ const struct type_pf_elem *data;
+ u32 first = cb->args[2];
+ /* We assume that one hash bucket fills into one page */
+ void *incomplete;
+ int i;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ pr_debug("list hash set %s\n", set->name);
+ for (; cb->args[2] < jhash_size(t->htable_bits); cb->args[2]++) {
+ incomplete = skb_tail_pointer(skb);
+ n = hbucket(t, cb->args[2]);
+ pr_debug("cb->args[2]: %lu, t %p n %p\n", cb->args[2], t, n);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_data(n, i);
+ pr_debug("list hash %lu hbucket %p i %u, data %p\n",
+ cb->args[2], n, i, data);
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (cb->args[2] == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (type_pf_data_list(skb, data))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, incomplete);
+ ipset_nest_end(skb, atd);
+ if (unlikely(first == cb->args[2])) {
+ pr_warning("Can't list set %s: one bucket does not fit into "
+ "a message. Please report it!\n", set->name);
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+static int
+type_pf_kadt(struct ip_set *set, const struct sk_buff * skb,
+ enum ipset_adt adt, u8 pf, u8 dim, u8 flags);
+static int
+type_pf_uadt(struct ip_set *set, struct nlattr *tb[],
+ enum ipset_adt adt, u32 *lineno, u32 flags);
+
+static const struct ip_set_type_variant type_pf_variant = {
+ .kadt = type_pf_kadt,
+ .uadt = type_pf_uadt,
+ .adt = {
+ [IPSET_ADD] = type_pf_add,
+ [IPSET_DEL] = type_pf_del,
+ [IPSET_TEST] = type_pf_test,
+ },
+ .destroy = type_pf_destroy,
+ .flush = type_pf_flush,
+ .head = type_pf_head,
+ .list = type_pf_list,
+ .resize = type_pf_resize,
+ .same_set = type_pf_same_set,
+};
+
+/* Flavour with timeout support */
+
+#define ahash_tdata(n, i) \
+ (struct type_pf_elem *)((struct type_pf_telem *)((n)->value) + (i))
+
+static inline u32
+type_pf_data_timeout(const struct type_pf_elem *data)
+{
+ const struct type_pf_telem *tdata =
+ (const struct type_pf_telem *) data;
+
+ return tdata->timeout;
+}
+
+static inline bool
+type_pf_data_expired(const struct type_pf_elem *data)
+{
+ const struct type_pf_telem *tdata =
+ (const struct type_pf_telem *) data;
+
+ return ip_set_timeout_expired(tdata->timeout);
+}
+
+static inline void
+type_pf_data_timeout_set(struct type_pf_elem *data, u32 timeout)
+{
+ struct type_pf_telem *tdata = (struct type_pf_telem *) data;
+
+ tdata->timeout = ip_set_timeout_set(timeout);
+}
+
+static int
+type_pf_elem_tadd(struct hbucket *n, const struct type_pf_elem *value,
+ u32 timeout)
+{
+ struct type_pf_elem *data;
+
+ if (n->pos >= n->size) {
+ void *tmp;
+
+ if (n->size >= AHASH_MAX_SIZE)
+ /* Trigger rehashing */
+ return -EAGAIN;
+
+ tmp = kzalloc((n->size + AHASH_INIT_SIZE)
+ * sizeof(struct type_pf_telem),
+ GFP_ATOMIC);
+ if (!tmp)
+ return -ENOMEM;
+ if (n->size) {
+ memcpy(tmp, n->value,
+ sizeof(struct type_pf_telem) * n->size);
+ kfree(n->value);
+ }
+ n->value = tmp;
+ n->size += AHASH_INIT_SIZE;
+ }
+ data = ahash_tdata(n, n->pos++);
+ type_pf_data_copy(data, value);
+ type_pf_data_timeout_set(data, timeout);
+ return 0;
+}
+
+/* Delete expired elements from the hashtable */
+static void
+type_pf_expire(struct ip_set_hash *h)
+{
+ struct htable *t = h->table;
+ struct hbucket *n;
+ struct type_pf_elem *data;
+ u32 i;
+ int j;
+
+ for (i = 0; i < jhash_size(t->htable_bits); i++) {
+ n = hbucket(t, i);
+ for (j = 0; j < n->pos; j++) {
+ data = ahash_tdata(n, j);
+ if (type_pf_data_expired(data)) {
+ pr_debug("expired %u/%u\n", i, j);
+#ifdef IP_SET_HASH_WITH_NETS
+ del_cidr(h, data->cidr, HOST_MASK);
+#endif
+ if (j != n->pos - 1)
+ /* Not last one */
+ type_pf_data_copy(data,
+ ahash_tdata(n, n->pos - 1));
+ n->pos--;
+ h->elements--;
+ }
+ }
+ if (n->pos + AHASH_INIT_SIZE < n->size) {
+ void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+ * sizeof(struct type_pf_telem),
+ GFP_ATOMIC);
+ if (!tmp)
+ /* Still try to delete expired elements */
+ continue;
+ n->size -= AHASH_INIT_SIZE;
+ memcpy(tmp, n->value,
+ n->size * sizeof(struct type_pf_telem));
+ kfree(n->value);
+ n->value = tmp;
+ }
+ }
+}
+
+static int
+type_pf_tresize(struct ip_set *set, bool retried)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t, *orig = h->table;
+ u8 htable_bits = orig->htable_bits;
+ const struct type_pf_elem *data;
+ struct hbucket *n, *m;
+ u32 i, j;
+ int ret;
+
+ /* Try to cleanup once */
+ if (!retried) {
+ i = h->elements;
+ write_lock_bh(&set->lock);
+ type_pf_expire(set->data);
+ write_unlock_bh(&set->lock);
+ if (h->elements < i)
+ return 0;
+ }
+
+retry:
+ ret = 0;
+ htable_bits++;
+ if (!htable_bits)
+ /* In case we have plenty of memory :-) */
+ return -IPSET_ERR_HASH_FULL;
+ t = ip_set_alloc(sizeof(*t)
+ + jhash_size(htable_bits) * sizeof(struct hbucket));
+ if (!t)
+ return -ENOMEM;
+ t->htable_bits = htable_bits;
+
+ read_lock_bh(&set->lock);
+ for (i = 0; i < jhash_size(orig->htable_bits); i++) {
+ n = hbucket(orig, i);
+ for (j = 0; j < n->pos; j++) {
+ data = ahash_tdata(n, j);
+ m = hbucket(t, HKEY(data, h->initval, htable_bits));
+ ret = type_pf_elem_tadd(m, data,
+ type_pf_data_timeout(data));
+ if (ret < 0) {
+ read_unlock_bh(&set->lock);
+ ahash_destroy(t);
+ if (ret == -EAGAIN)
+ goto retry;
+ return ret;
+ }
+ }
+ }
+
+ rcu_assign_pointer(h->table, t);
+ read_unlock_bh(&set->lock);
+
+ /* Give time to other readers of the set */
+ synchronize_rcu_bh();
+
+ ahash_destroy(orig);
+
+ return 0;
+}
+
+static int
+type_pf_tadd(struct ip_set *set, void *value, u32 timeout)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t = h->table;
+ const struct type_pf_elem *d = value;
+ struct hbucket *n;
+ struct type_pf_elem *data;
+ int ret = 0, i, j = AHASH_MAX_SIZE + 1;
+ u32 key;
+
+ if (h->elements >= h->maxelem)
+ /* FIXME: when set is full, we slow down here */
+ type_pf_expire(h);
+ if (h->elements >= h->maxelem)
+ return -IPSET_ERR_HASH_FULL;
+
+ rcu_read_lock_bh();
+ t = rcu_dereference_bh(h->table);
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_tdata(n, i);
+ if (type_pf_data_equal(data, d)) {
+ if (type_pf_data_expired(data))
+ j = i;
+ else {
+ ret = -IPSET_ERR_EXIST;
+ goto out;
+ }
+ } else if (j == AHASH_MAX_SIZE + 1 &&
+ type_pf_data_expired(data))
+ j = i;
+ }
+ if (j != AHASH_MAX_SIZE + 1) {
+ data = ahash_tdata(n, j);
+#ifdef IP_SET_HASH_WITH_NETS
+ del_cidr(h, data->cidr, HOST_MASK);
+ add_cidr(h, d->cidr, HOST_MASK);
+#endif
+ type_pf_data_copy(data, d);
+ type_pf_data_timeout_set(data, timeout);
+ goto out;
+ }
+ ret = type_pf_elem_tadd(n, d, timeout);
+ if (ret != 0)
+ goto out;
+
+#ifdef IP_SET_HASH_WITH_NETS
+ add_cidr(h, d->cidr, HOST_MASK);
+#endif
+ h->elements++;
+out:
+ rcu_read_unlock_bh();
+ return ret;
+}
+
+static int
+type_pf_tdel(struct ip_set *set, void *value, u32 timeout)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t = h->table;
+ const struct type_pf_elem *d = value;
+ struct hbucket *n;
+ int i, ret = 0;
+ struct type_pf_elem *data;
+ u32 key;
+
+ key = HKEY(value, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_tdata(n, i);
+ if (!type_pf_data_equal(data, d))
+ continue;
+ if (type_pf_data_expired(data))
+ ret = -IPSET_ERR_EXIST;
+ if (i != n->pos - 1)
+ /* Not last one */
+ type_pf_data_copy(data, ahash_tdata(n, n->pos - 1));
+
+ n->pos--;
+ h->elements--;
+#ifdef IP_SET_HASH_WITH_NETS
+ del_cidr(h, d->cidr, HOST_MASK);
+#endif
+ if (n->pos + AHASH_INIT_SIZE < n->size) {
+ void *tmp = kzalloc((n->size - AHASH_INIT_SIZE)
+ * sizeof(struct type_pf_telem),
+ GFP_ATOMIC);
+ if (!tmp)
+ return 0;
+ n->size -= AHASH_INIT_SIZE;
+ memcpy(tmp, n->value,
+ n->size * sizeof(struct type_pf_telem));
+ kfree(n->value);
+ n->value = tmp;
+ }
+ return 0;
+ }
+
+ return -IPSET_ERR_EXIST;
+}
+
+#ifdef IP_SET_HASH_WITH_NETS
+static int
+type_pf_ttest_cidrs(struct ip_set *set, struct type_pf_elem *d, u32 timeout)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t = h->table;
+ struct type_pf_elem *data;
+ struct hbucket *n;
+ int i, j = 0;
+ u32 key;
+ u8 host_mask = SET_HOST_MASK(set->family);
+
+ for (; j < host_mask && h->nets[j].cidr; j++) {
+ type_pf_data_netmask(d, h->nets[j].cidr);
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_tdata(n, i);
+ if (type_pf_data_equal(data, d))
+ return !type_pf_data_expired(data);
+ }
+ }
+ return 0;
+}
+#endif
+
+static int
+type_pf_ttest(struct ip_set *set, void *value, u32 timeout)
+{
+ struct ip_set_hash *h = set->data;
+ struct htable *t = h->table;
+ struct type_pf_elem *data, *d = value;
+ struct hbucket *n;
+ int i;
+ u32 key;
+
+#ifdef IP_SET_HASH_WITH_NETS
+ if (d->cidr == SET_HOST_MASK(set->family))
+ return type_pf_ttest_cidrs(set, d, timeout);
+#endif
+ key = HKEY(d, h->initval, t->htable_bits);
+ n = hbucket(t, key);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_tdata(n, i);
+ if (type_pf_data_equal(data, d))
+ return !type_pf_data_expired(data);
+ }
+ return 0;
+}
+
+static int
+type_pf_tlist(const struct ip_set *set,
+ struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct ip_set_hash *h = set->data;
+ const struct htable *t = h->table;
+ struct nlattr *atd, *nested;
+ const struct hbucket *n;
+ const struct type_pf_elem *data;
+ u32 first = cb->args[2];
+ /* We assume that one hash bucket fills into one page */
+ void *incomplete;
+ int i;
+
+ atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
+ if (!atd)
+ return -EMSGSIZE;
+ for (; cb->args[2] < jhash_size(t->htable_bits); cb->args[2]++) {
+ incomplete = skb_tail_pointer(skb);
+ n = hbucket(t, cb->args[2]);
+ for (i = 0; i < n->pos; i++) {
+ data = ahash_tdata(n, i);
+ pr_debug("list %p %u\n", n, i);
+ if (type_pf_data_expired(data))
+ continue;
+ pr_debug("do list %p %u\n", n, i);
+ nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
+ if (!nested) {
+ if (cb->args[2] == first) {
+ nla_nest_cancel(skb, atd);
+ return -EMSGSIZE;
+ } else
+ goto nla_put_failure;
+ }
+ if (type_pf_data_tlist(skb, data))
+ goto nla_put_failure;
+ ipset_nest_end(skb, nested);
+ }
+ }
+ ipset_nest_end(skb, atd);
+ /* Set listing finished */
+ cb->args[2] = 0;
+
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, incomplete);
+ ipset_nest_end(skb, atd);
+ if (unlikely(first == cb->args[2])) {
+ pr_warning("Can't list set %s: one bucket does not fit into "
+ "a message. Please report it!\n", set->name);
+ cb->args[2] = 0;
+ return -EMSGSIZE;
+ }
+ return 0;
+}
+
+static const struct ip_set_type_variant type_pf_tvariant = {
+ .kadt = type_pf_kadt,
+ .uadt = type_pf_uadt,
+ .adt = {
+ [IPSET_ADD] = type_pf_tadd,
+ [IPSET_DEL] = type_pf_tdel,
+ [IPSET_TEST] = type_pf_ttest,
+ },
+ .destroy = type_pf_destroy,
+ .flush = type_pf_flush,
+ .head = type_pf_head,
+ .list = type_pf_tlist,
+ .resize = type_pf_tresize,
+ .same_set = type_pf_same_set,
+};
+
+static void
+type_pf_gc(unsigned long ul_set)
+{
+ struct ip_set *set = (struct ip_set *) ul_set;
+ struct ip_set_hash *h = set->data;
+
+ pr_debug("called\n");
+ write_lock_bh(&set->lock);
+ type_pf_expire(h);
+ write_unlock_bh(&set->lock);
+
+ h->gc.expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ;
+ add_timer(&h->gc);
+}
+
+static void
+type_pf_gc_init(struct ip_set *set)
+{
+ struct ip_set_hash *h = set->data;
+
+ init_timer(&h->gc);
+ h->gc.data = (unsigned long) set;
+ h->gc.function = type_pf_gc;
+ h->gc.expires = jiffies + IPSET_GC_PERIOD(h->timeout) * HZ;
+ add_timer(&h->gc);
+ pr_debug("gc initialized, run in every %u\n",
+ IPSET_GC_PERIOD(h->timeout));
+}
+
+#undef type_pf_data_equal
+#undef type_pf_data_isnull
+#undef type_pf_data_copy
+#undef type_pf_data_zero_out
+#undef type_pf_data_list
+#undef type_pf_data_tlist
+
+#undef type_pf_elem
+#undef type_pf_telem
+#undef type_pf_data_timeout
+#undef type_pf_data_expired
+#undef type_pf_data_netmask
+#undef type_pf_data_timeout_set
+
+#undef type_pf_elem_add
+#undef type_pf_add
+#undef type_pf_del
+#undef type_pf_test_cidrs
+#undef type_pf_test
+
+#undef type_pf_elem_tadd
+#undef type_pf_expire
+#undef type_pf_tadd
+#undef type_pf_tdel
+#undef type_pf_ttest_cidrs
+#undef type_pf_ttest
+
+#undef type_pf_resize
+#undef type_pf_tresize
+#undef type_pf_flush
+#undef type_pf_destroy
+#undef type_pf_head
+#undef type_pf_list
+#undef type_pf_tlist
+#undef type_pf_same_set
+#undef type_pf_kadt
+#undef type_pf_uadt
+#undef type_pf_gc
+#undef type_pf_gc_init
+#undef type_pf_variant
+#undef type_pf_tvariant
diff --git a/include/linux/netfilter/ipset/ip_set_hash.h b/include/linux/netfilter/ipset/ip_set_hash.h
new file mode 100644
index 000000000000..b86f15c04524
--- /dev/null
+++ b/include/linux/netfilter/ipset/ip_set_hash.h
@@ -0,0 +1,26 @@
+#ifndef __IP_SET_HASH_H
+#define __IP_SET_HASH_H
+
+/* Hash type specific error codes */
+enum {
+ /* Hash is full */
+ IPSET_ERR_HASH_FULL = IPSET_ERR_TYPE_SPECIFIC,
+ /* Null-valued element */
+ IPSET_ERR_HASH_ELEM,
+ /* Invalid protocol */
+ IPSET_ERR_INVALID_PROTO,
+ /* Protocol missing but must be specified */
+ IPSET_ERR_MISSING_PROTO,
+};
+
+#ifdef __KERNEL__
+
+#define IPSET_DEFAULT_HASHSIZE 1024
+#define IPSET_MIMINAL_HASHSIZE 64
+#define IPSET_DEFAULT_MAXELEM 65536
+#define IPSET_DEFAULT_PROBES 4
+#define IPSET_DEFAULT_RESIZE 100
+
+#endif /* __KERNEL__ */
+
+#endif /* __IP_SET_HASH_H */