84d317418d18430b3bde3d29bacb23f3547d7c3a
[releases.git] / nft_set_rbtree.c
1 /*
2  * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License version 2 as
6  * published by the Free Software Foundation.
7  *
8  * Development of this code funded by Astaro AG (http://www.astaro.com/)
9  */
10
11 #include <linux/kernel.h>
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/list.h>
15 #include <linux/rbtree.h>
16 #include <linux/netlink.h>
17 #include <linux/netfilter.h>
18 #include <linux/netfilter/nf_tables.h>
19 #include <net/netfilter/nf_tables.h>
20
/*
 * Per-set private data.
 *
 * Writers (insert/remove/gc) take @lock for writing and bump @count around
 * every tree mutation; the lockless data-path lookup samples @count and
 * retries under @lock (read side) if it observes a concurrent writer.
 * @gc_work periodically reaps expired elements for NFT_SET_TIMEOUT sets.
 */
struct nft_rbtree {
	struct rb_root		root;	/* tree of nft_rbtree_elem, keyed by element key */
	rwlock_t		lock;	/* serializes writers and locked readers */
	seqcount_t		count;	/* invalidates in-flight lockless walks */
	struct delayed_work	gc_work; /* periodic expiry collection */
};
27
/*
 * One set element: rbtree linkage plus the nftables extension area,
 * which holds the key, optional flags, timeout, etc., accessed through
 * the nft_set_ext_*() helpers.
 */
struct nft_rbtree_elem {
	struct rb_node		node;	/* linkage in nft_rbtree::root */
	struct nft_set_ext	ext;	/* key and other extensions live here */
};
32
33 static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
34 {
35         return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
36                (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
37 }
38
39 static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
40 {
41         return !nft_rbtree_interval_end(rbe);
42 }
43
44 static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
45                              const struct nft_rbtree_elem *interval)
46 {
47         return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
48 }
49
/*
 * Lockless data-path lookup.  Descends the tree under RCU looking for an
 * element matching @key that is active in the current generation; for
 * interval sets, a key falling inside an interval matches the closest
 * preceding interval-start element (tracked in @interval).
 *
 * Returns true with *ext set on a match.  Returns false either on a
 * genuine miss or when a concurrent writer bumped the seqcount past
 * @seq mid-walk — the caller distinguishes the two by re-checking the
 * seqcount (see nft_rbtree_lookup()).
 */
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
				const u32 *key, const struct nft_set_ext **ext,
				unsigned int seq)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct nft_rbtree_elem *rbe, *interval = NULL;
	u8 genmask = nft_genmask_cur(net);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		/* Writer active since @seq was sampled: abort, caller retries. */
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			/* Don't let an end element with the same key as the
			 * candidate start shadow it: keep @interval pointing
			 * at the start in that case.
			 */
			if (interval &&
			    nft_rbtree_equal(set, this, interval) &&
			    nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(interval))
				continue;
			interval = rbe;
		} else if (d > 0)
			parent = rcu_dereference_raw(parent->rb_right);
		else {
			/* Exact key match; skip elements not visible in the
			 * current generation.
			 */
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}
			if (nft_rbtree_interval_end(rbe)) {
				/* Anonymous sets are never updated, so a hit
				 * on an end element is a definitive miss.
				 */
				if (nft_set_is_anonymous(set))
					return false;
				parent = rcu_dereference_raw(parent->rb_left);
				interval = NULL;
				continue;
			}

			*ext = &rbe->ext;
			return true;
		}
	}

	/* No exact match: for interval sets, fall back to the closest
	 * preceding active interval start.
	 */
	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    nft_rbtree_interval_start(interval)) {
		*ext = &interval->ext;
		return true;
	}

	return false;
}
107
/*
 * Data-path lookup entry point.  First attempt is fully lockless under
 * the seqcount; if it fails AND a writer invalidated the walk, retry
 * under the read lock, where writers are excluded and the result is
 * authoritative.
 */
static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
			      const u32 *key, const struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	bool ret;

	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	/* A hit is always valid; a miss is only final if no writer ran. */
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return ret;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_lookup(net, set, key, ext, seq);
	read_unlock_bh(&priv->lock);

	return ret;
}
126
/*
 * Control-plane counterpart of __nft_rbtree_lookup(): find the element
 * matching @key whose interval-end flag agrees with @flags
 * (NFT_SET_ELEM_INTERVAL_END selects end elements) and which is active
 * in @genmask.  On an exact-key hit with mismatching flags the descent
 * continues to the left; for interval sets a miss falls back to the
 * closest preceding active interval start.
 *
 * Returns true with *elem set on success; false on a miss or when the
 * seqcount @seq was invalidated by a concurrent writer (caller retries
 * under the lock, see nft_rbtree_get()).
 */
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
			     const u32 *key, struct nft_rbtree_elem **elem,
			     unsigned int seq, unsigned int flags, u8 genmask)
{
	struct nft_rbtree_elem *rbe, *interval = NULL;
	struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent;
	const void *this;
	int d;

	parent = rcu_dereference_raw(priv->root.rb_node);
	while (parent != NULL) {
		if (read_seqcount_retry(&priv->count, seq))
			return false;

		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		this = nft_set_ext_key(&rbe->ext);
		d = memcmp(this, key, set->klen);
		if (d < 0) {
			parent = rcu_dereference_raw(parent->rb_left);
			interval = rbe;
		} else if (d > 0) {
			parent = rcu_dereference_raw(parent->rb_right);
		} else {
			if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = rcu_dereference_raw(parent->rb_left);
				continue;
			}

			/* Elements without a flags extension match any
			 * request; otherwise the end bit must agree with
			 * the caller's @flags.
			 */
			if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
			    (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
			    (flags & NFT_SET_ELEM_INTERVAL_END)) {
				*elem = rbe;
				return true;
			}

			/* Flag mismatch on an end element: it closes the
			 * interval, so discard the candidate start.
			 */
			if (nft_rbtree_interval_end(rbe))
				interval = NULL;

			parent = rcu_dereference_raw(parent->rb_left);
		}
	}

	if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
	    nft_set_elem_active(&interval->ext, genmask) &&
	    !nft_rbtree_interval_end(interval)) {
		*elem = interval;
		return true;
	}

	return false;
}
180
/*
 * .get callback: return the matching element for @elem->key, or
 * ERR_PTR(-ENOENT).  Same lockless-then-locked retry protocol as
 * nft_rbtree_lookup().
 */
static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
			    const struct nft_set_elem *elem, unsigned int flags)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	unsigned int seq = read_seqcount_begin(&priv->count);
	struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
	const u32 *key = (const u32 *)&elem->key.val;
	u8 genmask = nft_genmask_cur(net);
	bool ret;

	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	/* A hit is final; a miss is final only if no writer interfered. */
	if (ret || !read_seqcount_retry(&priv->count, seq))
		return rbe;

	read_lock_bh(&priv->lock);
	seq = read_seqcount_begin(&priv->count);
	ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
	if (!ret)
		rbe = ERR_PTR(-ENOENT);
	read_unlock_bh(&priv->lock);

	return rbe;
}
204
/*
 * Link @new into the tree.  Equal-key elements are disambiguated by
 * their interval role: when the existing node is an end and the new one
 * a start the descent goes left, in the opposite case right, so an
 * interval's two boundary elements can share a key.  An active element
 * with the same key and same role yields -EEXIST with *ext pointing at
 * the clashing element; inactive duplicates are skipped to the left.
 *
 * Caller must hold the write lock and seqcount write section (see
 * nft_rbtree_insert()).  NOTE(review): only exact-key clashes are
 * rejected here — partial interval overlaps are not detected.
 */
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			       struct nft_rbtree_elem *new,
			       struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	u8 genmask = nft_genmask_next(net);
	struct nft_rbtree_elem *rbe;
	struct rb_node *parent, **p;
	int d;

	parent = NULL;
	p = &priv->root.rb_node;
	while (*p != NULL) {
		parent = *p;
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);
		d = memcmp(nft_set_ext_key(&rbe->ext),
			   nft_set_ext_key(&new->ext),
			   set->klen);
		if (d < 0)
			p = &parent->rb_left;
		else if (d > 0)
			p = &parent->rb_right;
		else {
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(new)) {
				p = &parent->rb_left;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(new)) {
				p = &parent->rb_right;
			} else if (nft_set_elem_active(&rbe->ext, genmask)) {
				*ext = &rbe->ext;
				return -EEXIST;
			} else {
				p = &parent->rb_left;
			}
		}
	}
	/* RCU-safe link: lockless walkers may observe the new node. */
	rb_link_node_rcu(&new->node, parent, p);
	rb_insert_color(&new->node, &priv->root);
	return 0;
}
246
/*
 * .insert callback: take the write lock and open a seqcount write
 * section (so lockless lookups retry), then delegate to
 * __nft_rbtree_insert().  Returns 0 or -EEXIST.
 */
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
			     const struct nft_set_elem *elem,
			     struct nft_set_ext **ext)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;
	int err;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	err = __nft_rbtree_insert(net, set, rbe, ext);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	return err;
}
263
/*
 * .remove callback: unlink the element from the tree under the write
 * lock and seqcount.  Freeing the element is the caller's
 * responsibility (commit path).
 */
static void nft_rbtree_remove(const struct net *net,
			      const struct nft_set *set,
			      const struct nft_set_elem *elem)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe = elem->priv;

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	rb_erase(&rbe->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);
}
277
/*
 * .activate callback (abort/commit path): flip the element's generation
 * bit and clear the busy marker that was set when it was deactivated.
 */
static void nft_rbtree_activate(const struct net *net,
				const struct nft_set *set,
				const struct nft_set_elem *elem)
{
	struct nft_rbtree_elem *rbe = elem->priv;

	nft_set_elem_change_active(net, set, &rbe->ext);
	nft_set_elem_clear_busy(&rbe->ext);
}
287
/*
 * .flush callback: deactivate the element @priv for the next generation
 * unless it is both busy (concurrently being updated) and still active.
 * Returns true when the element was deactivated.
 */
static bool nft_rbtree_flush(const struct net *net,
			     const struct nft_set *set, void *priv)
{
	struct nft_rbtree_elem *rbe = priv;

	if (!nft_set_elem_mark_busy(&rbe->ext) ||
	    !nft_is_active(net, &rbe->ext)) {
		nft_set_elem_change_active(net, set, &rbe->ext);
		return true;
	}
	return false;
}
300
/*
 * .deactivate callback: locate the tree element corresponding to
 * @elem (matching both key and interval role, against the *next*
 * generation mask) and deactivate it via nft_rbtree_flush().
 * Returns the element found, or NULL.
 *
 * NOTE(review): runs without taking priv->lock — presumably serialized
 * by the netlink transaction mutex; confirm against callers.
 */
static void *nft_rbtree_deactivate(const struct net *net,
				   const struct nft_set *set,
				   const struct nft_set_elem *elem)
{
	const struct nft_rbtree *priv = nft_set_priv(set);
	const struct rb_node *parent = priv->root.rb_node;
	struct nft_rbtree_elem *rbe, *this = elem->priv;
	u8 genmask = nft_genmask_next(net);
	int d;

	while (parent != NULL) {
		rbe = rb_entry(parent, struct nft_rbtree_elem, node);

		d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
					   set->klen);
		if (d < 0)
			parent = parent->rb_left;
		else if (d > 0)
			parent = parent->rb_right;
		else {
			/* Same key, different interval role: keep descending
			 * in the direction the insert path would have taken.
			 */
			if (nft_rbtree_interval_end(rbe) &&
			    nft_rbtree_interval_start(this)) {
				parent = parent->rb_left;
				continue;
			} else if (nft_rbtree_interval_start(rbe) &&
				   nft_rbtree_interval_end(this)) {
				parent = parent->rb_right;
				continue;
			} else if (!nft_set_elem_active(&rbe->ext, genmask)) {
				parent = parent->rb_left;
				continue;
			}
			nft_rbtree_flush(net, set, rbe);
			return rbe;
		}
	}
	return NULL;
}
339
/*
 * .walk callback: iterate over all elements in tree order, calling
 * iter->fn on each element active in iter->genmask, honouring the
 * iter->skip resume point.  The read lock is held across the whole
 * walk, blocking writers for its duration.
 */
static void nft_rbtree_walk(const struct nft_ctx *ctx,
			    struct nft_set *set,
			    struct nft_set_iter *iter)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct nft_set_elem elem;
	struct rb_node *node;

	read_lock_bh(&priv->lock);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		if (iter->count < iter->skip)
			goto cont;
		if (!nft_set_elem_active(&rbe->ext, iter->genmask))
			goto cont;

		elem.priv = rbe;

		iter->err = iter->fn(ctx, set, iter, &elem);
		if (iter->err < 0) {
			read_unlock_bh(&priv->lock);
			return;
		}
cont:
		iter->count++;
	}
	read_unlock_bh(&priv->lock);
}
370
/*
 * Periodic garbage collection worker for sets with timeouts.  Walks the
 * whole tree under the write lock and seqcount, collecting expired,
 * non-busy interval-start elements together with the most recently seen
 * end element (@rbe_end) into an RCU-deferred gc batch.  Erasing a
 * collected start is delayed by one iteration via @rbe_prev so the
 * iterator never stands on a node it is about to remove.  Reschedules
 * itself at the set's gc interval.
 */
static void nft_rbtree_gc(struct work_struct *work)
{
	struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
	struct nft_set_gc_batch *gcb = NULL;
	struct nft_rbtree *priv;
	struct rb_node *node;
	struct nft_set *set;

	priv = container_of(work, struct nft_rbtree, gc_work.work);
	set  = nft_set_container_of(priv);

	write_lock_bh(&priv->lock);
	write_seqcount_begin(&priv->count);
	for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
		rbe = rb_entry(node, struct nft_rbtree_elem, node);

		/* Remember the latest end element; it is only reaped
		 * together with its expired start.
		 */
		if (nft_rbtree_interval_end(rbe)) {
			rbe_end = rbe;
			continue;
		}
		if (!nft_set_elem_expired(&rbe->ext))
			continue;
		if (nft_set_elem_mark_busy(&rbe->ext))
			continue;

		/* Erase the start collected on the previous round, now
		 * that the iterator has moved past it.
		 */
		if (rbe_prev) {
			rb_erase(&rbe_prev->node, &priv->root);
			rbe_prev = NULL;
		}
		gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
		if (!gcb)
			break;

		atomic_dec(&set->nelems);
		nft_set_gc_batch_add(gcb, rbe);
		rbe_prev = rbe;

		if (rbe_end) {
			atomic_dec(&set->nelems);
			nft_set_gc_batch_add(gcb, rbe_end);
			rb_erase(&rbe_end->node, &priv->root);
			rbe_end = NULL;
		}
		/* NOTE(review): combined with the for-loop increment this
		 * advances twice after a collection, skipping the next
		 * node until a later pass — verify this is intentional.
		 */
		node = rb_next(node);
		if (!node)
			break;
	}
	if (rbe_prev)
		rb_erase(&rbe_prev->node, &priv->root);
	write_seqcount_end(&priv->count);
	write_unlock_bh(&priv->lock);

	/* Frees the batched elements after an RCU grace period. */
	nft_set_gc_batch_complete(gcb);

	queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
			   nft_set_gc_interval(set));
}
428
/*
 * .privsize callback: size of the per-set private area; fixed,
 * independent of the netlink attributes and descriptor.
 */
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
			       const struct nft_set_desc *desc)
{
	return sizeof(struct nft_rbtree);
}
434
/*
 * .init callback: initialize the private area allocated by the core
 * (empty tree, lock, seqcount) and, for sets with timeouts, schedule
 * the first gc run.  Always returns 0.
 */
static int nft_rbtree_init(const struct nft_set *set,
			   const struct nft_set_desc *desc,
			   const struct nlattr * const nla[])
{
	struct nft_rbtree *priv = nft_set_priv(set);

	rwlock_init(&priv->lock);
	seqcount_init(&priv->count);
	priv->root = RB_ROOT;

	INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
	if (set->flags & NFT_SET_TIMEOUT)
		queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
				   nft_set_gc_interval(set));

	return 0;
}
452
/*
 * .destroy callback: stop the gc worker, wait for in-flight RCU callbacks,
 * then erase and free every remaining element.  No locks are taken —
 * the set must no longer be reachable by lookups at this point.
 */
static void nft_rbtree_destroy(const struct nft_set *set)
{
	struct nft_rbtree *priv = nft_set_priv(set);
	struct nft_rbtree_elem *rbe;
	struct rb_node *node;

	cancel_delayed_work_sync(&priv->gc_work);
	rcu_barrier();
	while ((node = priv->root.rb_node) != NULL) {
		rb_erase(node, &priv->root);
		rbe = rb_entry(node, struct nft_rbtree_elem, node);
		nft_set_elem_destroy(set, rbe, true);
	}
}
467
468 static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
469                                 struct nft_set_estimate *est)
470 {
471         if (desc->size)
472                 est->size = sizeof(struct nft_rbtree) +
473                             desc->size * sizeof(struct nft_rbtree_elem);
474         else
475                 est->size = ~0;
476
477         est->lookup = NFT_SET_CLASS_O_LOG_N;
478         est->space  = NFT_SET_CLASS_O_N;
479
480         return true;
481 }
482
/*
 * rbtree set backend registration: O(log n) lookups, supporting
 * intervals, maps, object maps and timeouts.
 */
struct nft_set_type nft_set_rbtree_type __read_mostly = {
	.owner		= THIS_MODULE,
	.features	= NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
	.ops		= {
		.privsize	= nft_rbtree_privsize,
		.elemsize	= offsetof(struct nft_rbtree_elem, ext),
		.estimate	= nft_rbtree_estimate,
		.init		= nft_rbtree_init,
		.destroy	= nft_rbtree_destroy,
		.insert		= nft_rbtree_insert,
		.remove		= nft_rbtree_remove,
		.deactivate	= nft_rbtree_deactivate,
		.flush		= nft_rbtree_flush,
		.activate	= nft_rbtree_activate,
		.lookup		= nft_rbtree_lookup,
		.walk		= nft_rbtree_walk,
		.get		= nft_rbtree_get,
	},
};