GNU Linux-libre 4.19.314-gnu1
[releases.git] / net / netfilter / nft_set_rbtree.c
1 /*
2  * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License version 2 as
6  * published by the Free Software Foundation.
7  *
8  * Development of this code funded by Astaro AG (http://www.astaro.com/)
9  */
10
11 #include <linux/kernel.h>
12 #include <linux/init.h>
13 #include <linux/module.h>
14 #include <linux/list.h>
15 #include <linux/rbtree.h>
16 #include <linux/netlink.h>
17 #include <linux/netfilter.h>
18 #include <linux/netfilter/nf_tables.h>
19 #include <net/netfilter/nf_tables.h>
20
21 struct nft_rbtree {
22         struct rb_root          root;
23         rwlock_t                lock;
24         seqcount_t              count;
25         struct delayed_work     gc_work;
26 };
27
28 struct nft_rbtree_elem {
29         struct rb_node          node;
30         struct nft_set_ext      ext;
31 };
32
33 static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
34 {
35         return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
36                (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
37 }
38
39 static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
40 {
41         return !nft_rbtree_interval_end(rbe);
42 }
43
44 static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
45                              const struct nft_rbtree_elem *interval)
46 {
47         return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
48 }
49
50 static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
51                                 const u32 *key, const struct nft_set_ext **ext,
52                                 unsigned int seq)
53 {
54         struct nft_rbtree *priv = nft_set_priv(set);
55         const struct nft_rbtree_elem *rbe, *interval = NULL;
56         u8 genmask = nft_genmask_cur(net);
57         const struct rb_node *parent;
58         const void *this;
59         int d;
60
61         parent = rcu_dereference_raw(priv->root.rb_node);
62         while (parent != NULL) {
63                 if (read_seqcount_retry(&priv->count, seq))
64                         return false;
65
66                 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
67
68                 this = nft_set_ext_key(&rbe->ext);
69                 d = memcmp(this, key, set->klen);
70                 if (d < 0) {
71                         parent = rcu_dereference_raw(parent->rb_left);
72                         if (interval &&
73                             nft_rbtree_equal(set, this, interval) &&
74                             nft_rbtree_interval_end(rbe) &&
75                             nft_rbtree_interval_start(interval))
76                                 continue;
77                         interval = rbe;
78                 } else if (d > 0)
79                         parent = rcu_dereference_raw(parent->rb_right);
80                 else {
81                         if (!nft_set_elem_active(&rbe->ext, genmask)) {
82                                 parent = rcu_dereference_raw(parent->rb_left);
83                                 continue;
84                         }
85                         if (nft_rbtree_interval_end(rbe)) {
86                                 if (nft_set_is_anonymous(set))
87                                         return false;
88                                 parent = rcu_dereference_raw(parent->rb_left);
89                                 interval = NULL;
90                                 continue;
91                         }
92
93                         *ext = &rbe->ext;
94                         return true;
95                 }
96         }
97
98         if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
99             nft_set_elem_active(&interval->ext, genmask) &&
100             nft_rbtree_interval_start(interval)) {
101                 *ext = &interval->ext;
102                 return true;
103         }
104
105         return false;
106 }
107
108 static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
109                               const u32 *key, const struct nft_set_ext **ext)
110 {
111         struct nft_rbtree *priv = nft_set_priv(set);
112         unsigned int seq = read_seqcount_begin(&priv->count);
113         bool ret;
114
115         ret = __nft_rbtree_lookup(net, set, key, ext, seq);
116         if (ret || !read_seqcount_retry(&priv->count, seq))
117                 return ret;
118
119         read_lock_bh(&priv->lock);
120         seq = read_seqcount_begin(&priv->count);
121         ret = __nft_rbtree_lookup(net, set, key, ext, seq);
122         read_unlock_bh(&priv->lock);
123
124         return ret;
125 }
126
127 static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
128                              const u32 *key, struct nft_rbtree_elem **elem,
129                              unsigned int seq, unsigned int flags, u8 genmask)
130 {
131         struct nft_rbtree_elem *rbe, *interval = NULL;
132         struct nft_rbtree *priv = nft_set_priv(set);
133         const struct rb_node *parent;
134         const void *this;
135         int d;
136
137         parent = rcu_dereference_raw(priv->root.rb_node);
138         while (parent != NULL) {
139                 if (read_seqcount_retry(&priv->count, seq))
140                         return false;
141
142                 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
143
144                 this = nft_set_ext_key(&rbe->ext);
145                 d = memcmp(this, key, set->klen);
146                 if (d < 0) {
147                         parent = rcu_dereference_raw(parent->rb_left);
148                         interval = rbe;
149                 } else if (d > 0) {
150                         parent = rcu_dereference_raw(parent->rb_right);
151                 } else {
152                         if (!nft_set_elem_active(&rbe->ext, genmask)) {
153                                 parent = rcu_dereference_raw(parent->rb_left);
154                                 continue;
155                         }
156
157                         if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
158                             (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
159                             (flags & NFT_SET_ELEM_INTERVAL_END)) {
160                                 *elem = rbe;
161                                 return true;
162                         }
163
164                         if (nft_rbtree_interval_end(rbe))
165                                 interval = NULL;
166
167                         parent = rcu_dereference_raw(parent->rb_left);
168                 }
169         }
170
171         if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
172             nft_set_elem_active(&interval->ext, genmask) &&
173             !nft_rbtree_interval_end(interval)) {
174                 *elem = interval;
175                 return true;
176         }
177
178         return false;
179 }
180
181 static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
182                             const struct nft_set_elem *elem, unsigned int flags)
183 {
184         struct nft_rbtree *priv = nft_set_priv(set);
185         unsigned int seq = read_seqcount_begin(&priv->count);
186         struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
187         const u32 *key = (const u32 *)&elem->key.val;
188         u8 genmask = nft_genmask_cur(net);
189         bool ret;
190
191         ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
192         if (ret || !read_seqcount_retry(&priv->count, seq))
193                 return rbe;
194
195         read_lock_bh(&priv->lock);
196         seq = read_seqcount_begin(&priv->count);
197         ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
198         if (!ret)
199                 rbe = ERR_PTR(-ENOENT);
200         read_unlock_bh(&priv->lock);
201
202         return rbe;
203 }
204
205 static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
206                                struct nft_rbtree_elem *new,
207                                struct nft_set_ext **ext)
208 {
209         struct nft_rbtree *priv = nft_set_priv(set);
210         u8 genmask = nft_genmask_next(net);
211         struct nft_rbtree_elem *rbe;
212         struct rb_node *parent, **p;
213         int d;
214
215         parent = NULL;
216         p = &priv->root.rb_node;
217         while (*p != NULL) {
218                 parent = *p;
219                 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
220                 d = memcmp(nft_set_ext_key(&rbe->ext),
221                            nft_set_ext_key(&new->ext),
222                            set->klen);
223                 if (d < 0)
224                         p = &parent->rb_left;
225                 else if (d > 0)
226                         p = &parent->rb_right;
227                 else {
228                         if (nft_rbtree_interval_end(rbe) &&
229                             nft_rbtree_interval_start(new)) {
230                                 p = &parent->rb_left;
231                         } else if (nft_rbtree_interval_start(rbe) &&
232                                    nft_rbtree_interval_end(new)) {
233                                 p = &parent->rb_right;
234                         } else if (nft_set_elem_active(&rbe->ext, genmask)) {
235                                 *ext = &rbe->ext;
236                                 return -EEXIST;
237                         } else {
238                                 p = &parent->rb_left;
239                         }
240                 }
241         }
242         rb_link_node_rcu(&new->node, parent, p);
243         rb_insert_color(&new->node, &priv->root);
244         return 0;
245 }
246
247 static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
248                              const struct nft_set_elem *elem,
249                              struct nft_set_ext **ext)
250 {
251         struct nft_rbtree *priv = nft_set_priv(set);
252         struct nft_rbtree_elem *rbe = elem->priv;
253         int err;
254
255         write_lock_bh(&priv->lock);
256         write_seqcount_begin(&priv->count);
257         err = __nft_rbtree_insert(net, set, rbe, ext);
258         write_seqcount_end(&priv->count);
259         write_unlock_bh(&priv->lock);
260
261         return err;
262 }
263
264 static void nft_rbtree_remove(const struct net *net,
265                               const struct nft_set *set,
266                               const struct nft_set_elem *elem)
267 {
268         struct nft_rbtree *priv = nft_set_priv(set);
269         struct nft_rbtree_elem *rbe = elem->priv;
270
271         write_lock_bh(&priv->lock);
272         write_seqcount_begin(&priv->count);
273         rb_erase(&rbe->node, &priv->root);
274         write_seqcount_end(&priv->count);
275         write_unlock_bh(&priv->lock);
276 }
277
278 static void nft_rbtree_activate(const struct net *net,
279                                 const struct nft_set *set,
280                                 const struct nft_set_elem *elem)
281 {
282         struct nft_rbtree_elem *rbe = elem->priv;
283
284         nft_set_elem_change_active(net, set, &rbe->ext);
285         nft_set_elem_clear_busy(&rbe->ext);
286 }
287
288 static bool nft_rbtree_flush(const struct net *net,
289                              const struct nft_set *set, void *priv)
290 {
291         struct nft_rbtree_elem *rbe = priv;
292
293         if (!nft_set_elem_mark_busy(&rbe->ext) ||
294             !nft_is_active(net, &rbe->ext)) {
295                 nft_set_elem_change_active(net, set, &rbe->ext);
296                 return true;
297         }
298         return false;
299 }
300
301 static void *nft_rbtree_deactivate(const struct net *net,
302                                    const struct nft_set *set,
303                                    const struct nft_set_elem *elem)
304 {
305         const struct nft_rbtree *priv = nft_set_priv(set);
306         const struct rb_node *parent = priv->root.rb_node;
307         struct nft_rbtree_elem *rbe, *this = elem->priv;
308         u8 genmask = nft_genmask_next(net);
309         int d;
310
311         while (parent != NULL) {
312                 rbe = rb_entry(parent, struct nft_rbtree_elem, node);
313
314                 d = memcmp(nft_set_ext_key(&rbe->ext), &elem->key.val,
315                                            set->klen);
316                 if (d < 0)
317                         parent = parent->rb_left;
318                 else if (d > 0)
319                         parent = parent->rb_right;
320                 else {
321                         if (nft_rbtree_interval_end(rbe) &&
322                             nft_rbtree_interval_start(this)) {
323                                 parent = parent->rb_left;
324                                 continue;
325                         } else if (nft_rbtree_interval_start(rbe) &&
326                                    nft_rbtree_interval_end(this)) {
327                                 parent = parent->rb_right;
328                                 continue;
329                         } else if (nft_set_elem_expired(&rbe->ext)) {
330                                 break;
331                         } else if (!nft_set_elem_active(&rbe->ext, genmask)) {
332                                 parent = parent->rb_left;
333                                 continue;
334                         }
335                         nft_rbtree_flush(net, set, rbe);
336                         return rbe;
337                 }
338         }
339         return NULL;
340 }
341
342 static void nft_rbtree_walk(const struct nft_ctx *ctx,
343                             struct nft_set *set,
344                             struct nft_set_iter *iter)
345 {
346         struct nft_rbtree *priv = nft_set_priv(set);
347         struct nft_rbtree_elem *rbe;
348         struct nft_set_elem elem;
349         struct rb_node *node;
350
351         read_lock_bh(&priv->lock);
352         for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
353                 rbe = rb_entry(node, struct nft_rbtree_elem, node);
354
355                 if (iter->count < iter->skip)
356                         goto cont;
357                 if (!nft_set_elem_active(&rbe->ext, iter->genmask))
358                         goto cont;
359
360                 elem.priv = rbe;
361
362                 iter->err = iter->fn(ctx, set, iter, &elem);
363                 if (iter->err < 0) {
364                         read_unlock_bh(&priv->lock);
365                         return;
366                 }
367 cont:
368                 iter->count++;
369         }
370         read_unlock_bh(&priv->lock);
371 }
372
373 static void nft_rbtree_gc(struct work_struct *work)
374 {
375         struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL;
376         struct nft_set_gc_batch *gcb = NULL;
377         struct nft_rbtree *priv;
378         struct rb_node *node;
379         struct nft_set *set;
380         struct net *net;
381         u8 genmask;
382
383         priv = container_of(work, struct nft_rbtree, gc_work.work);
384         set  = nft_set_container_of(priv);
385         net  = read_pnet(&set->net);
386         genmask = nft_genmask_cur(net);
387
388         write_lock_bh(&priv->lock);
389         write_seqcount_begin(&priv->count);
390         for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
391                 rbe = rb_entry(node, struct nft_rbtree_elem, node);
392
393                 if (!nft_set_elem_active(&rbe->ext, genmask))
394                         continue;
395
396                 /* elements are reversed in the rbtree for historical reasons,
397                  * from highest to lowest value, that is why end element is
398                  * always visited before the start element.
399                  */
400                 if (nft_rbtree_interval_end(rbe)) {
401                         rbe_end = rbe;
402                         continue;
403                 }
404                 if (!nft_set_elem_expired(&rbe->ext))
405                         continue;
406
407                 if (nft_set_elem_mark_busy(&rbe->ext)) {
408                         rbe_end = NULL;
409                         continue;
410                 }
411
412                 if (rbe_prev) {
413                         rb_erase(&rbe_prev->node, &priv->root);
414                         rbe_prev = NULL;
415                 }
416                 gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
417                 if (!gcb)
418                         break;
419
420                 atomic_dec(&set->nelems);
421                 nft_set_gc_batch_add(gcb, rbe);
422                 rbe_prev = rbe;
423
424                 if (rbe_end) {
425                         atomic_dec(&set->nelems);
426                         nft_set_gc_batch_add(gcb, rbe_end);
427                         rb_erase(&rbe_end->node, &priv->root);
428                         rbe_end = NULL;
429                 }
430                 node = rb_next(node);
431                 if (!node)
432                         break;
433         }
434         if (rbe_prev)
435                 rb_erase(&rbe_prev->node, &priv->root);
436         write_seqcount_end(&priv->count);
437         write_unlock_bh(&priv->lock);
438
439         nft_set_gc_batch_complete(gcb);
440
441         queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
442                            nft_set_gc_interval(set));
443 }
444
445 static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
446                                const struct nft_set_desc *desc)
447 {
448         return sizeof(struct nft_rbtree);
449 }
450
451 static int nft_rbtree_init(const struct nft_set *set,
452                            const struct nft_set_desc *desc,
453                            const struct nlattr * const nla[])
454 {
455         struct nft_rbtree *priv = nft_set_priv(set);
456
457         rwlock_init(&priv->lock);
458         seqcount_init(&priv->count);
459         priv->root = RB_ROOT;
460
461         INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rbtree_gc);
462         if (set->flags & NFT_SET_TIMEOUT)
463                 queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
464                                    nft_set_gc_interval(set));
465
466         return 0;
467 }
468
469 static void nft_rbtree_destroy(const struct nft_set *set)
470 {
471         struct nft_rbtree *priv = nft_set_priv(set);
472         struct nft_rbtree_elem *rbe;
473         struct rb_node *node;
474
475         cancel_delayed_work_sync(&priv->gc_work);
476         rcu_barrier();
477         while ((node = priv->root.rb_node) != NULL) {
478                 rb_erase(node, &priv->root);
479                 rbe = rb_entry(node, struct nft_rbtree_elem, node);
480                 nft_set_elem_destroy(set, rbe, true);
481         }
482 }
483
484 static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
485                                 struct nft_set_estimate *est)
486 {
487         if (desc->size)
488                 est->size = sizeof(struct nft_rbtree) +
489                             desc->size * sizeof(struct nft_rbtree_elem);
490         else
491                 est->size = ~0;
492
493         est->lookup = NFT_SET_CLASS_O_LOG_N;
494         est->space  = NFT_SET_CLASS_O_N;
495
496         return true;
497 }
498
499 struct nft_set_type nft_set_rbtree_type __read_mostly = {
500         .owner          = THIS_MODULE,
501         .features       = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
502         .ops            = {
503                 .privsize       = nft_rbtree_privsize,
504                 .elemsize       = offsetof(struct nft_rbtree_elem, ext),
505                 .estimate       = nft_rbtree_estimate,
506                 .init           = nft_rbtree_init,
507                 .destroy        = nft_rbtree_destroy,
508                 .insert         = nft_rbtree_insert,
509                 .remove         = nft_rbtree_remove,
510                 .deactivate     = nft_rbtree_deactivate,
511                 .flush          = nft_rbtree_flush,
512                 .activate       = nft_rbtree_activate,
513                 .lookup         = nft_rbtree_lookup,
514                 .walk           = nft_rbtree_walk,
515                 .get            = nft_rbtree_get,
516         },
517 };