/* Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>

#include "bpf_lru_list.h"
#define LOCAL_FREE_TARGET	(128)
#define LOCAL_NR_SCANS		LOCAL_FREE_TARGET

#define PERCPU_FREE_TARGET	(4)
#define PERCPU_NR_SCANS		PERCPU_FREE_TARGET
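
/* The common (shared) LRU refills a CPU's local free list in batches of
 * LOCAL_FREE_TARGET nodes, amortizing the cost of taking the global list
 * lock over many allocations.  The per-CPU LRU only needs the much
 * smaller PERCPU_FREE_TARGET because each CPU owns its own list and
 * lock.  The specific values are tuning choices, not hard requirements.
 */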
/* Helpers to get the local list index */
#define LOCAL_LIST_IDX(t)	((t) - BPF_LOCAL_LIST_T_OFFSET)
#define LOCAL_FREE_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
#define LOCAL_PENDING_LIST_IDX	LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
#define IS_LOCAL_LIST_TYPE(t)	((t) >= BPF_LOCAL_LIST_T_OFFSET)
static int get_next_cpu(int cpu)
{
	cpu = cpumask_next(cpu, cpu_possible_mask);
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(cpu_possible_mask);
	return cpu;
}
/* Local list helpers */
static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
{
	return &loc_l->lists[LOCAL_FREE_LIST_IDX];
}

static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
{
	return &loc_l->lists[LOCAL_PENDING_LIST_IDX];
}
/* bpf_lru_node helpers */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
	return READ_ONCE(node->ref);
}

static void bpf_lru_node_clear_ref(struct bpf_lru_node *node)
{
	WRITE_ONCE(node->ref, 0);
}
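
/* The ref bit is expected to be set locklessly by the map lookup path
 * (bpf_lru_node_set_ref() in bpf_lru_list.h) and is only cleared here
 * while the owning list lock is held.  READ_ONCE()/WRITE_ONCE() keep
 * those unsynchronized reads and writes of the single ref byte
 * well defined.
 */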
static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
				   enum bpf_lru_list_type type)
{
	if (type < NR_BPF_LRU_LIST_COUNT)
		l->counts[type]++;
}

static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
				   enum bpf_lru_list_type type)
{
	if (type < NR_BPF_LRU_LIST_COUNT)
		l->counts[type]--;
}
static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
					struct bpf_lru_node *node,
					struct list_head *free_list,
					enum bpf_lru_list_type tgt_free_type)
{
	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
		return;

	/* If the removing node is the next_inactive_rotation candidate,
	 * move the next_inactive_rotation pointer also.
	 */
	if (&node->list == l->next_inactive_rotation)
		l->next_inactive_rotation = l->next_inactive_rotation->prev;

	bpf_lru_list_count_dec(l, node->type);

	node->type = tgt_free_type;
	list_move(&node->list, free_list);
}
/* Move nodes from local list to the LRU list */
static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
				   struct bpf_lru_node *node,
				   enum bpf_lru_list_type tgt_type)
{
	if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) ||
	    WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
		return;

	bpf_lru_list_count_inc(l, tgt_type);
	node->type = tgt_type;
	bpf_lru_node_clear_ref(node);
	list_move(&node->list, &l->lists[tgt_type]);
}
/* Move nodes between or within active and inactive list (like
 * active to inactive, inactive to active or tail of active back to
 * the head of active).
 */
static void __bpf_lru_node_move(struct bpf_lru_list *l,
				struct bpf_lru_node *node,
				enum bpf_lru_list_type tgt_type)
{
	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) ||
	    WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
		return;

	if (node->type != tgt_type) {
		bpf_lru_list_count_dec(l, node->type);
		bpf_lru_list_count_inc(l, tgt_type);
		node->type = tgt_type;
	}
	bpf_lru_node_clear_ref(node);

	/* If the moving node is the next_inactive_rotation candidate,
	 * move the next_inactive_rotation pointer also.
	 */
	if (&node->list == l->next_inactive_rotation)
		l->next_inactive_rotation = l->next_inactive_rotation->prev;

	list_move(&node->list, &l->lists[tgt_type]);
}
static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
{
	return l->counts[BPF_LRU_LIST_T_INACTIVE] <
	       l->counts[BPF_LRU_LIST_T_ACTIVE];
}
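
/* The inactive list is "low" when it holds fewer nodes than the active
 * list.  Only then is the more expensive active-list rotation performed,
 * demoting unreferenced active nodes so that the inactive list stays
 * populated enough for the shrink path to work on.
 */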
/* Rotate the active list:
 * 1. Start from the tail
 * 2. If the node has the ref bit set, it will be rotated
 *    back to the head of active list with the ref bit cleared.
 *    Give this node one more chance to survive in the active list.
 * 3. If the ref bit is not set, move it to the head of the
 *    inactive list.
 * 4. It will at most scan nr_scans nodes
 */
static void __bpf_lru_list_rotate_active(struct bpf_lru *lru,
					 struct bpf_lru_list *l)
{
	struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE];
	struct bpf_lru_node *node, *tmp_node, *first_node;
	unsigned int i = 0;

	first_node = list_first_entry(active, struct bpf_lru_node, list);
	list_for_each_entry_safe_reverse(node, tmp_node, active, list) {
		if (bpf_lru_node_is_ref(node))
			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
		else
			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);

		if (++i == lru->nr_scans || node == first_node)
			break;
	}
}
/* Rotate the inactive list.  It starts from the next_inactive_rotation
 * 1. If the node has ref bit set, it will be moved to the head
 *    of active list with the ref bit cleared.
 * 2. If the node does not have ref bit set, it will leave it
 *    at its current location (i.e. do nothing) so that it can
 *    be considered during the next inactive_shrink.
 * 3. It will at most scan nr_scans nodes
 */
static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru,
					   struct bpf_lru_list *l)
{
	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
	struct list_head *cur, *last, *next = inactive;
	struct bpf_lru_node *node;
	unsigned int i = 0;

	if (list_empty(inactive))
		return;

	last = l->next_inactive_rotation->next;
	if (last == inactive)
		last = last->next;

	cur = l->next_inactive_rotation;
	while (i < lru->nr_scans) {
		if (cur == inactive) {
			cur = cur->prev;
			continue;
		}

		node = list_entry(cur, struct bpf_lru_node, list);
		next = cur->prev;
		if (bpf_lru_node_is_ref(node))
			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
		if (cur == last)
			break;
		cur = next;
		i++;
	}

	l->next_inactive_rotation = next;
}
/* Shrink the inactive list.  It starts from the tail of the
 * inactive list and only moves the nodes without the ref bit
 * set to the designated free list.
 */
static unsigned int
__bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
			       struct bpf_lru_list *l,
			       unsigned int tgt_nshrink,
			       struct list_head *free_list,
			       enum bpf_lru_list_type tgt_free_type)
{
	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
	struct bpf_lru_node *node, *tmp_node;
	unsigned int nshrinked = 0;
	unsigned int i = 0;

	list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
		if (bpf_lru_node_is_ref(node)) {
			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
		} else if (lru->del_from_htab(lru->del_arg, node)) {
			__bpf_lru_node_move_to_free(l, node, free_list,
						    tgt_free_type);
			if (++nshrinked == tgt_nshrink)
				break;
		}

		if (++i == lru->nr_scans)
			break;
	}

	return nshrinked;
}
/* 1. Rotate the active list (if needed)
 * 2. Always rotate the inactive list
 */
static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l)
{
	if (bpf_lru_list_inactive_low(l))
		__bpf_lru_list_rotate_active(lru, l);

	__bpf_lru_list_rotate_inactive(lru, l);
}
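
/* Rotation is not driven by a background thread; it happens
 * opportunistically on the allocation path (bpf_lru_list_pop_free_to_local()
 * for the common LRU, bpf_percpu_lru_pop_free() for the per-CPU LRU), so
 * its cost is paid by the callers that need free nodes.
 */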
/* Calls __bpf_lru_list_shrink_inactive() to shrink some
 * ref-bit-cleared nodes and move them to the designated
 * free list.
 *
 * If it cannot get a free node after calling
 * __bpf_lru_list_shrink_inactive(), it will just remove
 * one node from either inactive or active list without
 * honoring the ref-bit.  It prefers inactive list to active
 * list in this situation.
 */
static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
					  struct bpf_lru_list *l,
					  unsigned int tgt_nshrink,
					  struct list_head *free_list,
					  enum bpf_lru_list_type tgt_free_type)
{
	struct bpf_lru_node *node, *tmp_node;
	struct list_head *force_shrink_list;
	unsigned int nshrinked;

	nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink,
						   free_list, tgt_free_type);
	if (nshrinked)
		return nshrinked;

	/* Do a force shrink by ignoring the reference bit */
	if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE]))
		force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE];
	else
		force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE];

	list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list,
					 list) {
		if (lru->del_from_htab(lru->del_arg, node)) {
			__bpf_lru_node_move_to_free(l, node, free_list,
						    tgt_free_type);
			return 1;
		}
	}

	return 0;
}
/* Flush the nodes from the local pending list to the LRU list */
static void __local_list_flush(struct bpf_lru_list *l,
			       struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_node *node, *tmp_node;

	list_for_each_entry_safe_reverse(node, tmp_node,
					 local_pending_list(loc_l), list) {
		if (bpf_lru_node_is_ref(node))
			__bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE);
		else
			__bpf_lru_node_move_in(l, node,
					       BPF_LRU_LIST_T_INACTIVE);
	}
}
static void bpf_lru_list_push_free(struct bpf_lru_list *l,
				   struct bpf_lru_node *node)
{
	unsigned long flags;

	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
		return;

	raw_spin_lock_irqsave(&l->lock, flags);
	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
	raw_spin_unlock_irqrestore(&l->lock, flags);
}
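
/* bpf_lru_list_pop_free_to_local() below is called with the current
 * CPU's loc_l->lock already held and IRQs disabled (see
 * bpf_common_lru_pop_free()), which is why a plain raw_spin_lock() on
 * the global list lock is sufficient.  When both locks are held, the
 * order is always loc_l->lock first, then the global l->lock.
 */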
static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
					   struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_list *l = &lru->common_lru.lru_list;
	struct bpf_lru_node *node, *tmp_node;
	unsigned int nfree = 0;

	raw_spin_lock(&l->lock);

	__local_list_flush(l, loc_l);

	__bpf_lru_list_rotate(lru, l);

	list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE],
				 list) {
		__bpf_lru_node_move_to_free(l, node, local_free_list(loc_l),
					    BPF_LRU_LOCAL_LIST_T_FREE);
		if (++nfree == LOCAL_FREE_TARGET)
			break;
	}

	if (nfree < LOCAL_FREE_TARGET)
		__bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree,
				      local_free_list(loc_l),
				      BPF_LRU_LOCAL_LIST_T_FREE);

	raw_spin_unlock(&l->lock);
}
static void __local_list_add_pending(struct bpf_lru *lru,
				     struct bpf_lru_locallist *loc_l,
				     int cpu,
				     struct bpf_lru_node *node,
				     u32 hash)
{
	*(u32 *)((void *)node + lru->hash_offset) = hash;
	node->cpu = cpu;
	node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
	bpf_lru_node_clear_ref(node);
	list_add(&node->list, local_pending_list(loc_l));
}
static struct bpf_lru_node *
__local_list_pop_free(struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_node *node;

	node = list_first_entry_or_null(local_free_list(loc_l),
					struct bpf_lru_node, list);
	if (node)
		list_del(&node->list);

	return node;
}
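
/* __local_list_pop_pending() below works in two passes: the first pass
 * skips nodes whose ref bit is set (they were recently looked up), and
 * only if nothing could be reclaimed does the "force" pass take a node
 * regardless of its ref bit.
 */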
static struct bpf_lru_node *
__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_node *node;
	bool force = false;

ignore_ref:
	/* Get from the tail (i.e. older element) of the pending list. */
	list_for_each_entry_reverse(node, local_pending_list(loc_l), list) {
		if ((!bpf_lru_node_is_ref(node) || force) &&
		    lru->del_from_htab(lru->del_arg, node)) {
			list_del(&node->list);
			return node;
		}
	}

	if (!force) {
		force = true;
		goto ignore_ref;
	}

	return NULL;
}
static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
						    u32 hash)
{
	struct list_head *free_list;
	struct bpf_lru_node *node = NULL;
	struct bpf_lru_list *l;
	unsigned long flags;
	int cpu = raw_smp_processor_id();

	l = per_cpu_ptr(lru->percpu_lru, cpu);

	raw_spin_lock_irqsave(&l->lock, flags);

	__bpf_lru_list_rotate(lru, l);

	free_list = &l->lists[BPF_LRU_LIST_T_FREE];
	if (list_empty(free_list))
		__bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list,
				      BPF_LRU_LIST_T_FREE);

	if (!list_empty(free_list)) {
		node = list_first_entry(free_list, struct bpf_lru_node, list);
		*(u32 *)((void *)node + lru->hash_offset) = hash;
		bpf_lru_node_clear_ref(node);
		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
	}

	raw_spin_unlock_irqrestore(&l->lock, flags);

	return node;
}
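
/* The per-CPU flavor never steals from other CPUs: if this CPU's list
 * cannot supply a free node even after shrinking, the allocation simply
 * fails.  The trade-off is fully independent per-CPU locking at the cost
 * of possible allocation failure under per-CPU imbalance.
 */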
static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
						    u32 hash)
{
	struct bpf_lru_locallist *loc_l, *steal_loc_l;
	struct bpf_common_lru *clru = &lru->common_lru;
	struct bpf_lru_node *node;
	int steal, first_steal;
	unsigned long flags;
	int cpu = raw_smp_processor_id();

	loc_l = per_cpu_ptr(clru->local_list, cpu);

	raw_spin_lock_irqsave(&loc_l->lock, flags);

	node = __local_list_pop_free(loc_l);
	if (!node) {
		bpf_lru_list_pop_free_to_local(lru, loc_l);
		node = __local_list_pop_free(loc_l);
	}

	if (node)
		__local_list_add_pending(lru, loc_l, cpu, node, hash);

	raw_spin_unlock_irqrestore(&loc_l->lock, flags);

	if (node)
		return node;

	/* No free nodes found from the local free list and
	 * the global LRU list.
	 *
	 * Steal from the local free/pending list of the
	 * current CPU and remote CPU in RR.  It starts
	 * with the loc_l->next_steal CPU.
	 */
	first_steal = loc_l->next_steal;
	steal = first_steal;
	do {
		steal_loc_l = per_cpu_ptr(clru->local_list, steal);

		raw_spin_lock_irqsave(&steal_loc_l->lock, flags);
		node = __local_list_pop_free(steal_loc_l);
		if (!node)
			node = __local_list_pop_pending(lru, steal_loc_l);
		raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags);

		steal = get_next_cpu(steal);
	} while (!node && steal != first_steal);

	loc_l->next_steal = steal;

	if (node) {
		raw_spin_lock_irqsave(&loc_l->lock, flags);
		__local_list_add_pending(lru, loc_l, cpu, node, hash);
		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
	}

	return node;
}
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
{
	if (lru->percpu)
		return bpf_percpu_lru_pop_free(lru, hash);
	else
		return bpf_common_lru_pop_free(lru, hash);
}
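
/* Illustrative usage sketch of the pop/push pair by an LRU map
 * implementation (hypothetical names; see kernel/bpf/hashtab.c for the
 * real call sites and callbacks):
 *
 *	bpf_lru_init(&map->lru, percpu, hash_offset, delete_node_cb, map);
 *	bpf_lru_populate(&map->lru, buf, node_offset, elem_size, nr_elems);
 *	...
 *	node = bpf_lru_pop_free(&map->lru, hash);    // on element insert
 *	...
 *	bpf_lru_push_free(&map->lru, node);          // on element delete
 */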
static void bpf_common_lru_push_free(struct bpf_lru *lru,
				     struct bpf_lru_node *node)
{
	u8 node_type = READ_ONCE(node->type);
	unsigned long flags;

	if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) ||
	    WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE))
		return;

	if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) {
		struct bpf_lru_locallist *loc_l;

		loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);

		raw_spin_lock_irqsave(&loc_l->lock, flags);

		if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) {
			raw_spin_unlock_irqrestore(&loc_l->lock, flags);
			goto check_lru_list;
		}

		node->type = BPF_LRU_LOCAL_LIST_T_FREE;
		bpf_lru_node_clear_ref(node);
		list_move(&node->list, local_free_list(loc_l));

		raw_spin_unlock_irqrestore(&loc_l->lock, flags);
		return;
	}

check_lru_list:
	bpf_lru_list_push_free(&lru->common_lru.lru_list, node);
}
static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
				     struct bpf_lru_node *node)
{
	struct bpf_lru_list *l;
	unsigned long flags;

	l = per_cpu_ptr(lru->percpu_lru, node->cpu);

	raw_spin_lock_irqsave(&l->lock, flags);
	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
	raw_spin_unlock_irqrestore(&l->lock, flags);
}
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
{
	if (lru->percpu)
		bpf_percpu_lru_push_free(lru, node);
	else
		bpf_common_lru_push_free(lru, node);
}
static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
				    u32 node_offset, u32 elem_size,
				    u32 nr_elems)
{
	struct bpf_lru_list *l = &lru->common_lru.lru_list;
	u32 i;

	for (i = 0; i < nr_elems; i++) {
		struct bpf_lru_node *node;

		node = (struct bpf_lru_node *)(buf + node_offset);
		node->type = BPF_LRU_LIST_T_FREE;
		bpf_lru_node_clear_ref(node);
		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
		buf += elem_size;
	}
}
static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
				    u32 node_offset, u32 elem_size,
				    u32 nr_elems)
{
	u32 i, pcpu_entries;
	int cpu;
	struct bpf_lru_list *l;

	pcpu_entries = nr_elems / num_possible_cpus();
	i = 0;

	for_each_possible_cpu(cpu) {
		struct bpf_lru_node *node;

		l = per_cpu_ptr(lru->percpu_lru, cpu);
again:
		node = (struct bpf_lru_node *)(buf + node_offset);
		node->cpu = cpu;
		node->type = BPF_LRU_LIST_T_FREE;
		bpf_lru_node_clear_ref(node);
		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
		i++;
		buf += elem_size;
		if (i == nr_elems)
			break;
		if (i % pcpu_entries)
			goto again;
	}
}
void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
		      u32 elem_size, u32 nr_elems)
{
	if (lru->percpu)
		bpf_percpu_lru_populate(lru, buf, node_offset, elem_size,
					nr_elems);
	else
		bpf_common_lru_populate(lru, buf, node_offset, elem_size,
					nr_elems);
}
static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
{
	int i;

	for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++)
		INIT_LIST_HEAD(&loc_l->lists[i]);

	loc_l->next_steal = cpu;

	raw_spin_lock_init(&loc_l->lock);
}
static void bpf_lru_list_init(struct bpf_lru_list *l)
{
	int i;

	for (i = 0; i < NR_BPF_LRU_LIST_T; i++)
		INIT_LIST_HEAD(&l->lists[i]);

	for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++)
		l->counts[i] = 0;

	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];

	raw_spin_lock_init(&l->lock);
}
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
		 del_from_htab_func del_from_htab, void *del_arg)
{
	int cpu;

	if (percpu) {
		lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
		if (!lru->percpu_lru)
			return -ENOMEM;

		for_each_possible_cpu(cpu) {
			struct bpf_lru_list *l;

			l = per_cpu_ptr(lru->percpu_lru, cpu);
			bpf_lru_list_init(l);
		}
		lru->nr_scans = PERCPU_NR_SCANS;
	} else {
		struct bpf_common_lru *clru = &lru->common_lru;

		clru->local_list = alloc_percpu(struct bpf_lru_locallist);
		if (!clru->local_list)
			return -ENOMEM;

		for_each_possible_cpu(cpu) {
			struct bpf_lru_locallist *loc_l;

			loc_l = per_cpu_ptr(clru->local_list, cpu);
			bpf_lru_locallist_init(loc_l, cpu);
		}

		bpf_lru_list_init(&clru->lru_list);
		lru->nr_scans = LOCAL_NR_SCANS;
	}

	lru->percpu = percpu;
	lru->del_from_htab = del_from_htab;
	lru->del_arg = del_arg;
	lru->hash_offset = hash_offset;

	return 0;
}
void bpf_lru_destroy(struct bpf_lru *lru)
{
	if (lru->percpu)
		free_percpu(lru->percpu_lru);
	else
		free_percpu(lru->common_lru.local_list);
}