// SPDX-License-Identifier: GPL-2.0
/* IPVS:	Maglev Hashing scheduling module
 *
 * Authors:	Inju Song <inju.song@navercorp.com>
 *
 */

/* The mh algorithm is to assign a preference list of all the lookup
 * table positions to each destination and populate the table with
 * the most-preferred position of destinations. Then it is to select
 * destination with the hash key of source IP address through looking
 * up the lookup table.
 *
 * The algorithm is detailed in:
 * [3.4 Consistent Hashing]
 * https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
 */
20 #define KMSG_COMPONENT "IPVS"
21 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24 #include <linux/slab.h>
25 #include <linux/module.h>
26 #include <linux/kernel.h>
27 #include <linux/skbuff.h>
29 #include <net/ip_vs.h>
31 #include <linux/siphash.h>
32 #include <linux/bitops.h>
33 #include <linux/gcd.h>
35 #define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */
36 #define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */
38 struct ip_vs_mh_lookup {
39 struct ip_vs_dest __rcu *dest; /* real server (cache) */
/* Per-destination permutation state used while (re)populating the
 * lookup table (offset/skip as in the Maglev paper, 3.4).
 */
struct ip_vs_mh_dest_setup {
	unsigned int	offset;	/* starting offset */
	unsigned int	skip;	/* skip */
	unsigned int	perm;	/* next_offset */
	int		turns;	/* weight / gcd() and rshift */
};
/* Available prime numbers for MH table sizes; the table size must be
 * prime so that any skip value 1..size-1 visits every position.
 */
static int primes[] = {251, 509, 1021, 2039, 4093,
		       8191, 16381, 32749, 65521, 131071};

/* For IPVS MH entry hash table */
#ifndef CONFIG_IP_VS_MH_TAB_INDEX
#define CONFIG_IP_VS_MH_TAB_INDEX	12
#endif
#define IP_VS_MH_TAB_BITS		(CONFIG_IP_VS_MH_TAB_INDEX / 2)
#define IP_VS_MH_TAB_INDEX		(CONFIG_IP_VS_MH_TAB_INDEX - 8)
#define IP_VS_MH_TAB_SIZE		primes[IP_VS_MH_TAB_INDEX]
61 struct ip_vs_mh_state {
62 struct rcu_head rcu_head;
63 struct ip_vs_mh_lookup *lookup;
64 struct ip_vs_mh_dest_setup *dest_setup;
65 hsiphash_key_t hash1, hash2;
70 static inline void generate_hash_secret(hsiphash_key_t *hash1,
71 hsiphash_key_t *hash2)
73 hash1->key[0] = 2654435761UL;
74 hash1->key[1] = 2654435761UL;
76 hash2->key[0] = 2654446892UL;
77 hash2->key[1] = 2654446892UL;
80 /* Helper function to determine if server is unavailable */
81 static inline bool is_unavailable(struct ip_vs_dest *dest)
83 return atomic_read(&dest->weight) <= 0 ||
84 dest->flags & IP_VS_DEST_F_OVERLOAD;
87 /* Returns hash value for IPVS MH entry */
88 static inline unsigned int
89 ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
90 __be16 port, hsiphash_key_t *key, unsigned int offset)
93 __be32 addr_fold = addr->ip;
95 #ifdef CONFIG_IP_VS_IPV6
97 addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
98 addr->ip6[2] ^ addr->ip6[3];
100 v = (offset + ntohs(port) + ntohl(addr_fold));
101 return hsiphash(&v, sizeof(v), key);
104 /* Reset all the hash buckets of the specified table. */
105 static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
108 struct ip_vs_mh_lookup *l;
109 struct ip_vs_dest *dest;
112 for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
113 dest = rcu_dereference_protected(l->dest, 1);
115 ip_vs_dest_put(dest);
116 RCU_INIT_POINTER(l->dest, NULL);
122 static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
123 struct ip_vs_service *svc)
126 struct ip_vs_mh_dest_setup *ds;
127 struct ip_vs_dest *dest;
130 /* If gcd is smaller then 1, number of dests or
131 * all last_weight of dests are zero. So, skip
132 * permutation for the dests.
137 /* Set dest_setup for the dests permutation */
138 p = &svc->destinations;
139 ds = &s->dest_setup[0];
140 while ((p = p->next) != &svc->destinations) {
141 dest = list_entry(p, struct ip_vs_dest, n_list);
143 ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
144 dest->port, &s->hash1, 0) %
146 ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
147 dest->port, &s->hash2, 0) %
148 (IP_VS_MH_TAB_SIZE - 1) + 1;
149 ds->perm = ds->offset;
151 lw = atomic_read(&dest->last_weight);
152 ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
159 static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
160 struct ip_vs_service *svc)
163 unsigned long *table;
165 struct ip_vs_mh_dest_setup *ds;
166 struct ip_vs_dest *dest, *new_dest;
168 /* If gcd is smaller then 1, number of dests or
169 * all last_weight of dests are zero. So, skip
170 * the population for the dests and reset lookup table.
177 table = bitmap_zalloc(IP_VS_MH_TAB_SIZE, GFP_KERNEL);
181 p = &svc->destinations;
184 while (n < IP_VS_MH_TAB_SIZE) {
185 if (p == &svc->destinations)
188 ds = &s->dest_setup[0];
189 while (p != &svc->destinations) {
190 /* Ignore added server with zero weight */
198 while (test_bit(c, table)) {
199 /* Add skip, mod IP_VS_MH_TAB_SIZE */
200 ds->perm += ds->skip;
201 if (ds->perm >= IP_VS_MH_TAB_SIZE)
202 ds->perm -= IP_VS_MH_TAB_SIZE;
208 dest = rcu_dereference_protected(s->lookup[c].dest, 1);
209 new_dest = list_entry(p, struct ip_vs_dest, n_list);
210 if (dest != new_dest) {
212 ip_vs_dest_put(dest);
213 ip_vs_dest_hold(new_dest);
214 RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
217 if (++n == IP_VS_MH_TAB_SIZE)
220 if (++dt_count >= ds->turns) {
233 /* Get ip_vs_dest associated with supplied parameters. */
234 static inline struct ip_vs_dest *
235 ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
236 const union nf_inet_addr *addr, __be16 port)
238 unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
240 struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
242 return (!dest || is_unavailable(dest)) ? NULL : dest;
245 /* As ip_vs_mh_get, but with fallback if selected server is unavailable */
246 static inline struct ip_vs_dest *
247 ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
248 const union nf_inet_addr *addr, __be16 port)
250 unsigned int offset, roffset;
251 unsigned int hash, ihash;
252 struct ip_vs_dest *dest;
254 /* First try the dest it's supposed to go to */
255 ihash = ip_vs_mh_hashkey(svc->af, addr, port,
256 &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
257 dest = rcu_dereference(s->lookup[ihash].dest);
260 if (!is_unavailable(dest))
263 IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
264 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
266 /* If the original dest is unavailable, loop around the table
267 * starting from ihash to find a new dest
269 for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
270 roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
271 hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
272 roffset) % IP_VS_MH_TAB_SIZE;
273 dest = rcu_dereference(s->lookup[hash].dest);
276 if (!is_unavailable(dest))
279 "MH: selected unavailable server %s:%u (offset %u), reselecting",
280 IP_VS_DBG_ADDR(dest->af, &dest->addr),
281 ntohs(dest->port), roffset);
287 /* Assign all the hash buckets of the specified table with the service. */
288 static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
289 struct ip_vs_service *svc)
293 if (svc->num_dests > IP_VS_MH_TAB_SIZE)
296 if (svc->num_dests >= 1) {
297 s->dest_setup = kcalloc(svc->num_dests,
298 sizeof(struct ip_vs_mh_dest_setup),
304 ip_vs_mh_permutate(s, svc);
306 ret = ip_vs_mh_populate(s, svc);
310 IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
311 IP_VS_DBG_ADDR(svc->af, &svc->addr),
315 if (svc->num_dests >= 1) {
316 kfree(s->dest_setup);
317 s->dest_setup = NULL;
322 static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
324 struct ip_vs_dest *dest;
328 list_for_each_entry(dest, &svc->destinations, n_list) {
329 weight = atomic_read(&dest->last_weight);
340 /* To avoid assigning huge weight for the MH table,
341 * calculate shift value with gcd.
343 static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
345 struct ip_vs_dest *dest;
346 int new_weight, weight = 0;
349 /* If gcd is smaller then 1, number of dests or
350 * all last_weight of dests are zero. So, return
351 * shift value as zero.
356 list_for_each_entry(dest, &svc->destinations, n_list) {
357 new_weight = atomic_read(&dest->last_weight);
358 if (new_weight > weight)
362 /* Because gcd is greater than zero,
363 * the maximum weight and gcd are always greater than zero
367 /* shift = occupied bits of weight/gcd - MH highest bits */
368 shift = fls(mw) - IP_VS_MH_TAB_BITS;
369 return (shift >= 0) ? shift : 0;
372 static void ip_vs_mh_state_free(struct rcu_head *head)
374 struct ip_vs_mh_state *s;
376 s = container_of(head, struct ip_vs_mh_state, rcu_head);
381 static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
384 struct ip_vs_mh_state *s;
386 /* Allocate the MH table for this service */
387 s = kzalloc(sizeof(*s), GFP_KERNEL);
391 s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
398 generate_hash_secret(&s->hash1, &s->hash2);
399 s->gcd = ip_vs_mh_gcd_weight(svc);
400 s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
403 "MH lookup table (memory=%zdbytes) allocated for current service\n",
404 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
406 /* Assign the lookup table with current dests */
407 ret = ip_vs_mh_reassign(s, svc);
410 ip_vs_mh_state_free(&s->rcu_head);
414 /* No more failures, attach state */
419 static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
421 struct ip_vs_mh_state *s = svc->sched_data;
423 /* Got to clean up lookup entry here */
426 call_rcu(&s->rcu_head, ip_vs_mh_state_free);
427 IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
428 sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
431 static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
432 struct ip_vs_dest *dest)
434 struct ip_vs_mh_state *s = svc->sched_data;
436 s->gcd = ip_vs_mh_gcd_weight(svc);
437 s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
439 /* Assign the lookup table with the updated service */
440 return ip_vs_mh_reassign(s, svc);
443 /* Helper function to get port number */
445 ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
447 __be16 _ports[2], *ports;
449 /* At this point we know that we have a valid packet of some kind.
450 * Because ICMP packets are only guaranteed to have the first 8
451 * bytes, let's just grab the ports. Fortunately they're in the
452 * same position for all three of the protocols we care about.
454 switch (iph->protocol) {
458 ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
460 if (unlikely(!ports))
463 if (likely(!ip_vs_iph_inverse(iph)))
472 /* Maglev Hashing scheduling */
473 static struct ip_vs_dest *
474 ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
475 struct ip_vs_iphdr *iph)
477 struct ip_vs_dest *dest;
478 struct ip_vs_mh_state *s;
480 const union nf_inet_addr *hash_addr;
482 hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
484 IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
486 if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
487 port = ip_vs_mh_get_port(skb, iph);
489 s = (struct ip_vs_mh_state *)svc->sched_data;
491 if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
492 dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
494 dest = ip_vs_mh_get(svc, s, hash_addr, port);
497 ip_vs_scheduler_err(svc, "no destination available");
501 IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
502 IP_VS_DBG_ADDR(svc->af, hash_addr),
504 IP_VS_DBG_ADDR(dest->af, &dest->addr),
510 /* IPVS MH Scheduler structure */
511 static struct ip_vs_scheduler ip_vs_mh_scheduler = {
513 .refcnt = ATOMIC_INIT(0),
514 .module = THIS_MODULE,
515 .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
516 .init_service = ip_vs_mh_init_svc,
517 .done_service = ip_vs_mh_done_svc,
518 .add_dest = ip_vs_mh_dest_changed,
519 .del_dest = ip_vs_mh_dest_changed,
520 .upd_dest = ip_vs_mh_dest_changed,
521 .schedule = ip_vs_mh_schedule,
524 static int __init ip_vs_mh_init(void)
526 return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
529 static void __exit ip_vs_mh_cleanup(void)
531 unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
535 module_init(ip_vs_mh_init);
536 module_exit(ip_vs_mh_cleanup);
537 MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
538 MODULE_LICENSE("GPL v2");
539 MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");