net/xdp/xsk.c (GNU Linux-libre 6.9.1-gnu)
1 // SPDX-License-Identifier: GPL-2.0
2 /* XDP sockets
3  *
4  * AF_XDP sockets provide a channel between XDP programs and userspace
5  * applications.
6  * Copyright(c) 2018 Intel Corporation.
7  *
8  * Author(s): Björn Töpel <bjorn.topel@intel.com>
9  *            Magnus Karlsson <magnus.karlsson@intel.com>
10  */
11
12 #define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__
13
14 #include <linux/if_xdp.h>
15 #include <linux/init.h>
16 #include <linux/sched/mm.h>
17 #include <linux/sched/signal.h>
18 #include <linux/sched/task.h>
19 #include <linux/socket.h>
20 #include <linux/file.h>
21 #include <linux/uaccess.h>
22 #include <linux/net.h>
23 #include <linux/netdevice.h>
24 #include <linux/rculist.h>
25 #include <linux/vmalloc.h>
26 #include <net/xdp_sock_drv.h>
27 #include <net/busy_poll.h>
28 #include <net/netdev_rx_queue.h>
29 #include <net/xdp.h>
30
31 #include "xsk_queue.h"
32 #include "xdp_umem.h"
33 #include "xsk.h"
34
35 #define TX_BATCH_SIZE 32
36 #define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)
37
38 static DEFINE_PER_CPU(struct list_head, xskmap_flush_list);
39
40 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
41 {
42         if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
43                 return;
44
45         pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
46         pool->cached_need_wakeup |= XDP_WAKEUP_RX;
47 }
48 EXPORT_SYMBOL(xsk_set_rx_need_wakeup);
49
50 void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
51 {
52         struct xdp_sock *xs;
53
54         if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
55                 return;
56
57         rcu_read_lock();
58         list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
59                 xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
60         }
61         rcu_read_unlock();
62
63         pool->cached_need_wakeup |= XDP_WAKEUP_TX;
64 }
65 EXPORT_SYMBOL(xsk_set_tx_need_wakeup);
66
67 void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
68 {
69         if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX))
70                 return;
71
72         pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
73         pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
74 }
75 EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);
76
77 void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
78 {
79         struct xdp_sock *xs;
80
81         if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
82                 return;
83
84         rcu_read_lock();
85         list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
86                 xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
87         }
88         rcu_read_unlock();
89
90         pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
91 }
92 EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);
93
94 bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
95 {
96         return pool->uses_need_wakeup;
97 }
98 EXPORT_SYMBOL(xsk_uses_need_wakeup);
99
100 struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
101                                             u16 queue_id)
102 {
103         if (queue_id < dev->real_num_rx_queues)
104                 return dev->_rx[queue_id].pool;
105         if (queue_id < dev->real_num_tx_queues)
106                 return dev->_tx[queue_id].pool;
107
108         return NULL;
109 }
110 EXPORT_SYMBOL(xsk_get_pool_from_qid);
111
112 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
113 {
114         if (queue_id < dev->num_rx_queues)
115                 dev->_rx[queue_id].pool = NULL;
116         if (queue_id < dev->num_tx_queues)
117                 dev->_tx[queue_id].pool = NULL;
118 }
119
120 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do
121  * not know if the device has more tx queues than rx, or the opposite.
122  * This might also change during run time.
123  */
124 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
125                         u16 queue_id)
126 {
127         if (queue_id >= max_t(unsigned int,
128                               dev->real_num_rx_queues,
129                               dev->real_num_tx_queues))
130                 return -EINVAL;
131
132         if (queue_id < dev->real_num_rx_queues)
133                 dev->_rx[queue_id].pool = pool;
134         if (queue_id < dev->real_num_tx_queues)
135                 dev->_tx[queue_id].pool = pool;
136
137         return 0;
138 }
139
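/* Post a single buffer on the Rx ring. If the ring is full, account it in
 * rx_queue_full and return an error so the caller can free the buffer;
 * on success the xskb handle is released via xp_release().
 */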
140 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
141                         u32 flags)
142 {
143         u64 addr;
144         int err;
145
146         addr = xp_get_handle(xskb);
147         err = xskq_prod_reserve_desc(xs->rx, addr, len, flags);
148         if (err) {
149                 xs->rx_queue_full++;
150                 return err;
151         }
152
153         xp_release(xskb);
154         return 0;
155 }
156
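/* Zero-copy receive of a (possibly multi-buffer) packet: post the head
 * buffer first, then every fragment queued on the pool's xskb_list, with
 * XDP_PKT_CONTD set on all descriptors but the last. On any failure the
 * whole buffer chain is freed.
 */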
157 static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
158 {
159         struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
160         u32 frags = xdp_buff_has_frags(xdp);
161         struct xdp_buff_xsk *pos, *tmp;
162         struct list_head *xskb_list;
163         u32 contd = 0;
164         int err;
165
166         if (frags)
167                 contd = XDP_PKT_CONTD;
168
169         err = __xsk_rcv_zc(xs, xskb, len, contd);
170         if (err)
171                 goto err;
172         if (likely(!frags))
173                 return 0;
174
175         xskb_list = &xskb->pool->xskb_list;
176         list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
177                 if (list_is_singular(xskb_list))
178                         contd = 0;
179                 len = pos->xdp.data_end - pos->xdp.data;
180                 err = __xsk_rcv_zc(xs, pos, len, contd);
181                 if (err)
182                         goto err;
183                 list_del(&pos->xskb_list_node);
184         }
185
186         return 0;
187 err:
188         xsk_buff_free(xdp);
189         return err;
190 }
191
192 static void *xsk_copy_xdp_start(struct xdp_buff *from)
193 {
194         if (unlikely(xdp_data_meta_unsupported(from)))
195                 return from->data;
196         else
197                 return from->data_meta;
198 }
199
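/* Copy up to to_len bytes into one destination buffer, advancing to the
 * next source fragment whenever the current one is exhausted. Returns the
 * number of bytes copied; stops early once rem bytes have been copied in
 * total.
 */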
200 static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
201                         u32 *from_len, skb_frag_t **frag, u32 rem)
202 {
203         u32 copied = 0;
204
205         while (1) {
206                 u32 copy_len = min_t(u32, *from_len, to_len);
207
208                 memcpy(to, *from, copy_len);
209                 copied += copy_len;
210                 if (rem == copied)
211                         return copied;
212
213                 if (*from_len == copy_len) {
214                         *from = skb_frag_address(*frag);
215                         *from_len = skb_frag_size((*frag)++);
216                 } else {
217                         *from += copy_len;
218                         *from_len -= copy_len;
219                 }
220                 if (to_len == copy_len)
221                         return copied;
222
223                 to_len -= copy_len;
224                 to += copy_len;
225         }
226 }
227
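/* Copy-mode receive: copy the packet (including metadata) into buffers
 * allocated from the pool and post them on the Rx ring. Packets that do
 * not fit in a single frame are split over several descriptors with
 * XDP_PKT_CONTD set on all but the last one.
 */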
228 static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
229 {
230         u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
231         void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
232         u32 from_len, meta_len, rem, num_desc;
233         struct xdp_buff_xsk *xskb;
234         struct xdp_buff *xsk_xdp;
235         skb_frag_t *frag;
236
237         from_len = xdp->data_end - copy_from;
238         meta_len = xdp->data - copy_from;
239         rem = len + meta_len;
240
241         if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
242                 int err;
243
244                 xsk_xdp = xsk_buff_alloc(xs->pool);
245                 if (!xsk_xdp) {
246                         xs->rx_dropped++;
247                         return -ENOMEM;
248                 }
249                 memcpy(xsk_xdp->data - meta_len, copy_from, rem);
250                 xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
251                 err = __xsk_rcv_zc(xs, xskb, len, 0);
252                 if (err) {
253                         xsk_buff_free(xsk_xdp);
254                         return err;
255                 }
256
257                 return 0;
258         }
259
260         num_desc = (len - 1) / frame_size + 1;
261
262         if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
263                 xs->rx_dropped++;
264                 return -ENOMEM;
265         }
266         if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
267                 xs->rx_queue_full++;
268                 return -ENOBUFS;
269         }
270
271         if (xdp_buff_has_frags(xdp)) {
272                 struct skb_shared_info *sinfo;
273
274                 sinfo = xdp_get_shared_info_from_buff(xdp);
275                 frag = &sinfo->frags[0];
276         }
277
278         do {
279                 u32 to_len = frame_size + meta_len;
280                 u32 copied;
281
282                 xsk_xdp = xsk_buff_alloc(xs->pool);
283                 copy_to = xsk_xdp->data - meta_len;
284
285                 copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
286                 rem -= copied;
287
288                 xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
289                 __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
290                 meta_len = 0;
291         } while (rem);
292
293         return 0;
294 }
295
296 static bool xsk_tx_writeable(struct xdp_sock *xs)
297 {
298         if (xskq_cons_present_entries(xs->tx) > xs->tx->nentries / 2)
299                 return false;
300
301         return true;
302 }
303
304 static bool xsk_is_bound(struct xdp_sock *xs)
305 {
306         if (READ_ONCE(xs->state) == XSK_BOUND) {
307                 /* Matches smp_wmb() in bind(). */
308                 smp_rmb();
309                 return true;
310         }
311         return false;
312 }
313
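/* Checks performed before accepting a packet: the socket must be bound,
 * the pool must belong to the receiving device/queue, and the frame must
 * fit in one Rx buffer unless multi-buffer (XDP_USE_SG) is enabled. Also
 * records the NAPI id once for busy polling.
 */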
314 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
315 {
316         struct net_device *dev = xdp->rxq->dev;
317         u32 qid = xdp->rxq->queue_index;
318
319         if (!xsk_is_bound(xs))
320                 return -ENXIO;
321
322         if (!dev->_rx[qid].pool || xs->umem != dev->_rx[qid].pool->umem)
323                 return -EINVAL;
324
325         if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
326                 xs->rx_dropped++;
327                 return -ENOSPC;
328         }
329
330         sk_mark_napi_id_once_xdp(&xs->sk, xdp);
331         return 0;
332 }
333
334 static void xsk_flush(struct xdp_sock *xs)
335 {
336         xskq_prod_submit(xs->rx);
337         __xskq_cons_release(xs->pool->fq);
338         sock_def_readable(&xs->sk);
339 }
340
341 int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
342 {
343         u32 len = xdp_get_buff_len(xdp);
344         int err;
345
346         spin_lock_bh(&xs->rx_lock);
347         err = xsk_rcv_check(xs, xdp, len);
348         if (!err) {
349                 err = __xsk_rcv(xs, xdp, len);
350                 xsk_flush(xs);
351         }
352         spin_unlock_bh(&xs->rx_lock);
353         return err;
354 }
355
356 static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
357 {
358         u32 len = xdp_get_buff_len(xdp);
359         int err;
360
361         err = xsk_rcv_check(xs, xdp, len);
362         if (err)
363                 return err;
364
365         if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) {
366                 len = xdp->data_end - xdp->data;
367                 return xsk_rcv_zc(xs, xdp, len);
368         }
369
370         err = __xsk_rcv(xs, xdp, len);
371         if (!err)
372                 xdp_return_buff(xdp);
373         return err;
374 }
375
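/* XSKMAP redirect entry point: receive the packet and, if not already
 * queued, put the socket on the per-cpu flush list so that Rx ring
 * submission and the readable notification are batched in
 * __xsk_map_flush().
 */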
376 int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
377 {
378         struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
379         int err;
380
381         err = xsk_rcv(xs, xdp);
382         if (err)
383                 return err;
384
385         if (!xs->flush_node.prev)
386                 list_add(&xs->flush_node, flush_list);
387
388         return 0;
389 }
390
391 void __xsk_map_flush(void)
392 {
393         struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list);
394         struct xdp_sock *xs, *tmp;
395
396         list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
397                 xsk_flush(xs);
398                 __list_del_clearprev(&xs->flush_node);
399         }
400 }
401
402 #ifdef CONFIG_DEBUG_NET
403 bool xsk_map_check_flush(void)
404 {
405         if (list_empty(this_cpu_ptr(&xskmap_flush_list)))
406                 return false;
407         __xsk_map_flush();
408         return true;
409 }
410 #endif
411
412 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
413 {
414         xskq_prod_submit_n(pool->cq, nb_entries);
415 }
416 EXPORT_SYMBOL(xsk_tx_completed);
417
418 void xsk_tx_release(struct xsk_buff_pool *pool)
419 {
420         struct xdp_sock *xs;
421
422         rcu_read_lock();
423         list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
424                 __xskq_cons_release(xs->tx);
425                 if (xsk_tx_writeable(xs))
426                         xs->sk.sk_write_space(&xs->sk);
427         }
428         rcu_read_unlock();
429 }
430 EXPORT_SYMBOL(xsk_tx_release);
431
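/* Exported helper for zero-copy drivers: fetch the next Tx descriptor.
 * Sockets sharing the pool are serviced round-robin with a per-socket
 * budget (MAX_PER_SOCKET_BUDGET); once every socket has used up its
 * budget, the budgets are reset and the scan is retried.
 */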
432 bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
433 {
434         bool budget_exhausted = false;
435         struct xdp_sock *xs;
436
437         rcu_read_lock();
438 again:
439         list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
440                 if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) {
441                         budget_exhausted = true;
442                         continue;
443                 }
444
445                 if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
446                         if (xskq_has_descs(xs->tx))
447                                 xskq_cons_release(xs->tx);
448                         continue;
449                 }
450
451                 xs->tx_budget_spent++;
452
453                 /* This is the backpressure mechanism for the Tx path.
454                  * Reserve space in the completion queue and only proceed
455                  * if there is space in it. This avoids having to implement
456                  * any buffering in the Tx path.
457                  */
458                 if (xskq_prod_reserve_addr(pool->cq, desc->addr))
459                         goto out;
460
461                 xskq_cons_release(xs->tx);
462                 rcu_read_unlock();
463                 return true;
464         }
465
466         if (budget_exhausted) {
467                 list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
468                         xs->tx_budget_spent = 0;
469
470                 budget_exhausted = false;
471                 goto again;
472         }
473
474 out:
475         rcu_read_unlock();
476         return false;
477 }
478 EXPORT_SYMBOL(xsk_tx_peek_desc);
479
480 static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
481 {
482         struct xdp_desc *descs = pool->tx_descs;
483         u32 nb_pkts = 0;
484
485         while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts]))
486                 nb_pkts++;
487
488         xsk_tx_release(pool);
489         return nb_pkts;
490 }
491
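/* Batched variant of xsk_tx_peek_desc(): when exactly one socket uses the
 * pool, peek up to nb_pkts descriptors at once, bounded by the free space
 * in the completion queue; otherwise fall back to the one-at-a-time path.
 */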
492 u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
493 {
494         struct xdp_sock *xs;
495
496         rcu_read_lock();
497         if (!list_is_singular(&pool->xsk_tx_list)) {
498                 /* Fallback to the non-batched version */
499                 rcu_read_unlock();
500                 return xsk_tx_peek_release_fallback(pool, nb_pkts);
501         }
502
503         xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
504         if (!xs) {
505                 nb_pkts = 0;
506                 goto out;
507         }
508
509         nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);
510
511         /* This is the backpressure mechanism for the Tx path. Try to
512          * reserve space in the completion queue for all packets, but
513          * if there are fewer slots available, just process that many
514          * packets. This avoids having to implement any buffering in
515          * the Tx path.
516          */
517         nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
518         if (!nb_pkts)
519                 goto out;
520
521         nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
522         if (!nb_pkts) {
523                 xs->tx->queue_empty_descs++;
524                 goto out;
525         }
526
527         __xskq_cons_release(xs->tx);
528         xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
529         xs->sk.sk_write_space(&xs->sk);
530
531 out:
532         rcu_read_unlock();
533         return nb_pkts;
534 }
535 EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);
536
537 static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
538 {
539         struct net_device *dev = xs->dev;
540
541         return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
542 }
543
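/* Completion queue helpers for the copy path. The cq producer side can
 * also be touched from the skb destructor (possibly in interrupt context),
 * so reserve/submit/cancel are serialized with pool->cq_lock.
 */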
544 static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr)
545 {
546         unsigned long flags;
547         int ret;
548
549         spin_lock_irqsave(&xs->pool->cq_lock, flags);
550         ret = xskq_prod_reserve_addr(xs->pool->cq, addr);
551         spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
552
553         return ret;
554 }
555
556 static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n)
557 {
558         unsigned long flags;
559
560         spin_lock_irqsave(&xs->pool->cq_lock, flags);
561         xskq_prod_submit_n(xs->pool->cq, n);
562         spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
563 }
564
565 static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n)
566 {
567         unsigned long flags;
568
569         spin_lock_irqsave(&xs->pool->cq_lock, flags);
570         xskq_prod_cancel_n(xs->pool->cq, n);
571         spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
572 }
573
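/* The number of Tx descriptors backing an skb is stashed in
 * skb_shinfo(skb)->destructor_arg, so that the destructor knows how many
 * completions to submit, or how many to cancel if the skb is dropped.
 */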
574 static u32 xsk_get_num_desc(struct sk_buff *skb)
575 {
576         return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
577 }
578
579 static void xsk_destruct_skb(struct sk_buff *skb)
580 {
581         struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
582
583         if (compl->tx_timestamp) {
584                 /* sw completion timestamp, not a real one */
585                 *compl->tx_timestamp = ktime_get_tai_fast_ns();
586         }
587
588         xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb));
589         sock_wfree(skb);
590 }
591
592 static void xsk_set_destructor_arg(struct sk_buff *skb)
593 {
594         long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
595
596         skb_shinfo(skb)->destructor_arg = (void *)num;
597 }
598
599 static void xsk_consume_skb(struct sk_buff *skb)
600 {
601         struct xdp_sock *xs = xdp_sk(skb->sk);
602
603         skb->destructor = sock_wfree;
604         xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb));
605         /* Free skb without triggering the perf drop trace */
606         consume_skb(skb);
607         xs->skb = NULL;
608 }
609
610 static void xsk_drop_skb(struct sk_buff *skb)
611 {
612         xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
613         xsk_consume_skb(skb);
614 }
615
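/* For devices with IFF_TX_SKB_NO_LINEAR: build an skb whose page frags
 * point straight into the umem pages, so the payload is not copied.
 * truesize is accounted using the chunk size (or the descriptor length
 * for unaligned mode).
 */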
616 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
617                                               struct xdp_desc *desc)
618 {
619         struct xsk_buff_pool *pool = xs->pool;
620         u32 hr, len, ts, offset, copy, copied;
621         struct sk_buff *skb = xs->skb;
622         struct page *page;
623         void *buffer;
624         int err, i;
625         u64 addr;
626
627         if (!skb) {
628                 hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
629
630                 skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
631                 if (unlikely(!skb))
632                         return ERR_PTR(err);
633
634                 skb_reserve(skb, hr);
635         }
636
637         addr = desc->addr;
638         len = desc->len;
639         ts = pool->unaligned ? len : pool->chunk_size;
640
641         buffer = xsk_buff_raw_get_data(pool, addr);
642         offset = offset_in_page(buffer);
643         addr = buffer - pool->addrs;
644
645         for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
646                 if (unlikely(i >= MAX_SKB_FRAGS))
647                         return ERR_PTR(-EOVERFLOW);
648
649                 page = pool->umem->pgs[addr >> PAGE_SHIFT];
650                 get_page(page);
651
652                 copy = min_t(u32, PAGE_SIZE - offset, len - copied);
653                 skb_fill_page_desc(skb, i, page, offset, copy);
654
655                 copied += copy;
656                 addr += copy;
657                 offset = 0;
658         }
659
660         skb->len += len;
661         skb->data_len += len;
662         skb->truesize += ts;
663
664         refcount_add(ts, &xs->sk.sk_wmem_alloc);
665
666         return skb;
667 }
668
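/* Build an skb for the copy path. The first descriptor of a packet is
 * copied into the linear area (or handed to the zero-copy builder above
 * for IFF_TX_SKB_NO_LINEAR devices); subsequent descriptors of a
 * multi-buffer packet are copied into newly allocated page frags. Tx
 * metadata, such as a checksum-offload request, is only honoured on the
 * first fragment. -EOVERFLOW drops the whole packet, other errors leave
 * the descriptor for the application to retry.
 */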
669 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
670                                      struct xdp_desc *desc)
671 {
672         struct xsk_tx_metadata *meta = NULL;
673         struct net_device *dev = xs->dev;
674         struct sk_buff *skb = xs->skb;
675         bool first_frag = false;
676         int err;
677
678         if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
679                 skb = xsk_build_skb_zerocopy(xs, desc);
680                 if (IS_ERR(skb)) {
681                         err = PTR_ERR(skb);
682                         goto free_err;
683                 }
684         } else {
685                 u32 hr, tr, len;
686                 void *buffer;
687
688                 buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
689                 len = desc->len;
690
691                 if (!skb) {
692                         hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
693                         tr = dev->needed_tailroom;
694                         skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
695                         if (unlikely(!skb))
696                                 goto free_err;
697
698                         skb_reserve(skb, hr);
699                         skb_put(skb, len);
700
701                         err = skb_store_bits(skb, 0, buffer, len);
702                         if (unlikely(err)) {
703                                 kfree_skb(skb);
704                                 goto free_err;
705                         }
706
707                         first_frag = true;
708                 } else {
709                         int nr_frags = skb_shinfo(skb)->nr_frags;
710                         struct page *page;
711                         u8 *vaddr;
712
713                         if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
714                                 err = -EOVERFLOW;
715                                 goto free_err;
716                         }
717
718                         page = alloc_page(xs->sk.sk_allocation);
719                         if (unlikely(!page)) {
720                                 err = -EAGAIN;
721                                 goto free_err;
722                         }
723
724                         vaddr = kmap_local_page(page);
725                         memcpy(vaddr, buffer, len);
726                         kunmap_local(vaddr);
727
728                         skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
729                         refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
730                 }
731
732                 if (first_frag && desc->options & XDP_TX_METADATA) {
733                         if (unlikely(xs->pool->tx_metadata_len == 0)) {
734                                 err = -EINVAL;
735                                 goto free_err;
736                         }
737
738                         meta = buffer - xs->pool->tx_metadata_len;
739                         if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
740                                 err = -EINVAL;
741                                 goto free_err;
742                         }
743
744                         if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
745                                 if (unlikely(meta->request.csum_start +
746                                              meta->request.csum_offset +
747                                              sizeof(__sum16) > len)) {
748                                         err = -EINVAL;
749                                         goto free_err;
750                                 }
751
752                                 skb->csum_start = hr + meta->request.csum_start;
753                                 skb->csum_offset = meta->request.csum_offset;
754                                 skb->ip_summed = CHECKSUM_PARTIAL;
755
756                                 if (unlikely(xs->pool->tx_sw_csum)) {
757                                         err = skb_checksum_help(skb);
758                                         if (err)
759                                                 goto free_err;
760                                 }
761                         }
762                 }
763         }
764
765         skb->dev = dev;
766         skb->priority = READ_ONCE(xs->sk.sk_priority);
767         skb->mark = READ_ONCE(xs->sk.sk_mark);
768         skb->destructor = xsk_destruct_skb;
769         xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
770         xsk_set_destructor_arg(skb);
771
772         return skb;
773
774 free_err:
775         if (err == -EOVERFLOW) {
776                 /* Drop the packet */
777                 xsk_set_destructor_arg(xs->skb);
778                 xsk_drop_skb(xs->skb);
779                 xskq_cons_release(xs->tx);
780         } else {
781                 /* Let application retry */
782                 xsk_cq_cancel_locked(xs, 1);
783         }
784
785         return ERR_PTR(err);
786 }
787
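/* Copy-mode transmit: drain up to TX_BATCH_SIZE descriptors from the Tx
 * ring, reserve completion queue space for each one, build skbs and send
 * them with __dev_direct_xmit(). Descriptors of a multi-buffer packet are
 * accumulated in xs->skb until the last one arrives.
 */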
788 static int __xsk_generic_xmit(struct sock *sk)
789 {
790         struct xdp_sock *xs = xdp_sk(sk);
791         u32 max_batch = TX_BATCH_SIZE;
792         bool sent_frame = false;
793         struct xdp_desc desc;
794         struct sk_buff *skb;
795         int err = 0;
796
797         mutex_lock(&xs->mutex);
798
799         /* Since we dropped the RCU read lock, the socket state might have changed. */
800         if (unlikely(!xsk_is_bound(xs))) {
801                 err = -ENXIO;
802                 goto out;
803         }
804
805         if (xs->queue_id >= xs->dev->real_num_tx_queues)
806                 goto out;
807
808         while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
809                 if (max_batch-- == 0) {
810                         err = -EAGAIN;
811                         goto out;
812                 }
813
814                 /* This is the backpressure mechanism for the Tx path.
815                  * Reserve space in the completion queue and only proceed
816                  * if there is space in it. This avoids having to implement
817                  * any buffering in the Tx path.
818                  */
819                 if (xsk_cq_reserve_addr_locked(xs, desc.addr))
820                         goto out;
821
822                 skb = xsk_build_skb(xs, &desc);
823                 if (IS_ERR(skb)) {
824                         err = PTR_ERR(skb);
825                         if (err != -EOVERFLOW)
826                                 goto out;
827                         err = 0;
828                         continue;
829                 }
830
831                 xskq_cons_release(xs->tx);
832
833                 if (xp_mb_desc(&desc)) {
834                         xs->skb = skb;
835                         continue;
836                 }
837
838                 err = __dev_direct_xmit(skb, xs->queue_id);
839                 if (err == NETDEV_TX_BUSY) {
840                         /* Tell user-space to retry the send */
841                         xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
842                         xsk_consume_skb(skb);
843                         err = -EAGAIN;
844                         goto out;
845                 }
846
847                 /* Ignore NET_XMIT_CN as the packet might have been sent */
848                 if (err == NET_XMIT_DROP) {
849                         /* SKB completed but not sent */
850                         err = -EBUSY;
851                         xs->skb = NULL;
852                         goto out;
853                 }
854
855                 sent_frame = true;
856                 xs->skb = NULL;
857         }
858
859         if (xskq_has_descs(xs->tx)) {
860                 if (xs->skb)
861                         xsk_drop_skb(xs->skb);
862                 xskq_cons_release(xs->tx);
863         }
864
865 out:
866         if (sent_frame)
867                 if (xsk_tx_writeable(xs))
868                         sk->sk_write_space(sk);
869
870         mutex_unlock(&xs->mutex);
871         return err;
872 }
873
874 static int xsk_generic_xmit(struct sock *sk)
875 {
876         int ret;
877
878         /* Drop the RCU lock since the SKB path might sleep. */
879         rcu_read_unlock();
880         ret = __xsk_generic_xmit(sk);
881         /* Reacquire RCU lock before going into common code. */
882         rcu_read_lock();
883
884         return ret;
885 }
886
887 static bool xsk_no_wakeup(struct sock *sk)
888 {
889 #ifdef CONFIG_NET_RX_BUSY_POLL
890         /* Prefer busy-polling, skip the wakeup. */
891         return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
892                 READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID;
893 #else
894         return false;
895 #endif
896 }
897
898 static int xsk_check_common(struct xdp_sock *xs)
899 {
900         if (unlikely(!xsk_is_bound(xs)))
901                 return -ENXIO;
902         if (unlikely(!(xs->dev->flags & IFF_UP)))
903                 return -ENETDOWN;
904
905         return 0;
906 }
907
908 static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
909 {
910         bool need_wait = !(m->msg_flags & MSG_DONTWAIT);
911         struct sock *sk = sock->sk;
912         struct xdp_sock *xs = xdp_sk(sk);
913         struct xsk_buff_pool *pool;
914         int err;
915
916         err = xsk_check_common(xs);
917         if (err)
918                 return err;
919         if (unlikely(need_wait))
920                 return -EOPNOTSUPP;
921         if (unlikely(!xs->tx))
922                 return -ENOBUFS;
923
924         if (sk_can_busy_loop(sk)) {
925                 if (xs->zc)
926                         __sk_mark_napi_id_once(sk, xsk_pool_get_napi_id(xs->pool));
927                 sk_busy_loop(sk, 1); /* only support non-blocking sockets */
928         }
929
930         if (xs->zc && xsk_no_wakeup(sk))
931                 return 0;
932
933         pool = xs->pool;
934         if (pool->cached_need_wakeup & XDP_WAKEUP_TX) {
935                 if (xs->zc)
936                         return xsk_wakeup(xs, XDP_WAKEUP_TX);
937                 return xsk_generic_xmit(sk);
938         }
939         return 0;
940 }
941
942 static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
943 {
944         int ret;
945
946         rcu_read_lock();
947         ret = __xsk_sendmsg(sock, m, total_len);
948         rcu_read_unlock();
949
950         return ret;
951 }
952
953 static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
954 {
955         bool need_wait = !(flags & MSG_DONTWAIT);
956         struct sock *sk = sock->sk;
957         struct xdp_sock *xs = xdp_sk(sk);
958         int err;
959
960         err = xsk_check_common(xs);
961         if (err)
962                 return err;
963         if (unlikely(!xs->rx))
964                 return -ENOBUFS;
965         if (unlikely(need_wait))
966                 return -EOPNOTSUPP;
967
968         if (sk_can_busy_loop(sk))
969                 sk_busy_loop(sk, 1); /* only support non-blocking sockets */
970
971         if (xsk_no_wakeup(sk))
972                 return 0;
973
974         if (xs->pool->cached_need_wakeup & XDP_WAKEUP_RX && xs->zc)
975                 return xsk_wakeup(xs, XDP_WAKEUP_RX);
976         return 0;
977 }
978
979 static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
980 {
981         int ret;
982
983         rcu_read_lock();
984         ret = __xsk_recvmsg(sock, m, len, flags);
985         rcu_read_unlock();
986
987         return ret;
988 }
989
990 static __poll_t xsk_poll(struct file *file, struct socket *sock,
991                              struct poll_table_struct *wait)
992 {
993         __poll_t mask = 0;
994         struct sock *sk = sock->sk;
995         struct xdp_sock *xs = xdp_sk(sk);
996         struct xsk_buff_pool *pool;
997
998         sock_poll_wait(file, sock, wait);
999
1000         rcu_read_lock();
1001         if (xsk_check_common(xs))
1002                 goto out;
1003
1004         pool = xs->pool;
1005
1006         if (pool->cached_need_wakeup) {
1007                 if (xs->zc)
1008                         xsk_wakeup(xs, pool->cached_need_wakeup);
1009                 else if (xs->tx)
1010                         /* Poll needs to drive Tx also in copy mode */
1011                         xsk_generic_xmit(sk);
1012         }
1013
1014         if (xs->rx && !xskq_prod_is_empty(xs->rx))
1015                 mask |= EPOLLIN | EPOLLRDNORM;
1016         if (xs->tx && xsk_tx_writeable(xs))
1017                 mask |= EPOLLOUT | EPOLLWRNORM;
1018 out:
1019         rcu_read_unlock();
1020         return mask;
1021 }
1022
1023 static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
1024                           bool umem_queue)
1025 {
1026         struct xsk_queue *q;
1027
1028         if (entries == 0 || *queue || !is_power_of_2(entries))
1029                 return -EINVAL;
1030
1031         q = xskq_create(entries, umem_queue);
1032         if (!q)
1033                 return -ENOMEM;
1034
1035         /* Make sure queue is ready before it can be seen by others */
1036         smp_wmb();
1037         WRITE_ONCE(*queue, q);
1038         return 0;
1039 }
1040
1041 static void xsk_unbind_dev(struct xdp_sock *xs)
1042 {
1043         struct net_device *dev = xs->dev;
1044
1045         if (xs->state != XSK_BOUND)
1046                 return;
1047         WRITE_ONCE(xs->state, XSK_UNBOUND);
1048
1049         /* Wait for driver to stop using the xdp socket. */
1050         xp_del_xsk(xs->pool, xs);
1051         synchronize_net();
1052         dev_put(dev);
1053 }
1054
1055 static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
1056                                               struct xdp_sock __rcu ***map_entry)
1057 {
1058         struct xsk_map *map = NULL;
1059         struct xsk_map_node *node;
1060
1061         *map_entry = NULL;
1062
1063         spin_lock_bh(&xs->map_list_lock);
1064         node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
1065                                         node);
1066         if (node) {
1067                 bpf_map_inc(&node->map->map);
1068                 map = node->map;
1069                 *map_entry = node->map_entry;
1070         }
1071         spin_unlock_bh(&xs->map_list_lock);
1072         return map;
1073 }
1074
1075 static void xsk_delete_from_maps(struct xdp_sock *xs)
1076 {
1077         /* This function removes the current XDP socket from all the
1078          * maps it resides in. We need to take extra care here, due to
1079          * the two locks involved. Each map has a lock synchronizing
1080          * updates to the entries, and each socket has a lock that
1081          * synchronizes access to the list of maps (map_list). For
1082          * deadlock avoidance the locks need to be taken in the order
1083          * "map lock"->"socket map list lock". We start off by
1084          * accessing the socket map list, and take a reference to the
1085          * map to guarantee existence between the
1086          * xsk_get_map_list_entry() and xsk_map_try_sock_delete()
1087          * calls. Then we ask the map to remove the socket, which
1088          * tries to remove the socket from the map. Note that there
1089          * might be updates to the map between
1090          * xsk_get_map_list_entry() and xsk_map_try_sock_delete().
1091          */
1092         struct xdp_sock __rcu **map_entry = NULL;
1093         struct xsk_map *map;
1094
1095         while ((map = xsk_get_map_list_entry(xs, &map_entry))) {
1096                 xsk_map_try_sock_delete(map, xs, map_entry);
1097                 bpf_map_put(&map->map);
1098         }
1099 }
1100
1101 static int xsk_release(struct socket *sock)
1102 {
1103         struct sock *sk = sock->sk;
1104         struct xdp_sock *xs = xdp_sk(sk);
1105         struct net *net;
1106
1107         if (!sk)
1108                 return 0;
1109
1110         net = sock_net(sk);
1111
1112         if (xs->skb)
1113                 xsk_drop_skb(xs->skb);
1114
1115         mutex_lock(&net->xdp.lock);
1116         sk_del_node_init_rcu(sk);
1117         mutex_unlock(&net->xdp.lock);
1118
1119         sock_prot_inuse_add(net, sk->sk_prot, -1);
1120
1121         xsk_delete_from_maps(xs);
1122         mutex_lock(&xs->mutex);
1123         xsk_unbind_dev(xs);
1124         mutex_unlock(&xs->mutex);
1125
1126         xskq_destroy(xs->rx);
1127         xskq_destroy(xs->tx);
1128         xskq_destroy(xs->fq_tmp);
1129         xskq_destroy(xs->cq_tmp);
1130
1131         sock_orphan(sk);
1132         sock->sk = NULL;
1133
1134         sock_put(sk);
1135
1136         return 0;
1137 }
1138
1139 static struct socket *xsk_lookup_xsk_from_fd(int fd)
1140 {
1141         struct socket *sock;
1142         int err;
1143
1144         sock = sockfd_lookup(fd, &err);
1145         if (!sock)
1146                 return ERR_PTR(-ENOTSOCK);
1147
1148         if (sock->sk->sk_family != PF_XDP) {
1149                 sockfd_put(sock);
1150                 return ERR_PTR(-ENOPROTOOPT);
1151         }
1152
1153         return sock;
1154 }
1155
1156 static bool xsk_validate_queues(struct xdp_sock *xs)
1157 {
1158         return xs->fq_tmp && xs->cq_tmp;
1159 }
1160
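/* Bind the socket to a device and queue id. Depending on XDP_SHARED_UMEM
 * this either creates a new buffer pool from the socket's own umem, or
 * shares the umem (and, when device and queue match, the pool) of an
 * already bound socket.
 */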
1161 static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
1162 {
1163         struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
1164         struct sock *sk = sock->sk;
1165         struct xdp_sock *xs = xdp_sk(sk);
1166         struct net_device *dev;
1167         int bound_dev_if;
1168         u32 flags, qid;
1169         int err = 0;
1170
1171         if (addr_len < sizeof(struct sockaddr_xdp))
1172                 return -EINVAL;
1173         if (sxdp->sxdp_family != AF_XDP)
1174                 return -EINVAL;
1175
1176         flags = sxdp->sxdp_flags;
1177         if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
1178                       XDP_USE_NEED_WAKEUP | XDP_USE_SG))
1179                 return -EINVAL;
1180
1181         bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
1182         if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
1183                 return -EINVAL;
1184
1185         rtnl_lock();
1186         mutex_lock(&xs->mutex);
1187         if (xs->state != XSK_READY) {
1188                 err = -EBUSY;
1189                 goto out_release;
1190         }
1191
1192         dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
1193         if (!dev) {
1194                 err = -ENODEV;
1195                 goto out_release;
1196         }
1197
1198         if (!xs->rx && !xs->tx) {
1199                 err = -EINVAL;
1200                 goto out_unlock;
1201         }
1202
1203         qid = sxdp->sxdp_queue_id;
1204
1205         if (flags & XDP_SHARED_UMEM) {
1206                 struct xdp_sock *umem_xs;
1207                 struct socket *sock;
1208
1209                 if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
1210                     (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
1211                         /* Cannot specify flags for shared sockets. */
1212                         err = -EINVAL;
1213                         goto out_unlock;
1214                 }
1215
1216                 if (xs->umem) {
1217                         /* We already have our own umem. */
1218                         err = -EINVAL;
1219                         goto out_unlock;
1220                 }
1221
1222                 sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
1223                 if (IS_ERR(sock)) {
1224                         err = PTR_ERR(sock);
1225                         goto out_unlock;
1226                 }
1227
1228                 umem_xs = xdp_sk(sock->sk);
1229                 if (!xsk_is_bound(umem_xs)) {
1230                         err = -EBADF;
1231                         sockfd_put(sock);
1232                         goto out_unlock;
1233                 }
1234
1235                 if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
1236                         /* Share the umem with another socket on another qid
1237                          * and/or device.
1238                          */
1239                         xs->pool = xp_create_and_assign_umem(xs,
1240                                                              umem_xs->umem);
1241                         if (!xs->pool) {
1242                                 err = -ENOMEM;
1243                                 sockfd_put(sock);
1244                                 goto out_unlock;
1245                         }
1246
1247                         err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
1248                                                    qid);
1249                         if (err) {
1250                                 xp_destroy(xs->pool);
1251                                 xs->pool = NULL;
1252                                 sockfd_put(sock);
1253                                 goto out_unlock;
1254                         }
1255                 } else {
1256                         /* Share the buffer pool with the other socket. */
1257                         if (xs->fq_tmp || xs->cq_tmp) {
1258                                 /* Do not allow setting your own fq or cq. */
1259                                 err = -EINVAL;
1260                                 sockfd_put(sock);
1261                                 goto out_unlock;
1262                         }
1263
1264                         xp_get_pool(umem_xs->pool);
1265                         xs->pool = umem_xs->pool;
1266
1267                         /* If the underlying shared umem was created without a Tx
1268                          * ring, allocate the Tx descriptor array that the Tx
1269                          * batching API uses.
1270                          */
1271                         if (xs->tx && !xs->pool->tx_descs) {
1272                                 err = xp_alloc_tx_descs(xs->pool, xs);
1273                                 if (err) {
1274                                         xp_put_pool(xs->pool);
1275                                         xs->pool = NULL;
1276                                         sockfd_put(sock);
1277                                         goto out_unlock;
1278                                 }
1279                         }
1280                 }
1281
1282                 xdp_get_umem(umem_xs->umem);
1283                 WRITE_ONCE(xs->umem, umem_xs->umem);
1284                 sockfd_put(sock);
1285         } else if (!xs->umem || !xsk_validate_queues(xs)) {
1286                 err = -EINVAL;
1287                 goto out_unlock;
1288         } else {
1289                 /* This xsk has its own umem. */
1290                 xs->pool = xp_create_and_assign_umem(xs, xs->umem);
1291                 if (!xs->pool) {
1292                         err = -ENOMEM;
1293                         goto out_unlock;
1294                 }
1295
1296                 err = xp_assign_dev(xs->pool, dev, qid, flags);
1297                 if (err) {
1298                         xp_destroy(xs->pool);
1299                         xs->pool = NULL;
1300                         goto out_unlock;
1301                 }
1302         }
1303
1304         /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
1305         xs->fq_tmp = NULL;
1306         xs->cq_tmp = NULL;
1307
1308         xs->dev = dev;
1309         xs->zc = xs->umem->zc;
1310         xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
1311         xs->queue_id = qid;
1312         xp_add_xsk(xs->pool, xs);
1313
1314 out_unlock:
1315         if (err) {
1316                 dev_put(dev);
1317         } else {
1318                 /* Matches smp_rmb() in bind() for shared umem
1319                  * sockets, and xsk_is_bound().
1320                  */
1321                 smp_wmb();
1322                 WRITE_ONCE(xs->state, XSK_BOUND);
1323         }
1324 out_release:
1325         mutex_unlock(&xs->mutex);
1326         rtnl_unlock();
1327         return err;
1328 }
1329
1330 struct xdp_umem_reg_v1 {
1331         __u64 addr; /* Start of packet data area */
1332         __u64 len; /* Length of packet data area */
1333         __u32 chunk_size;
1334         __u32 headroom;
1335 };
1336
1337 struct xdp_umem_reg_v2 {
1338         __u64 addr; /* Start of packet data area */
1339         __u64 len; /* Length of packet data area */
1340         __u32 chunk_size;
1341         __u32 headroom;
1342         __u32 flags;
1343 };
1344
1345 static int xsk_setsockopt(struct socket *sock, int level, int optname,
1346                           sockptr_t optval, unsigned int optlen)
1347 {
1348         struct sock *sk = sock->sk;
1349         struct xdp_sock *xs = xdp_sk(sk);
1350         int err;
1351
1352         if (level != SOL_XDP)
1353                 return -ENOPROTOOPT;
1354
1355         switch (optname) {
1356         case XDP_RX_RING:
1357         case XDP_TX_RING:
1358         {
1359                 struct xsk_queue **q;
1360                 int entries;
1361
1362                 if (optlen < sizeof(entries))
1363                         return -EINVAL;
1364                 if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1365                         return -EFAULT;
1366
1367                 mutex_lock(&xs->mutex);
1368                 if (xs->state != XSK_READY) {
1369                         mutex_unlock(&xs->mutex);
1370                         return -EBUSY;
1371                 }
1372                 q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
1373                 err = xsk_init_queue(entries, q, false);
1374                 if (!err && optname == XDP_TX_RING)
1375                         /* Tx needs to be explicitly woken up the first time */
1376                         xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
1377                 mutex_unlock(&xs->mutex);
1378                 return err;
1379         }
1380         case XDP_UMEM_REG:
1381         {
1382                 size_t mr_size = sizeof(struct xdp_umem_reg);
1383                 struct xdp_umem_reg mr = {};
1384                 struct xdp_umem *umem;
1385
1386                 if (optlen < sizeof(struct xdp_umem_reg_v1))
1387                         return -EINVAL;
1388                 else if (optlen < sizeof(struct xdp_umem_reg_v2))
1389                         mr_size = sizeof(struct xdp_umem_reg_v1);
1390                 else if (optlen < sizeof(mr))
1391                         mr_size = sizeof(struct xdp_umem_reg_v2);
1392
1393                 if (copy_from_sockptr(&mr, optval, mr_size))
1394                         return -EFAULT;
1395
1396                 mutex_lock(&xs->mutex);
1397                 if (xs->state != XSK_READY || xs->umem) {
1398                         mutex_unlock(&xs->mutex);
1399                         return -EBUSY;
1400                 }
1401
1402                 umem = xdp_umem_create(&mr);
1403                 if (IS_ERR(umem)) {
1404                         mutex_unlock(&xs->mutex);
1405                         return PTR_ERR(umem);
1406                 }
1407
1408                 /* Make sure umem is ready before it can be seen by others */
1409                 smp_wmb();
1410                 WRITE_ONCE(xs->umem, umem);
1411                 mutex_unlock(&xs->mutex);
1412                 return 0;
1413         }
1414         case XDP_UMEM_FILL_RING:
1415         case XDP_UMEM_COMPLETION_RING:
1416         {
1417                 struct xsk_queue **q;
1418                 int entries;
1419
1420                 if (optlen < sizeof(entries))
1421                         return -EINVAL;
1422                 if (copy_from_sockptr(&entries, optval, sizeof(entries)))
1423                         return -EFAULT;
1424
1425                 mutex_lock(&xs->mutex);
1426                 if (xs->state != XSK_READY) {
1427                         mutex_unlock(&xs->mutex);
1428                         return -EBUSY;
1429                 }
1430
1431                 q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
1432                         &xs->cq_tmp;
1433                 err = xsk_init_queue(entries, q, true);
1434                 mutex_unlock(&xs->mutex);
1435                 return err;
1436         }
1437         default:
1438                 break;
1439         }
1440
1441         return -ENOPROTOOPT;
1442 }
1443
1444 static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
1445 {
1446         ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
1447         ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
1448         ring->desc = offsetof(struct xdp_rxtx_ring, desc);
1449 }
1450
1451 static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
1452 {
1453         ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
1454         ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
1455         ring->desc = offsetof(struct xdp_umem_ring, desc);
1456 }
1457
1458 struct xdp_statistics_v1 {
1459         __u64 rx_dropped;
1460         __u64 rx_invalid_descs;
1461         __u64 tx_invalid_descs;
1462 };
1463
1464 static int xsk_getsockopt(struct socket *sock, int level, int optname,
1465                           char __user *optval, int __user *optlen)
1466 {
1467         struct sock *sk = sock->sk;
1468         struct xdp_sock *xs = xdp_sk(sk);
1469         int len;
1470
1471         if (level != SOL_XDP)
1472                 return -ENOPROTOOPT;
1473
1474         if (get_user(len, optlen))
1475                 return -EFAULT;
1476         if (len < 0)
1477                 return -EINVAL;
1478
1479         switch (optname) {
1480         case XDP_STATISTICS:
1481         {
1482                 struct xdp_statistics stats = {};
1483                 bool extra_stats = true;
1484                 size_t stats_size;
1485
1486                 if (len < sizeof(struct xdp_statistics_v1)) {
1487                         return -EINVAL;
1488                 } else if (len < sizeof(stats)) {
1489                         extra_stats = false;
1490                         stats_size = sizeof(struct xdp_statistics_v1);
1491                 } else {
1492                         stats_size = sizeof(stats);
1493                 }
1494
1495                 mutex_lock(&xs->mutex);
1496                 stats.rx_dropped = xs->rx_dropped;
1497                 if (extra_stats) {
1498                         stats.rx_ring_full = xs->rx_queue_full;
1499                         stats.rx_fill_ring_empty_descs =
1500                                 xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
1501                         stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
1502                 } else {
1503                         stats.rx_dropped += xs->rx_queue_full;
1504                 }
1505                 stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
1506                 stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
1507                 mutex_unlock(&xs->mutex);
1508
1509                 if (copy_to_user(optval, &stats, stats_size))
1510                         return -EFAULT;
1511                 if (put_user(stats_size, optlen))
1512                         return -EFAULT;
1513
1514                 return 0;
1515         }
1516         case XDP_MMAP_OFFSETS:
1517         {
1518                 struct xdp_mmap_offsets off;
1519                 struct xdp_mmap_offsets_v1 off_v1;
1520                 bool flags_supported = true;
1521                 void *to_copy;
1522
1523                 if (len < sizeof(off_v1))
1524                         return -EINVAL;
1525                 else if (len < sizeof(off))
1526                         flags_supported = false;
1527
1528                 if (flags_supported) {
1529                         /* xdp_ring_offset is identical to xdp_ring_offset_v1
1530                          * except for the flags field added to the end.
1531                          */
1532                         xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1533                                                &off.rx);
1534                         xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
1535                                                &off.tx);
1536                         xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1537                                                &off.fr);
1538                         xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
1539                                                &off.cr);
1540                         off.rx.flags = offsetof(struct xdp_rxtx_ring,
1541                                                 ptrs.flags);
1542                         off.tx.flags = offsetof(struct xdp_rxtx_ring,
1543                                                 ptrs.flags);
1544                         off.fr.flags = offsetof(struct xdp_umem_ring,
1545                                                 ptrs.flags);
1546                         off.cr.flags = offsetof(struct xdp_umem_ring,
1547                                                 ptrs.flags);
1548
1549                         len = sizeof(off);
1550                         to_copy = &off;
1551                 } else {
1552                         xsk_enter_rxtx_offsets(&off_v1.rx);
1553                         xsk_enter_rxtx_offsets(&off_v1.tx);
1554                         xsk_enter_umem_offsets(&off_v1.fr);
1555                         xsk_enter_umem_offsets(&off_v1.cr);
1556
1557                         len = sizeof(off_v1);
1558                         to_copy = &off_v1;
1559                 }
1560
1561                 if (copy_to_user(optval, to_copy, len))
1562                         return -EFAULT;
1563                 if (put_user(len, optlen))
1564                         return -EFAULT;
1565
1566                 return 0;
1567         }
1568         case XDP_OPTIONS:
1569         {
1570                 struct xdp_options opts = {};
1571
1572                 if (len < sizeof(opts))
1573                         return -EINVAL;
1574
1575                 mutex_lock(&xs->mutex);
1576                 if (xs->zc)
1577                         opts.flags |= XDP_OPTIONS_ZEROCOPY;
1578                 mutex_unlock(&xs->mutex);
1579
1580                 len = sizeof(opts);
1581                 if (copy_to_user(optval, &opts, len))
1582                         return -EFAULT;
1583                 if (put_user(len, optlen))
1584                         return -EFAULT;
1585
1586                 return 0;
1587         }
1588         default:
1589                 break;
1590         }
1591
1592         return -EOPNOTSUPP;
1593 }
1594
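/* Map one of the rings into user space, selected by the mmap page offset:
 * Rx, Tx, or the fill/completion ring (taken from the temporary pointers
 * before bind, or from the pool once the socket is bound).
 */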
1595 static int xsk_mmap(struct file *file, struct socket *sock,
1596                     struct vm_area_struct *vma)
1597 {
1598         loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
1599         unsigned long size = vma->vm_end - vma->vm_start;
1600         struct xdp_sock *xs = xdp_sk(sock->sk);
1601         int state = READ_ONCE(xs->state);
1602         struct xsk_queue *q = NULL;
1603
1604         if (state != XSK_READY && state != XSK_BOUND)
1605                 return -EBUSY;
1606
1607         if (offset == XDP_PGOFF_RX_RING) {
1608                 q = READ_ONCE(xs->rx);
1609         } else if (offset == XDP_PGOFF_TX_RING) {
1610                 q = READ_ONCE(xs->tx);
1611         } else {
1612                 /* Matches the smp_wmb() in XDP_UMEM_REG */
1613                 smp_rmb();
1614                 if (offset == XDP_UMEM_PGOFF_FILL_RING)
1615                         q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
1616                                                  READ_ONCE(xs->pool->fq);
1617                 else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
1618                         q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
1619                                                  READ_ONCE(xs->pool->cq);
1620         }
1621
1622         if (!q)
1623                 return -EINVAL;
1624
1625         /* Matches the smp_wmb() in xsk_init_queue */
1626         smp_rmb();
1627         if (size > q->ring_vmalloc_size)
1628                 return -EINVAL;
1629
1630         return remap_vmalloc_range(vma, q->ring, 0);
1631 }
1632
1633 static int xsk_notifier(struct notifier_block *this,
1634                         unsigned long msg, void *ptr)
1635 {
1636         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1637         struct net *net = dev_net(dev);
1638         struct sock *sk;
1639
1640         switch (msg) {
1641         case NETDEV_UNREGISTER:
1642                 mutex_lock(&net->xdp.lock);
1643                 sk_for_each(sk, &net->xdp.list) {
1644                         struct xdp_sock *xs = xdp_sk(sk);
1645
1646                         mutex_lock(&xs->mutex);
1647                         if (xs->dev == dev) {
1648                                 sk->sk_err = ENETDOWN;
1649                                 if (!sock_flag(sk, SOCK_DEAD))
1650                                         sk_error_report(sk);
1651
1652                                 xsk_unbind_dev(xs);
1653
1654                                 /* Clear device references. */
1655                                 xp_clear_dev(xs->pool);
1656                         }
1657                         mutex_unlock(&xs->mutex);
1658                 }
1659                 mutex_unlock(&net->xdp.lock);
1660                 break;
1661         }
1662         return NOTIFY_DONE;
1663 }
1664
1665 static struct proto xsk_proto = {
1666         .name =         "XDP",
1667         .owner =        THIS_MODULE,
1668         .obj_size =     sizeof(struct xdp_sock),
1669 };
1670
1671 static const struct proto_ops xsk_proto_ops = {
1672         .family         = PF_XDP,
1673         .owner          = THIS_MODULE,
1674         .release        = xsk_release,
1675         .bind           = xsk_bind,
1676         .connect        = sock_no_connect,
1677         .socketpair     = sock_no_socketpair,
1678         .accept         = sock_no_accept,
1679         .getname        = sock_no_getname,
1680         .poll           = xsk_poll,
1681         .ioctl          = sock_no_ioctl,
1682         .listen         = sock_no_listen,
1683         .shutdown       = sock_no_shutdown,
1684         .setsockopt     = xsk_setsockopt,
1685         .getsockopt     = xsk_getsockopt,
1686         .sendmsg        = xsk_sendmsg,
1687         .recvmsg        = xsk_recvmsg,
1688         .mmap           = xsk_mmap,
1689 };
1690
1691 static void xsk_destruct(struct sock *sk)
1692 {
1693         struct xdp_sock *xs = xdp_sk(sk);
1694
1695         if (!sock_flag(sk, SOCK_DEAD))
1696                 return;
1697
1698         if (!xp_put_pool(xs->pool))
1699                 xdp_put_umem(xs->umem, !xs->pool);
1700 }
1701
1702 static int xsk_create(struct net *net, struct socket *sock, int protocol,
1703                       int kern)
1704 {
1705         struct xdp_sock *xs;
1706         struct sock *sk;
1707
1708         if (!ns_capable(net->user_ns, CAP_NET_RAW))
1709                 return -EPERM;
1710         if (sock->type != SOCK_RAW)
1711                 return -ESOCKTNOSUPPORT;
1712
1713         if (protocol)
1714                 return -EPROTONOSUPPORT;
1715
1716         sock->state = SS_UNCONNECTED;
1717
1718         sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
1719         if (!sk)
1720                 return -ENOBUFS;
1721
1722         sock->ops = &xsk_proto_ops;
1723
1724         sock_init_data(sock, sk);
1725
1726         sk->sk_family = PF_XDP;
1727
1728         sk->sk_destruct = xsk_destruct;
1729
1730         sock_set_flag(sk, SOCK_RCU_FREE);
1731
1732         xs = xdp_sk(sk);
1733         xs->state = XSK_READY;
1734         mutex_init(&xs->mutex);
1735         spin_lock_init(&xs->rx_lock);
1736
1737         INIT_LIST_HEAD(&xs->map_list);
1738         spin_lock_init(&xs->map_list_lock);
1739
1740         mutex_lock(&net->xdp.lock);
1741         sk_add_node_rcu(sk, &net->xdp.list);
1742         mutex_unlock(&net->xdp.lock);
1743
1744         sock_prot_inuse_add(net, &xsk_proto, 1);
1745
1746         return 0;
1747 }
1748
1749 static const struct net_proto_family xsk_family_ops = {
1750         .family = PF_XDP,
1751         .create = xsk_create,
1752         .owner  = THIS_MODULE,
1753 };
1754
1755 static struct notifier_block xsk_netdev_notifier = {
1756         .notifier_call  = xsk_notifier,
1757 };
1758
1759 static int __net_init xsk_net_init(struct net *net)
1760 {
1761         mutex_init(&net->xdp.lock);
1762         INIT_HLIST_HEAD(&net->xdp.list);
1763         return 0;
1764 }
1765
1766 static void __net_exit xsk_net_exit(struct net *net)
1767 {
1768         WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
1769 }
1770
1771 static struct pernet_operations xsk_net_ops = {
1772         .init = xsk_net_init,
1773         .exit = xsk_net_exit,
1774 };
1775
1776 static int __init xsk_init(void)
1777 {
1778         int err, cpu;
1779
1780         err = proto_register(&xsk_proto, 0 /* no slab */);
1781         if (err)
1782                 goto out;
1783
1784         err = sock_register(&xsk_family_ops);
1785         if (err)
1786                 goto out_proto;
1787
1788         err = register_pernet_subsys(&xsk_net_ops);
1789         if (err)
1790                 goto out_sk;
1791
1792         err = register_netdevice_notifier(&xsk_netdev_notifier);
1793         if (err)
1794                 goto out_pernet;
1795
1796         for_each_possible_cpu(cpu)
1797                 INIT_LIST_HEAD(&per_cpu(xskmap_flush_list, cpu));
1798         return 0;
1799
1800 out_pernet:
1801         unregister_pernet_subsys(&xsk_net_ops);
1802 out_sk:
1803         sock_unregister(PF_XDP);
1804 out_proto:
1805         proto_unregister(&xsk_proto);
1806 out:
1807         return err;
1808 }
1809
1810 fs_initcall(xsk_init);